MAESTRO: Implement KeywordScorer with presence/absence keyword checking and ratio scoring

This commit is contained in:
John Lightner 2026-04-07 03:02:40 -05:00
parent bc1d41e3a6
commit 0d5a6169c5
4 changed files with 261 additions and 2 deletions

View file

@ -23,7 +23,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
- [x] Implement backend/engine/scorers/format.py — checks if output matches expected format. Supports: json (valid JSON parse), markdown (has headers, lists), length (within min/max token count), structure (matches a provided JSON schema). - [x] Implement backend/engine/scorers/format.py — checks if output matches expected format. Supports: json (valid JSON parse), markdown (has headers, lists), length (within min/max token count), structure (matches a provided JSON schema).
<!-- Completed: FormatScorer with 4 format checks (json, markdown, length, structure). JSON schema validation via jsonschema library with basic fallback. 38 tests in test_scorer_format.py, all passing. --> <!-- Completed: FormatScorer with 4 format checks (json, markdown, length, structure). JSON schema validation via jsonschema library with basic fallback. 38 tests in test_scorer_format.py, all passing. -->
- [ ] Implement backend/engine/scorers/keyword.py — checks for presence/absence of required keywords in output. Configurable with required_present and required_absent lists. Score = (found / required) ratio. - [x] Implement backend/engine/scorers/keyword.py — checks for presence/absence of required keywords in output. Configurable with required_present and required_absent lists. Score = (found / required) ratio.
<!-- Completed: KeywordScorer with required_present/required_absent lists, case-sensitive option, combined ratio scoring. 37 tests in test_scorer_keyword.py, all passing. -->
- [ ] Implement backend/engine/scorers/llm_judge.py — sends the output to a separate LLM with a configurable judge prompt and asks for a 1-10 rating. Parses the numeric score from the response. This scorer requires an LLM call so it should be clearly marked as "costs tokens" in the UI. Cache the judge's response too. - [ ] Implement backend/engine/scorers/llm_judge.py — sends the output to a separate LLM with a configurable judge prompt and asks for a 1-10 rating. Parses the numeric score from the response. This scorer requires an LLM call so it should be clearly marked as "costs tokens" in the UI. Cache the judge's response too.

View file

@ -3,5 +3,6 @@
from engine.scorers.base import BaseScorer from engine.scorers.base import BaseScorer
from engine.scorers.embedding import EmbeddingScorer from engine.scorers.embedding import EmbeddingScorer
from engine.scorers.format import FormatScorer from engine.scorers.format import FormatScorer
from engine.scorers.keyword import KeywordScorer
__all__ = ["BaseScorer", "EmbeddingScorer", "FormatScorer"] __all__ = ["BaseScorer", "EmbeddingScorer", "FormatScorer", "KeywordScorer"]

View file

@ -0,0 +1,64 @@
"""Keyword scorer — checks for presence/absence of required keywords in output.
Configurable with required_present and required_absent keyword lists.
Score is computed as (matches / total_requirements) ratio.
"""
from typing import Any
from engine.scorers.base import BaseScorer
class KeywordScorer(BaseScorer):
    """Score outputs based on keyword presence and absence.

    Matching is plain substring containment, so a keyword such as ``"test"``
    is also found inside ``"testing"``.

    Args:
        required_present: Keywords that must appear in the output.
        required_absent: Keywords that must not appear in the output.
        case_sensitive: Whether keyword matching is case-sensitive (default False).

    Raises:
        ValueError: If both keyword lists are empty or omitted.
    """

    def __init__(
        self,
        required_present: list[str] | None = None,
        required_absent: list[str] | None = None,
        case_sensitive: bool = False,
    ) -> None:
        # Copy the inputs so that later mutation of the caller's lists cannot
        # silently change what this scorer checks.
        self.required_present = list(required_present or [])
        self.required_absent = list(required_absent or [])
        self.case_sensitive = case_sensitive
        if not self.required_present and not self.required_absent:
            raise ValueError(
                "At least one of required_present or required_absent must be provided."
            )

    @property
    def name(self) -> str:
        """Identifier of this scorer."""
        return "keyword"

    def _normalize(self, text: str) -> str:
        """Return *text* in the form used for comparison.

        Case-insensitive mode uses ``str.casefold()`` rather than ``lower()``
        so that caseless matching also works for non-ASCII text
        (e.g. "STRASSE" matches "straße").
        """
        return text if self.case_sensitive else text.casefold()

    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Score output based on keyword presence/absence.

        Each keyword in required_present scores a point if found.
        Each keyword in required_absent scores a point if NOT found.
        ``input_data`` and ``context`` are not used by this scorer.

        Returns:
            The ratio of satisfied keyword requirements to total requirements,
            in the range 0.0 to 1.0.
        """
        total = len(self.required_present) + len(self.required_absent)
        # __init__ rejects an empty configuration; this guard only matters if
        # the lists were emptied after construction and avoids dividing by zero.
        if total == 0:
            return 1.0

        haystack = self._normalize(output)
        present_hits = sum(
            self._normalize(keyword) in haystack
            for keyword in self.required_present
        )
        absent_hits = sum(
            self._normalize(keyword) not in haystack
            for keyword in self.required_absent
        )
        return (present_hits + absent_hits) / total

View file

@ -0,0 +1,193 @@
"""Tests for the KeywordScorer."""
import asyncio
import pytest
from engine.scorers.keyword import KeywordScorer
class TestKeywordScorerInit:
    """Constructor behaviour: stored attributes, defaults and validation."""

    def test_with_required_present(self):
        instance = KeywordScorer(required_present=["hello"])
        assert instance.required_present == ["hello"]
        assert instance.required_absent == []

    def test_with_required_absent(self):
        instance = KeywordScorer(required_absent=["bad"])
        assert instance.required_present == []
        assert instance.required_absent == ["bad"]

    def test_with_both_lists(self):
        instance = KeywordScorer(
            required_present=["good"],
            required_absent=["bad"],
        )
        assert instance.required_present == ["good"]
        assert instance.required_absent == ["bad"]

    def test_empty_lists_raises(self):
        # No arguments at all: both lists default to empty.
        with pytest.raises(ValueError, match="At least one of"):
            KeywordScorer()

    def test_both_none_raises(self):
        kwargs = {"required_present": None, "required_absent": None}
        with pytest.raises(ValueError, match="At least one of"):
            KeywordScorer(**kwargs)

    def test_both_empty_raises(self):
        kwargs = {"required_present": [], "required_absent": []}
        with pytest.raises(ValueError, match="At least one of"):
            KeywordScorer(**kwargs)

    def test_name_property(self):
        instance = KeywordScorer(required_present=["test"])
        assert instance.name == "keyword"

    def test_is_base_scorer(self):
        from engine.scorers.base import BaseScorer

        instance = KeywordScorer(required_present=["test"])
        assert isinstance(instance, BaseScorer)

    def test_case_sensitive_default_false(self):
        instance = KeywordScorer(required_present=["test"])
        assert instance.case_sensitive is False

    def test_case_sensitive_explicit(self):
        instance = KeywordScorer(required_present=["test"], case_sensitive=True)
        assert instance.case_sensitive is True
class TestRequiredPresent:
    """Scoring when only required_present keywords are configured."""

    def test_all_present_scores_1(self):
        kw = KeywordScorer(required_present=["hello", "world"])
        got = kw.score(None, "hello world", {})
        assert got == 1.0

    def test_none_present_scores_0(self):
        kw = KeywordScorer(required_present=["hello", "world"])
        got = kw.score(None, "nothing here", {})
        assert got == 0.0

    def test_partial_present_scores_ratio(self):
        kw = KeywordScorer(required_present=["hello", "world"])
        got = kw.score(None, "hello there", {})
        assert got == 0.5

    def test_single_keyword_present(self):
        kw = KeywordScorer(required_present=["python"])
        got = kw.score(None, "I love python", {})
        assert got == 1.0

    def test_single_keyword_absent(self):
        kw = KeywordScorer(required_present=["python"])
        got = kw.score(None, "I love java", {})
        assert got == 0.0

    def test_keyword_substring_match(self):
        # Matching is substring-based: "test" is found inside "testing".
        kw = KeywordScorer(required_present=["test"])
        got = kw.score(None, "testing is important", {})
        assert got == 1.0

    def test_case_insensitive_by_default(self):
        kw = KeywordScorer(required_present=["Hello", "WORLD"])
        got = kw.score(None, "hello world", {})
        assert got == 1.0

    def test_case_sensitive_match(self):
        kw = KeywordScorer(required_present=["Hello"], case_sensitive=True)
        got = kw.score(None, "Hello world", {})
        assert got == 1.0

    def test_case_sensitive_no_match(self):
        kw = KeywordScorer(required_present=["Hello"], case_sensitive=True)
        got = kw.score(None, "hello world", {})
        assert got == 0.0

    def test_three_of_four_present(self):
        kw = KeywordScorer(required_present=["a", "b", "c", "d"])
        got = kw.score(None, "a b c", {})
        assert got == 0.75
class TestRequiredAbsent:
    """Scoring when only required_absent keywords are configured."""

    def test_all_absent_scores_1(self):
        kw = KeywordScorer(required_absent=["error", "fail"])
        got = kw.score(None, "success", {})
        assert got == 1.0

    def test_all_present_scores_0(self):
        kw = KeywordScorer(required_absent=["error", "fail"])
        got = kw.score(None, "error and fail", {})
        assert got == 0.0

    def test_partial_absent_scores_ratio(self):
        kw = KeywordScorer(required_absent=["error", "fail"])
        got = kw.score(None, "error occurred", {})
        assert got == 0.5

    def test_case_insensitive_absent(self):
        kw = KeywordScorer(required_absent=["ERROR"])
        got = kw.score(None, "an error occurred", {})
        assert got == 0.0

    def test_case_sensitive_absent_not_found(self):
        kw = KeywordScorer(required_absent=["ERROR"], case_sensitive=True)
        got = kw.score(None, "an error occurred", {})
        assert got == 1.0
class TestCombinedPresenceAbsence:
    """Scoring when both keyword lists contribute to the ratio."""

    def test_all_satisfied(self):
        kw = KeywordScorer(
            required_present=["python", "code"],
            required_absent=["error", "bug"],
        )
        got = kw.score(None, "python code is great", {})
        assert got == 1.0

    def test_none_satisfied(self):
        kw = KeywordScorer(
            required_present=["python", "code"],
            required_absent=["error", "bug"],
        )
        got = kw.score(None, "error and bug", {})
        assert got == 0.0

    def test_mixed_satisfaction(self):
        # "python" is present and "bug" is absent: 2 of 4 requirements met.
        kw = KeywordScorer(
            required_present=["python", "code"],
            required_absent=["error", "bug"],
        )
        got = kw.score(None, "python error", {})
        assert got == 0.5

    def test_present_satisfied_absent_not(self):
        # Both present keywords found, the absent keyword also found: 2 of 3.
        kw = KeywordScorer(
            required_present=["hello", "world"],
            required_absent=["bad"],
        )
        got = kw.score(None, "hello world bad", {})
        expected = 2 / 3
        assert abs(got - expected) < 1e-9

    def test_absent_satisfied_present_not(self):
        # Present keyword missing, absent keyword missing: 1 of 2.
        kw = KeywordScorer(
            required_present=["hello"],
            required_absent=["bad"],
        )
        got = kw.score(None, "nothing here", {})
        assert got == 0.5
class TestAsyncScore:
    """Checks the scorer's ``score_async`` entry point."""

    def test_async_delegates_to_sync(self):
        scorer = KeywordScorer(required_present=["hello"])
        # Use a dedicated loop instead of asyncio.get_event_loop(): calling
        # that with no running loop is deprecated and raises RuntimeError on
        # newer Python versions when no current loop exists. A private loop
        # also keeps this test independent of loop state left by other tests
        # and guarantees the loop is closed afterwards.
        loop = asyncio.new_event_loop()
        try:
            result = loop.run_until_complete(
                scorer.score_async(None, "hello world", {})
            )
        finally:
            loop.close()
        assert result == 1.0
class TestEdgeCases:
    """Boundary inputs: empty output, multiline text, punctuation, unused args."""

    def test_empty_output(self):
        kw = KeywordScorer(required_present=["hello"])
        got = kw.score(None, "", {})
        assert got == 0.0

    def test_empty_output_with_absent(self):
        kw = KeywordScorer(required_absent=["hello"])
        got = kw.score(None, "", {})
        assert got == 1.0

    def test_multiline_output(self):
        kw = KeywordScorer(required_present=["line1", "line2"])
        got = kw.score(None, "line1\nline2", {})
        assert got == 1.0

    def test_special_characters_in_keyword(self):
        # Keywords are matched literally, not as regular expressions.
        kw = KeywordScorer(required_present=["c++", "c#"])
        got = kw.score(None, "I know c++ and c#", {})
        assert got == 1.0

    def test_context_ignored(self):
        kw = KeywordScorer(required_present=["test"])
        got = kw.score("input", "test output", {"key": "val"})
        assert got == 1.0

    def test_input_data_ignored(self):
        kw = KeywordScorer(required_present=["test"])
        got = kw.score({"complex": "input"}, "test output", {})
        assert got == 1.0