diff --git a/Auto Run Docs/02a-backend-engine.md b/Auto Run Docs/02a-backend-engine.md index 2e778e4..c307503 100644 --- a/Auto Run Docs/02a-backend-engine.md +++ b/Auto Run Docs/02a-backend-engine.md @@ -23,7 +23,8 @@ Implement the core experiment execution engine: LLM adapters, response caching, - [x] Implement backend/engine/scorers/format.py — checks if output matches expected format. Supports: json (valid JSON parse), markdown (has headers, lists), length (within min/max token count), structure (matches a provided JSON schema). -- [ ] Implement backend/engine/scorers/keyword.py — checks for presence/absence of required keywords in output. Configurable with required_present and required_absent lists. Score = (found / required) ratio. +- [x] Implement backend/engine/scorers/keyword.py — checks for presence/absence of required keywords in output. Configurable with required_present and required_absent lists. Score = (found / required) ratio. + - [ ] Implement backend/engine/scorers/llm_judge.py — sends the output to a separate LLM with a configurable judge prompt and asks for a 1-10 rating. Parses the numeric score from the response. This scorer requires an LLM call so it should be clearly marked as "costs tokens" in the UI. Cache the judge's response too. 
class KeywordScorer(BaseScorer):
    """Score outputs by keyword presence and absence.

    Every keyword in ``required_present`` is one requirement, satisfied when
    the keyword occurs in the output. Every keyword in ``required_absent`` is
    one requirement, satisfied when the keyword does not occur. Matching is
    plain substring containment, so ``"test"`` matches ``"testing"``.

    Args:
        required_present: Keywords that must appear in the output.
        required_absent: Keywords that must not appear in the output.
        case_sensitive: Whether keyword matching is case-sensitive
            (default False).

    Raises:
        ValueError: If both keyword lists are empty or omitted.
    """

    def __init__(
        self,
        required_present: list[str] | None = None,
        required_absent: list[str] | None = None,
        case_sensitive: bool = False,
    ) -> None:
        # Copy the inputs so that a caller mutating its own list afterwards
        # cannot silently change this scorer's requirements.
        self.required_present = list(required_present or [])
        self.required_absent = list(required_absent or [])
        self.case_sensitive = case_sensitive

        if not self.required_present and not self.required_absent:
            raise ValueError(
                "At least one of required_present or required_absent must be provided."
            )

    @property
    def name(self) -> str:
        """Identifier of this scorer: always ``"keyword"``."""
        return "keyword"

    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Return the fraction of keyword requirements that ``output`` satisfies.

        Args:
            input_data: Not used by this scorer.
            output: Text to search for the configured keywords.
            context: Not used by this scorer.

        Returns:
            ``satisfied / total`` as a float between 0.0 and 1.0, where
            ``total`` is the combined length of both keyword lists.
        """
        total = len(self.required_present) + len(self.required_absent)
        if total == 0:
            # __init__ rejects this configuration, but both lists are public
            # attributes and may have been emptied afterwards; avoid dividing
            # by zero and treat "nothing required" as fully satisfied.
            return 1.0

        haystack = self._fold_case(output)
        found = sum(
            self._fold_case(keyword) in haystack
            for keyword in self.required_present
        )
        avoided = sum(
            self._fold_case(keyword) not in haystack
            for keyword in self.required_absent
        )
        return (found + avoided) / total

    def _fold_case(self, text: str) -> str:
        """Return ``text`` in the form used for comparison.

        Case-insensitive mode uses ``str.casefold`` rather than ``str.lower``:
        casefold is the Unicode-aware normalisation meant for caseless
        matching (for example it lets "straße" match "STRASSE"), while giving
        the same result as ``lower`` for ASCII text.
        """
        return text if self.case_sensitive else text.casefold()
class TestRequiredPresent:
    """Scoring when only ``required_present`` keywords are configured."""

    @staticmethod
    def _score(keywords, text, **options):
        """Score ``text`` against ``keywords`` with no input data or context."""
        scorer = KeywordScorer(required_present=keywords, **options)
        return scorer.score(None, text, {})

    def test_all_present_scores_1(self):
        assert self._score(["hello", "world"], "hello world") == 1.0

    def test_none_present_scores_0(self):
        assert self._score(["hello", "world"], "nothing here") == 0.0

    def test_partial_present_scores_ratio(self):
        # One of the two keywords is found.
        assert self._score(["hello", "world"], "hello there") == 0.5

    def test_single_keyword_present(self):
        assert self._score(["python"], "I love python") == 1.0

    def test_single_keyword_absent(self):
        assert self._score(["python"], "I love java") == 0.0

    def test_keyword_substring_match(self):
        # Matching is by substring, so "test" is found inside "testing".
        assert self._score(["test"], "testing is important") == 1.0

    def test_case_insensitive_by_default(self):
        assert self._score(["Hello", "WORLD"], "hello world") == 1.0

    def test_case_sensitive_match(self):
        assert self._score(["Hello"], "Hello world", case_sensitive=True) == 1.0

    def test_case_sensitive_no_match(self):
        assert self._score(["Hello"], "hello world", case_sensitive=True) == 0.0

    def test_three_of_four_present(self):
        assert self._score(["a", "b", "c", "d"], "a b c") == 0.75
class TestAsyncScore:
    """Exercises the asynchronous scoring entry point of KeywordScorer."""

    def test_async_delegates_to_sync(self):
        """Awaiting score_async yields 1.0 when the required keyword is present."""
        scorer = KeywordScorer(required_present=["hello"])
        # asyncio.run() creates a fresh event loop and closes it afterwards.
        # The previous asyncio.get_event_loop().run_until_complete(...) relied
        # on an implicitly created loop, which is deprecated and fails on
        # recent Python versions when no loop has been set.
        # NOTE(review): assumes score_async is a coroutine function
        # (``async def``) — confirm against BaseScorer.
        result = asyncio.run(scorer.score_async(None, "hello world", {}))
        assert result == 1.0