From 405bbf8206d5253ae8d6232e4656c1dbd3176446 Mon Sep 17 00:00:00 2001 From: John Lightner Date: Tue, 7 Apr 2026 02:55:05 -0500 Subject: [PATCH] MAESTRO: Implement BaseScorer abstract class with sync/async scoring interface Adds backend/engine/scorers/base.py with abstract name property, score() method, and score_async() default implementation. Updates scorers __init__.py to export BaseScorer. Includes 9 tests covering instantiation guards, sync/async dispatch, context dict usage, and partial implementation rejection. --- Auto Run Docs/02a-backend-engine.md | 3 +- backend/engine/scorers/__init__.py | 5 ++ backend/engine/scorers/base.py | 45 +++++++++++ backend/tests/test_scorer_base.py | 112 ++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 backend/engine/scorers/base.py create mode 100644 backend/tests/test_scorer_base.py diff --git a/Auto Run Docs/02a-backend-engine.md b/Auto Run Docs/02a-backend-engine.md index df5697d..68bcfe5 100644 --- a/Auto Run Docs/02a-backend-engine.md +++ b/Auto Run Docs/02a-backend-engine.md @@ -14,7 +14,8 @@ Implement the core experiment execution engine: LLM adapters, response caching, - [x] Implement backend/engine/sweep.py for sweep orchestration. Support three sweep types: GridSweep (enumerate all combinations from parameter_space), RandomSweep (sample N random configs from parameter ranges), GuidedSweep (use previous results to inform next config — start with top-K exploitation + random exploration). The sweep runner should: respect MAX_CONCURRENT_RUNS for parallelism, track token budget and stop at MAX_TOKENS_PER_SWEEP, emit WebSocket events for each run completion, handle pause/resume/stop via Redis flags. -- [ ] Implement backend/engine/scorers/base.py defining the BaseScorer abstract class with: name property, score(input_data, output, context) → float (0.0 to 1.0), and an optional async variant. 
The context dict should include the experiment config, stage results, and any reference data. +- [x] Implement backend/engine/scorers/base.py defining the BaseScorer abstract class with: name property, score(input_data, output, context) → float (0.0 to 1.0), and an optional async variant. The context dict should include the experiment config, stage results, and any reference data. + - [ ] Implement backend/engine/scorers/embedding.py — uses a configurable embedding endpoint (Ollama nomic-embed-text or any OpenAI-compatible embedding API) to compute cosine similarity between output and reference answer. Normalize to 0.0–1.0 range. diff --git a/backend/engine/scorers/__init__.py b/backend/engine/scorers/__init__.py index e69de29..eb83c02 100644 --- a/backend/engine/scorers/__init__.py +++ b/backend/engine/scorers/__init__.py @@ -0,0 +1,5 @@ +"""Scorer framework for evaluating LLM outputs.""" + +from engine.scorers.base import BaseScorer + +__all__ = ["BaseScorer"] diff --git a/backend/engine/scorers/base.py b/backend/engine/scorers/base.py new file mode 100644 index 0000000..83e2c29 --- /dev/null +++ b/backend/engine/scorers/base.py @@ -0,0 +1,45 @@ +"""Base scorer abstract class for PromptLooper scoring framework.""" + +from abc import ABC, abstractmethod +from typing import Any + + +class BaseScorer(ABC): + """Abstract base class for all scorers. + + Scorers evaluate LLM outputs and return a float score in the 0.0–1.0 range. + """ + + @property + @abstractmethod + def name(self) -> str: + """Unique identifier for this scorer.""" + ... + + @abstractmethod + def score(self, input_data: Any, output: str, context: dict) -> float: + """Score an LLM output. + + Args: + input_data: The original input fed to the experiment. + output: The LLM-generated output text to evaluate. + context: Dict containing: + - config: The experiment configuration dict. + - stages: List of completed stage result dicts. + - input_data: Same as the input_data argument (for convenience). 
+ Implementations may also receive reference data or other + experiment-specific keys. + + Returns: + A float between 0.0 and 1.0 (inclusive). + """ + ... + + async def score_async(self, input_data: Any, output: str, context: dict) -> float: + """Async variant of score. + + The default implementation delegates to the synchronous ``score`` method. + Override this in scorers that need to perform async I/O (e.g. LLM calls, + HTTP requests). + """ + return self.score(input_data, output, context) diff --git a/backend/tests/test_scorer_base.py b/backend/tests/test_scorer_base.py new file mode 100644 index 0000000..9651707 --- /dev/null +++ b/backend/tests/test_scorer_base.py @@ -0,0 +1,112 @@ +"""Tests for the BaseScorer abstract class.""" + +import asyncio +from typing import Any + +import pytest + +from engine.scorers.base import BaseScorer + + +class ConcreteScorer(BaseScorer): + """Minimal concrete scorer for testing.""" + + @property + def name(self) -> str: + return "concrete" + + def score(self, input_data: Any, output: str, context: dict) -> float: + return 0.75 + + +class AsyncOverrideScorer(BaseScorer): + """Scorer that overrides the async variant.""" + + @property + def name(self) -> str: + return "async_override" + + def score(self, input_data: Any, output: str, context: dict) -> float: + return 0.5 + + async def score_async(self, input_data: Any, output: str, context: dict) -> float: + return 0.99 + + +class ContextAwareScorer(BaseScorer): + """Scorer that uses context dict fields.""" + + @property + def name(self) -> str: + return "context_aware" + + def score(self, input_data: Any, output: str, context: dict) -> float: + # Use all expected context keys + config = context.get("config", {}) + stages = context.get("stages", []) + ref = context.get("input_data") + if config and stages and ref is not None: + return 1.0 + return 0.0 + + +class TestBaseScorerInterface: + def test_cannot_instantiate_abstract_class(self): + with pytest.raises(TypeError): + 
BaseScorer() + + def test_concrete_scorer_has_name(self): + scorer = ConcreteScorer() + assert scorer.name == "concrete" + + def test_concrete_scorer_returns_float(self): + scorer = ConcreteScorer() + result = scorer.score("input", "output", {}) + assert isinstance(result, float) + assert result == 0.75 + + def test_score_async_defaults_to_sync(self): + scorer = ConcreteScorer() + result = asyncio.run( + scorer.score_async("input", "output", {}) + ) + assert result == 0.75 + + def test_score_async_override(self): + scorer = AsyncOverrideScorer() + sync_result = scorer.score("input", "output", {}) + async_result = asyncio.run( + scorer.score_async("input", "output", {}) + ) + assert sync_result == 0.5 + assert async_result == 0.99 + + def test_context_dict_keys(self): + scorer = ContextAwareScorer() + context = { + "config": {"model": "gpt-4"}, + "stages": [{"output": "stage1 output"}], + "input_data": "some input", + } + result = scorer.score("some input", "output", context) + assert result == 1.0 + + def test_context_dict_missing_keys(self): + scorer = ContextAwareScorer() + result = scorer.score("input", "output", {}) + assert result == 0.0 + + def test_isinstance_check(self): + scorer = ConcreteScorer() + assert isinstance(scorer, BaseScorer) + + def test_partial_implementation_raises(self): + """A class that only implements name but not score cannot be instantiated.""" + + class PartialScorer(BaseScorer): + @property + def name(self) -> str: + return "partial" + + with pytest.raises(TypeError): + PartialScorer()