MAESTRO: Implement BaseScorer abstract class with sync/async scoring interface
Adds backend/engine/scorers/base.py with abstract name property, score() method, and score_async() default implementation. Updates scorers __init__.py to export BaseScorer. Includes 9 tests covering instantiation guards, sync/async dispatch, context dict usage, and partial implementation rejection.
This commit is contained in:
parent
ba8cb7e2c6
commit
405bbf8206
4 changed files with 164 additions and 1 deletions
|
|
@@ -14,7 +14,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
|
||||||
- [x] Implement backend/engine/sweep.py for sweep orchestration. Support three sweep types: GridSweep (enumerate all combinations from parameter_space), RandomSweep (sample N random configs from parameter ranges), GuidedSweep (use previous results to inform next config — start with top-K exploitation + random exploration). The sweep runner should: respect MAX_CONCURRENT_RUNS for parallelism, track token budget and stop at MAX_TOKENS_PER_SWEEP, emit WebSocket events for each run completion, handle pause/resume/stop via Redis flags.
|
- [x] Implement backend/engine/sweep.py for sweep orchestration. Support three sweep types: GridSweep (enumerate all combinations from parameter_space), RandomSweep (sample N random configs from parameter ranges), GuidedSweep (use previous results to inform next config — start with top-K exploitation + random exploration). The sweep runner should: respect MAX_CONCURRENT_RUNS for parallelism, track token budget and stop at MAX_TOKENS_PER_SWEEP, emit WebSocket events for each run completion, handle pause/resume/stop via Redis flags.
|
||||||
<!-- Completed: Implemented all 3 sweep types (grid/random/guided), bounded parallelism via asyncio.Semaphore, token budget enforcement, Redis-based pause/resume/stop flags, sweep-level events. 36 tests in test_sweep.py, all passing. -->
|
<!-- Completed: Implemented all 3 sweep types (grid/random/guided), bounded parallelism via asyncio.Semaphore, token budget enforcement, Redis-based pause/resume/stop flags, sweep-level events. 36 tests in test_sweep.py, all passing. -->
|
||||||
|
|
||||||
- [ ] Implement backend/engine/scorers/base.py defining the BaseScorer abstract class with: name property, score(input_data, output, context) → float (0.0 to 1.0), and an optional async variant. The context dict should include the experiment config, stage results, and any reference data.
|
- [x] Implement backend/engine/scorers/base.py defining the BaseScorer abstract class with: name property, score(input_data, output, context) → float (0.0 to 1.0), and an optional async variant. The context dict should include the experiment config, stage results, and any reference data.
|
||||||
|
<!-- Completed: BaseScorer ABC with name property, score() abstract method, score_async() default implementation. 9 tests in test_scorer_base.py, all passing. -->
|
||||||
|
|
||||||
- [ ] Implement backend/engine/scorers/embedding.py — uses a configurable embedding endpoint (Ollama nomic-embed-text or any OpenAI-compatible embedding API) to compute cosine similarity between output and reference answer. Normalize to 0.0–1.0 range.
|
- [ ] Implement backend/engine/scorers/embedding.py — uses a configurable embedding endpoint (Ollama nomic-embed-text or any OpenAI-compatible embedding API) to compute cosine similarity between output and reference answer. Normalize to 0.0–1.0 range.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -0,0 +1,5 @@
|
||||||
|
"""Scorer framework for evaluating LLM outputs."""
|
||||||
|
|
||||||
|
from engine.scorers.base import BaseScorer
|
||||||
|
|
||||||
|
__all__ = ["BaseScorer"]
|
||||||
45
backend/engine/scorers/base.py
Normal file
45
backend/engine/scorers/base.py
Normal file
|
|
@@ -0,0 +1,45 @@
|
||||||
|
"""Base scorer abstract class for PromptLooper scoring framework."""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class BaseScorer(ABC):
    """Common interface that every scorer implements.

    A scorer judges an LLM output and reports a float score in the
    0.0–1.0 range. The range is a contract for implementers; this base
    class performs no validation of the returned value.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the unique identifier for this scorer."""

    @abstractmethod
    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Evaluate an LLM output synchronously.

        Args:
            input_data: The original input fed to the experiment.
            output: The LLM-generated output text to evaluate.
            context: Dict containing:
                - config: The experiment configuration dict.
                - stages: List of completed stage result dicts.
                - input_data: Same as the input_data argument (for
                  convenience).
                Implementations may also receive reference data or other
                experiment-specific keys.

        Returns:
            A float between 0.0 and 1.0 (inclusive).
        """

    async def score_async(self, input_data: Any, output: str, context: dict) -> float:
        """Evaluate an LLM output from async code.

        Unless overridden, this simply calls the synchronous ``score``
        method with the same arguments and returns its result. Scorers
        that need to perform async I/O (e.g. LLM calls, HTTP requests)
        should override this method.
        """
        sync_result = self.score(input_data, output, context)
        return sync_result
|
||||||
112
backend/tests/test_scorer_base.py
Normal file
112
backend/tests/test_scorer_base.py
Normal file
|
|
@@ -0,0 +1,112 @@
|
||||||
|
"""Tests for the BaseScorer abstract class."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from engine.scorers.base import BaseScorer
|
||||||
|
|
||||||
|
|
||||||
|
class ConcreteScorer(BaseScorer):
    """Smallest possible complete scorer, used as a test fixture."""

    @property
    def name(self) -> str:
        """Return the fixed identifier of this fixture."""
        return "concrete"

    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Return a constant score, ignoring every argument."""
        return 0.75
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncOverrideScorer(BaseScorer):
    """Fixture whose async path returns a different value than its sync path."""

    @property
    def name(self) -> str:
        """Return the fixed identifier of this fixture."""
        return "async_override"

    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Return the constant sync score, ignoring every argument."""
        return 0.5

    async def score_async(self, input_data: Any, output: str, context: dict) -> float:
        """Return the constant async score without delegating to ``score``."""
        return 0.99
|
||||||
|
|
||||||
|
|
||||||
|
class ContextAwareScorer(BaseScorer):
    """Fixture that scores based on which context dict fields are present."""

    @property
    def name(self) -> str:
        """Return the fixed identifier of this fixture."""
        return "context_aware"

    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Return 1.0 when the context is fully populated, else 0.0.

        "Fully populated" means a truthy ``config``, a truthy ``stages``
        and an ``input_data`` entry that is not ``None``.
        """
        # Guard clauses, checked in the same order as the expected keys.
        if not context.get("config", {}):
            return 0.0
        if not context.get("stages", []):
            return 0.0
        if context.get("input_data") is None:
            return 0.0
        return 1.0
|
||||||
|
|
||||||
|
|
||||||
|
class TestBaseScorerInterface:
    """Behavioural tests for the ``BaseScorer`` abstract interface."""

    def test_cannot_instantiate_abstract_class(self):
        """The abstract base itself must refuse instantiation."""
        with pytest.raises(TypeError):
            BaseScorer()

    def test_concrete_scorer_has_name(self):
        """A complete subclass exposes its ``name`` property."""
        scorer = ConcreteScorer()
        assert scorer.name == "concrete"

    def test_concrete_scorer_returns_float(self):
        """The sync ``score`` of the fixture returns its constant float."""
        scorer = ConcreteScorer()
        result = scorer.score("input", "output", {})
        assert isinstance(result, float)
        assert result == 0.75

    def test_score_async_defaults_to_sync(self):
        """Without an override, ``score_async`` yields the sync result."""
        scorer = ConcreteScorer()
        # asyncio.run creates and closes its own loop; the previous
        # get_event_loop().run_until_complete() form is deprecated when no
        # loop is running and fails on newer Python versions.
        result = asyncio.run(scorer.score_async("input", "output", {}))
        assert result == 0.75

    def test_score_async_override(self):
        """An overridden ``score_async`` is independent of ``score``."""
        scorer = AsyncOverrideScorer()
        sync_result = scorer.score("input", "output", {})
        async_result = asyncio.run(scorer.score_async("input", "output", {}))
        assert sync_result == 0.5
        assert async_result == 0.99

    def test_context_dict_keys(self):
        """A context carrying config, stages and input_data scores 1.0."""
        scorer = ContextAwareScorer()
        context = {
            "config": {"model": "gpt-4"},
            "stages": [{"output": "stage1 output"}],
            "input_data": "some input",
        }
        result = scorer.score("some input", "output", context)
        assert result == 1.0

    def test_context_dict_missing_keys(self):
        """An empty context scores 0.0 instead of raising."""
        scorer = ContextAwareScorer()
        result = scorer.score("input", "output", {})
        assert result == 0.0

    def test_isinstance_check(self):
        """Concrete scorers are instances of ``BaseScorer``."""
        scorer = ConcreteScorer()
        assert isinstance(scorer, BaseScorer)

    def test_partial_implementation_raises(self):
        """A class that only implements name but not score cannot be instantiated."""

        class PartialScorer(BaseScorer):
            @property
            def name(self) -> str:
                return "partial"

        with pytest.raises(TypeError):
            PartialScorer()
|
||||||
Loading…
Add table
Reference in a new issue