Adds backend/engine/scorers/base.py, which defines the BaseScorer abstract base class with an abstract `name` property, an abstract `score()` method, and a default `score_async()` implementation that delegates to `score()`. Updates the scorers `__init__.py` to export BaseScorer. Includes 9 tests covering instantiation guards, sync/async dispatch, context dict usage, and rejection of partial implementations.
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
"""Base scorer abstract class for PromptLooper scoring framework."""
|
||
|
||
from abc import ABC, abstractmethod
|
||
from typing import Any
|
||
|
||
|
||
class BaseScorer(ABC):
    """Abstract base class for all scorers.

    Scorers evaluate LLM outputs and return a float score in the 0.0–1.0 range.

    Concrete subclasses must provide both the ``name`` property and the
    ``score`` method; a class missing either one cannot be instantiated.
    ``score_async`` comes with a default implementation and only needs to be
    overridden by scorers that do real asynchronous work.
    """

    # ``@property`` must stay outermost so that the abstract marker set by
    # ``@abstractmethod`` on the getter is visible on the resulting property.
    @property
    @abstractmethod
    def name(self) -> str:
        """Unique identifier for this scorer."""
        ...

    @abstractmethod
    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Score an LLM output.

        Args:
            input_data: The original input fed to the experiment.
            output: The LLM-generated output text to evaluate.
            context: Dict containing:
                - config: The experiment configuration dict.
                - stages: List of completed stage result dicts.
                - input_data: Same as the input_data argument (for convenience).
                Implementations may also receive reference data or other
                experiment-specific keys.

        Returns:
            A float between 0.0 and 1.0 (inclusive).
        """
        ...

    async def score_async(self, input_data: Any, output: str, context: dict) -> float:
        """Async variant of ``score``.

        The default implementation delegates to the synchronous ``score``
        method, calling it directly inside this coroutine (it is not awaited
        and not offloaded to another thread). Override this in scorers that
        need to perform async I/O (e.g. LLM calls, HTTP requests).

        Args:
            input_data: The original input fed to the experiment.
            output: The LLM-generated output text to evaluate.
            context: The same context dict that ``score`` receives.

        Returns:
            Whatever ``score`` returns for the same arguments.
        """
        return self.score(input_data, output, context)
|