# promptlooper/backend/engine/scorers/llm_judge.py

"""LLM-as-judge scorer.

Sends the LLM output to a separate LLM with a configurable judge prompt,
asks for a 1–10 rating, and normalizes to the 0.0–1.0 range.

**This scorer costs tokens** — every evaluation makes an LLM call.  The
judge's response is cached via PromptLooper's response cache layer to
avoid redundant calls when re-scoring the same output.
"""

import re
from typing import Any

import httpx

from engine.scorers.base import BaseScorer


# System prompt given to the judge LLM unless the caller passes its own
# ``judge_prompt`` when constructing the scorer.
DEFAULT_JUDGE_PROMPT = (
    "You are an impartial evaluator. "
    "You will receive an input and an LLM-generated output. "
    "Rate the quality of the output on a scale of 1 to 10, "
    "where 1 is terrible and 10 is perfect."
    "\n\n"
    "Respond with ONLY a single integer between 1 and 10. "
    "Do not include any other text."
)

# Matches a standalone integer from 1 to 10 (word boundaries on both sides).
# Used with ``search`` so the first such number in the judge reply wins.
_RATING_RE = re.compile(r"\b(10|[1-9])\b")


class LLMJudgeScorer(BaseScorer):
    """Score outputs by asking a separate LLM to rate them 1–10.

    The judge is reached by POSTing to ``{base_url}/chat/completions``.  Its
    reply is converted to a float in the 0.0–1.0 range by ``_parse_rating``.

    Args:
        base_url: Chat completions API base URL (a trailing ``/`` is removed).
        model: Model to use for judging.
        api_key: Optional API key, sent as a ``Bearer`` token when set.
        judge_prompt: System prompt for the judge LLM.
        timeout: HTTP request timeout in seconds.
        max_retries: Total number of attempts made against the judge
            endpoint.  Values below 1 are treated as 1 so that the judge is
            always called at least once.
        cache_layer: Optional ``ResponseCacheLayer`` instance.  When provided,
            judge responses are cached to avoid duplicate LLM calls.
        db_session_factory: Callable returning a SQLAlchemy ``Session``.
            Required when *cache_layer* is supplied; caching is skipped
            unless both are given.
    """

    # Marker for the UI so it can warn users about token cost.
    COSTS_TOKENS = True

    # HTTP status codes treated as transient and therefore retried.
    _RETRYABLE_STATUSES = frozenset({429, 500, 502, 503, 504})

    def __init__(
        self,
        base_url: str = "http://localhost:11434/v1",
        model: str = "llama3",
        api_key: str | None = None,
        judge_prompt: str = DEFAULT_JUDGE_PROMPT,
        timeout: float = 120.0,
        max_retries: int = 3,
        cache_layer: Any = None,
        db_session_factory: Any = None,
    ) -> None:
        """Store the judge configuration; performs no I/O."""
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.judge_prompt = judge_prompt
        self.timeout = timeout
        self.max_retries = max_retries
        self._cache_layer = cache_layer
        self._db_session_factory = db_session_factory

    @property
    def name(self) -> str:
        """Identifier of this scorer (``"llm_judge"``)."""
        return "llm_judge"

    # ------------------------------------------------------------------
    # Synchronous entry point
    # ------------------------------------------------------------------

    def score(self, input_data: Any, output: str, context: dict) -> float:
        """Synchronous scoring — runs :meth:`score_async` to completion.

        Returns:
            The normalised score in the 0.0–1.0 range.

        Raises:
            RuntimeError: If called while an event loop is already running in
                this thread (use :meth:`score_async` there instead), or if
                every attempt to reach the judge fails.
        """
        import asyncio

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: safe to drive one ourselves.
            pass
        else:
            raise RuntimeError(
                "LLMJudgeScorer.score() cannot be called from an async context. "
                "Use score_async() instead."
            )

        # asyncio.run() creates and closes its own loop, so this also works in
        # worker threads and after an earlier asyncio.run() call, where
        # asyncio.get_event_loop() has no current loop to return.
        return asyncio.run(self.score_async(input_data, output, context))

    # ------------------------------------------------------------------
    # Async entry point
    # ------------------------------------------------------------------

    async def score_async(
        self, input_data: Any, output: str, context: dict
    ) -> float:
        """Ask the judge LLM to rate the output and return a normalised score.

        When both a cache layer and a session factory were supplied, a cached
        judge response is used if present; otherwise the judge is called and
        its non-empty response is stored.

        Returns:
            The normalised score in the 0.0–1.0 range (0.0 when the judge
            response contains no usable rating).
        """
        user_message = self._build_user_message(input_data, output, context)

        # Compare against None rather than relying on truthiness so that a
        # cache object which happens to be falsy is still used.
        use_cache = (
            self._cache_layer is not None
            and self._db_session_factory is not None
        )

        # Check cache first.
        config_hash: str | None = None
        if use_cache:
            from engine.cache import compute_config_hash

            config_hash = compute_config_hash(
                prompt=self.judge_prompt,
                model=self.model,
                params={"scorer": "llm_judge"},
                input_data=user_message,
            )
            db = self._db_session_factory()
            try:
                cached = self._cache_layer.get(db, config_hash)
                if cached is not None:
                    # Read the response while the session is still open.
                    return _parse_rating(cached.response)
            finally:
                db.close()

        # Call the judge LLM.
        judge_response = await self._call_judge(user_message)

        # Cache the judge response.  An empty response carries no rating, so
        # storing it would pin this output to 0.0 on every later re-score.
        if use_cache and config_hash and judge_response:
            db = self._db_session_factory()
            try:
                self._cache_layer.put(
                    db,
                    config_hash=config_hash,
                    response=judge_response,
                    model=self.model,
                )
            finally:
                db.close()

        return _parse_rating(judge_response)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_user_message(
        self, input_data: Any, output: str, context: dict
    ) -> str:
        """Build the user message sent to the judge LLM.

        The message is made of markdown-style sections separated by blank
        lines: ``## Input`` (omitted when *input_data* is ``None``),
        ``## Output``, and ``## Reference Answer`` (only when *context* has a
        truthy ``"reference"`` entry).
        """
        sections: list[str] = []
        if input_data is not None:
            sections.append(f"## Input\n{input_data}")
        sections.append(f"## Output\n{output}")

        # Include reference answer if available — helps the judge compare.
        # A missing (None) context is treated like an empty one.
        reference = (context or {}).get("reference")
        if reference:
            sections.append(f"## Reference Answer\n{reference}")

        return "\n\n".join(sections)

    @staticmethod
    def _extract_content(data: Any) -> str:
        """Return the stripped text of the first choice, or ``""`` if absent.

        Tolerates ``null`` values for ``choices``, ``message`` and
        ``content``, which would otherwise break the ``.strip()`` call.
        """
        choices = data.get("choices") or []
        if not choices:
            return ""
        message = choices[0].get("message") or {}
        return (message.get("content") or "").strip()

    async def _call_judge(self, user_message: str) -> str:
        """Send a chat completion request to the judge LLM with retries.

        Transport errors and the status codes in ``_RETRYABLE_STATUSES`` are
        retried with exponential back-off (1s, 2s, 4s, ...).

        Returns:
            The stripped content of the first choice, or ``""`` when the
            response has no choices or no content.

        Raises:
            httpx.HTTPStatusError: On a non-retryable error status.
            RuntimeError: When every attempt failed; the last underlying
                error is attached as ``__cause__``.
        """
        import asyncio

        url = f"{self.base_url}/chat/completions"

        headers: dict[str, str] = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        body = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": self.judge_prompt},
                {"role": "user", "content": user_message},
            ],
            "temperature": 0.0,
            "max_tokens": 16,
        }

        # Always try at least once, even if max_retries was configured as 0.
        attempts = max(1, self.max_retries)
        last_exc: Exception | None = None

        # One client for all attempts so retries share a connection pool.
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(self.timeout), headers=headers
        ) as client:
            for attempt in range(attempts):
                try:
                    resp = await client.post(url, json=body)
                except httpx.HTTPError as exc:
                    # Transport-level failure (timeout, connection error, ...).
                    last_exc = exc
                else:
                    if resp.status_code == 200:
                        return self._extract_content(resp.json())

                    if resp.status_code not in self._RETRYABLE_STATUSES:
                        # Non-retryable error status: surface it immediately.
                        resp.raise_for_status()

                    last_exc = httpx.HTTPStatusError(
                        f"HTTP {resp.status_code}",
                        request=resp.request,
                        response=resp,
                    )

                # Back off before the next attempt, but not after the last.
                if attempt < attempts - 1:
                    await asyncio.sleep(2**attempt)

        raise RuntimeError(
            f"All {attempts} attempts failed for judge LLM at {url}"
        ) from last_exc


def _parse_rating(text: str) -> float:
    """Extract a 1–10 rating from the judge response and normalise to 0.0–1.0.

    Falls back to 0.0 if no valid rating is found.
    """
    match = _RATING_RE.search(text)
    if match is None:
        return 0.0
    rating = int(match.group(1))
    # Normalise: 1 → ~0.0, 10 → 1.0
    return max(0.0, min(1.0, (rating - 1) / 9.0))