chrysopedia/backend/pipeline/quality/chat_scorer.py
jlightner 846db2aad5 test: Created chat-specific LLM-as-judge scorer (5 dimensions), SSE-par…
- "backend/pipeline/quality/chat_scorer.py"
- "backend/pipeline/quality/chat_eval.py"
- "backend/pipeline/quality/fixtures/chat_test_suite.yaml"
- "backend/pipeline/quality/__main__.py"

GSD-Task: S09/T01
2026-04-04 14:43:52 +00:00

271 lines
11 KiB
Python

"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
Scores chat responses across 5 dimensions:
- citation_accuracy: Are citations real and correctly numbered?
- response_structure: Concise, well-organized, uses appropriate formatting?
- domain_expertise: Music production terminology used naturally?
- source_grounding: Claims backed by provided sources, no fabrication?
- personality_fidelity: At weight>0, response reflects creator voice?
Run via: python -m pipeline.quality chat_eval --suite <path>
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass, field
import openai
from pipeline.llm_client import LLMClient
logger = logging.getLogger(__name__)
# The five chat quality dimensions scored by the LLM judge, in report order.
# Each entry must match a top-level float key in the judge's JSON reply (see
# CHAT_RUBRIC) and has a matching convenience property on ChatScoreResult.
CHAT_DIMENSIONS = [
    "citation_accuracy",
    "response_structure",
    "domain_expertise",
    "source_grounding",
    "personality_fidelity",
]
# System prompt for the LLM judge. Defines the 0.0-1.0 scoring rubric for the
# five CHAT_DIMENSIONS and demands a strict JSON-only reply, which
# ChatScoreRunner._parse_scores then parses. Edit with care: the JSON schema
# spelled out at the bottom must stay in sync with CHAT_DIMENSIONS.
CHAT_RUBRIC = """\
You are an expert evaluator of AI chat response quality for a music production knowledge base.
You will be given:
1. The user's query
2. The assistant's response
3. The numbered source citations that were provided to the assistant
4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
5. The creator_name (if any)
Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
**response_structure** — Response is concise, well-organized, uses appropriate formatting
- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
**domain_expertise** — Music production terminology used naturally and correctly
- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
**source_grounding** — Claims are backed by provided sources, no fabrication
- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
- If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
- If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
- If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
- If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
Return ONLY a JSON object with this exact structure:
{
"citation_accuracy": <float 0.0-1.0>,
"response_structure": <float 0.0-1.0>,
"domain_expertise": <float 0.0-1.0>,
"source_grounding": <float 0.0-1.0>,
"personality_fidelity": <float 0.0-1.0>,
"justifications": {
"citation_accuracy": "<1-2 sentence justification>",
"response_structure": "<1-2 sentence justification>",
"domain_expertise": "<1-2 sentence justification>",
"source_grounding": "<1-2 sentence justification>",
"personality_fidelity": "<1-2 sentence justification>"
}
}
"""
@dataclass
class ChatScoreResult:
    """Outcome of scoring a chat response across quality dimensions.

    A failed judge call is represented by ``error`` being set; in that case
    ``scores`` stays empty and every dimension property reads as 0.0.
    """

    scores: dict[str, float] = field(default_factory=dict)       # per-dimension 0.0-1.0
    composite: float = 0.0                                       # mean of all dimensions
    justifications: dict[str, str] = field(default_factory=dict) # judge's reasoning per dimension
    elapsed_seconds: float = 0.0                                 # wall-clock judge latency
    error: str | None = None                                     # set when scoring failed

    def _dim(self, name: str) -> float:
        """Return the score for one dimension, defaulting to 0.0 when absent."""
        return self.scores.get(name, 0.0)

    # Convenience read-only accessors, one per CHAT_DIMENSIONS entry.
    @property
    def citation_accuracy(self) -> float:
        return self._dim("citation_accuracy")

    @property
    def response_structure(self) -> float:
        return self._dim("response_structure")

    @property
    def domain_expertise(self) -> float:
        return self._dim("domain_expertise")

    @property
    def source_grounding(self) -> float:
        return self._dim("source_grounding")

    @property
    def personality_fidelity(self) -> float:
        return self._dim("personality_fidelity")
class ChatScoreRunner:
    """Scores chat responses using LLM-as-judge evaluation."""

    def __init__(self, client: LLMClient) -> None:
        # Client used to call the judge model (see score_response).
        self.client = client
def score_response(
self,
query: str,
response: str,
sources: list[dict],
personality_weight: float = 0.0,
creator_name: str | None = None,
) -> ChatScoreResult:
"""Score a single chat response against the 5 chat quality dimensions.
Parameters
----------
query:
The user's original query.
response:
The assistant's accumulated response text.
sources:
List of source citation dicts (as emitted by the SSE sources event).
personality_weight:
0.0 = encyclopedic mode, >0 = personality mode.
creator_name:
Creator name, if this was a creator-scoped query.
Returns
-------
ChatScoreResult with per-dimension scores.
"""
sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"
user_prompt = (
f"## User Query\n\n{query}\n\n"
f"## Assistant Response\n\n{response}\n\n"
f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
f"## Metadata\n\n"
f"- personality_weight: {personality_weight}\n"
f"- creator_name: {creator_name or '(none)'}\n\n"
f"Score this chat response across all 5 dimensions."
)
t0 = time.monotonic()
try:
from pydantic import BaseModel as _BM
resp = self.client.complete(
system_prompt=CHAT_RUBRIC,
user_prompt=user_prompt,
response_model=_BM,
modality="chat",
)
elapsed = round(time.monotonic() - t0, 2)
except (openai.APIConnectionError, openai.APITimeoutError) as exc:
elapsed = round(time.monotonic() - t0, 2)
return ChatScoreResult(
elapsed_seconds=elapsed,
error=f"Cannot reach LLM judge. Error: {exc}",
)
raw_text = str(resp).strip()
try:
parsed = json.loads(raw_text)
except json.JSONDecodeError:
logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
return ChatScoreResult(
elapsed_seconds=elapsed,
error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
)
return self._parse_scores(parsed, elapsed)
def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
"""Extract and validate scores from parsed JSON judge response."""
scores: dict[str, float] = {}
justifications: dict[str, str] = {}
raw_justifications = parsed.get("justifications", {})
if not isinstance(raw_justifications, dict):
raw_justifications = {}
for dim in CHAT_DIMENSIONS:
raw = parsed.get(dim)
if raw is None:
logger.warning("Missing dimension '%s' in chat judge response", dim)
scores[dim] = 0.0
justifications[dim] = "(missing from judge response)"
continue
try:
val = float(raw)
scores[dim] = max(0.0, min(1.0, val))
except (TypeError, ValueError):
logger.warning("Invalid value for '%s': %r", dim, raw)
scores[dim] = 0.0
justifications[dim] = f"(invalid value: {raw!r})"
continue
justifications[dim] = str(raw_justifications.get(dim, ""))
composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0
return ChatScoreResult(
scores=scores,
composite=round(composite, 3),
justifications=justifications,
elapsed_seconds=elapsed,
)
def print_report(self, result: ChatScoreResult, query: str = "") -> None:
"""Print a formatted chat scoring report to stdout."""
print("\n" + "=" * 60)
print(" CHAT QUALITY SCORE REPORT")
if query:
print(f" Query: {query[:60]}{'...' if len(query) > 60 else ''}")
print("=" * 60)
if result.error:
print(f"\n ✗ Error: {result.error}\n")
print("=" * 60 + "\n")
return
for dim in CHAT_DIMENSIONS:
score = result.scores.get(dim, 0.0)
filled = int(score * 20)
bar = "" * filled + "" * (20 - filled)
justification = result.justifications.get(dim, "")
print(f"\n {dim.replace('_', ' ').title()}")
print(f" Score: {score:.2f} {bar}")
if justification:
# Simple word wrap at ~56 chars
words = justification.split()
lines: list[str] = []
current = ""
for word in words:
if current and len(current) + len(word) + 1 > 56:
lines.append(current)
current = word
else:
current = f"{current} {word}" if current else word
if current:
lines.append(current)
for line in lines:
print(f" {line}")
print("\n" + "-" * 60)
print(f" Composite: {result.composite:.3f}")
print(f" Time: {result.elapsed_seconds}s")
print("=" * 60 + "\n")