chrysopedia/backend/pipeline/quality/scorer.py
jlightner 5223772756 feat: Built ScoreRunner with 5-dimension LLM-as-judge scoring rubric, C…
- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/__main__.py"
- "backend/pipeline/quality/fixtures/sample_moments.json"
- "backend/pipeline/quality/fixtures/__init__.py"

GSD-Task: S02/T01
2026-04-01 08:53:40 +00:00

263 lines
10 KiB
Python

"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions.
Evaluates a synthesized technique page against source moments on:
1. Structural quality — section naming, count, paragraph depth
2. Content specificity — concrete details vs vague generalities
3. Voice preservation — direct quotes, attributed opinions, personality
4. Readability / flow — synthesis quality, logical ordering, no redundancy
5. Factual fidelity — no hallucinated specifics, grounded in source moments
Run via: python -m pipeline.quality score --file <path>
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass, field
import openai
from pydantic import BaseModel
from pipeline.llm_client import LLMClient
logger = logging.getLogger(__name__)
# ── Scoring rubric (hardcoded for iteration speed) ───────────────────────────
# System prompt for the LLM judge. Kept inline (rather than in a prompt file
# or config) so the rubric can be tweaked quickly during iteration. The JSON
# structure demanded at the bottom is what _parse_scores expects.
SCORING_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education.
You will be given:
1. A synthesized technique page (JSON with title, summary, body_sections)
2. The source key moments (transcript excerpts, summaries, tags) used to create it
Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0:
**structural** — Section naming and organization
- 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section
- 0.5-0.7: Acceptable structure but some generic section names or uneven depth
- 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections
**content_specificity** — Concrete technical details
- 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values
- 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings")
- 0.0-0.3: Mostly vague generalities with few concrete values from the source material
**voice_preservation** — Creator's authentic voice
- 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained
- 0.5-0.7: Some paraphrased references to creator's views but few direct quotes
- 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution
**readability** — Synthesis quality and flow
- 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction
- 0.5-0.7: Generally readable but some awkward transitions or minor repetition
- 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages
**factual_fidelity** — Grounded in source material
- 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques
- 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources
Return ONLY a JSON object with this exact structure:
{
"structural": <float 0.0-1.0>,
"content_specificity": <float 0.0-1.0>,
"voice_preservation": <float 0.0-1.0>,
"readability": <float 0.0-1.0>,
"factual_fidelity": <float 0.0-1.0>,
"justifications": {
"structural": "<1-2 sentence justification>",
"content_specificity": "<1-2 sentence justification>",
"voice_preservation": "<1-2 sentence justification>",
"readability": "<1-2 sentence justification>",
"factual_fidelity": "<1-2 sentence justification>"
}
}
"""
# The five rubric dimensions, in report/display order. These strings must
# stay in sync with both the JSON keys requested in SCORING_RUBRIC above and
# the ScoreResult field names (print_report reads them via getattr).
DIMENSIONS = [
"structural",
"content_specificity",
"voice_preservation",
"readability",
"factual_fidelity",
]
# ── Result type ──────────────────────────────────────────────────────────────
@dataclass
class ScoreResult:
    """Outcome of scoring a technique page across 5 quality dimensions.

    The five dimension fields carry the same names as the entries in
    ``DIMENSIONS`` so they can be read generically via ``getattr``.
    On failure, the dimension scores stay at their 0.0 defaults and
    ``error`` describes what went wrong.
    """
    # Per-dimension scores, each clamped to [0.0, 1.0] during parsing.
    structural: float = 0.0
    content_specificity: float = 0.0
    voice_preservation: float = 0.0
    readability: float = 0.0
    factual_fidelity: float = 0.0
    # Unweighted mean of the five dimension scores, rounded to 3 decimals.
    composite: float = 0.0
    # Dimension name -> 1-2 sentence justification from the judge (or a
    # placeholder when the judge response was missing/invalid for that key).
    justifications: dict[str, str] = field(default_factory=dict)
    # Wall-clock time of the judge call, in seconds (rounded to 2 decimals).
    elapsed_seconds: float = 0.0
    # Human-readable failure description; None on success.
    error: str | None = None
# ── Runner ───────────────────────────────────────────────────────────────────
class ScoreRunner:
    """Scores a Stage 5 technique page using LLM-as-judge evaluation.

    Sends the synthesized page plus its source moments to the judge model
    (system prompt = ``SCORING_RUBRIC``), parses the judge's JSON reply,
    and returns a :class:`ScoreResult`. Transport failures and malformed
    judge output are reported via ``ScoreResult.error`` rather than raised.
    """

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    def score_page(
        self,
        page_json: dict,
        moments: list[dict],
    ) -> ScoreResult:
        """Evaluate a technique page against source moments.

        Parameters
        ----------
        page_json:
            Synthesized page dict (title, summary, body_sections).
        moments:
            Source key moments with transcript_excerpt, summary, etc.

        Returns
        -------
        ScoreResult with per-dimension scores and justifications; on
        connection/timeout or unparseable judge output, a zeroed result
        with ``error`` set.
        """
        # Build the user prompt: page and moments as fenced JSON so the
        # judge can distinguish the two inputs unambiguously.
        user_prompt = (
            "## Synthesized Technique Page\n\n"
            f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n"
            "## Source Key Moments\n\n"
            f"```json\n{json.dumps(moments, indent=2)}\n```\n\n"
            "Score this page across all 5 dimensions."
        )
        t0 = time.monotonic()
        try:
            # NOTE(review): passing the bare pydantic BaseModel class as
            # response_model appears to rely on LLMClient treating any model
            # class as "enable JSON mode" — confirm against LLMClient.complete.
            resp = self.client.complete(
                system_prompt=SCORING_RUBRIC,
                user_prompt=user_prompt,
                response_model=BaseModel,  # triggers JSON mode
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            # Unreachable endpoint: report, don't raise, so batch runs continue.
            elapsed = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )
        # Parse the LLM judge response. Judges frequently wrap JSON output
        # in a markdown code fence despite instructions, so strip one first.
        raw_text = self._strip_code_fence(str(resp).strip())
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed judge response (not JSON): %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
            )
        return self._parse_scores(parsed, elapsed)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove one surrounding markdown code fence (``` or ```json), if any.

        Returns *text* unchanged when it is not fenced.
        """
        if not text.startswith("```"):
            return text
        # Drop the opening fence line (which may carry a language tag).
        newline = text.find("\n")
        if newline != -1:
            text = text[newline + 1:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult:
        """Extract and validate scores from parsed JSON response.

        Missing or non-numeric dimensions score 0.0 with a placeholder
        justification; valid values are clamped to [0.0, 1.0]. The
        composite is the unweighted mean across all 5 dimensions.
        """
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}
        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}
        for dim in DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue
            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp to [0, 1]
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue
            justifications[dim] = str(raw_justifications.get(dim, ""))
        composite = sum(scores.values()) / len(DIMENSIONS)
        return ScoreResult(
            structural=scores["structural"],
            content_specificity=scores["content_specificity"],
            voice_preservation=scores["voice_preservation"],
            readability=scores["readability"],
            factual_fidelity=scores["factual_fidelity"],
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def print_report(self, result: ScoreResult) -> None:
        """Print a formatted scoring report to stdout."""
        print("\n" + "=" * 60)
        print(" STAGE 5 QUALITY SCORE REPORT")
        print("=" * 60)
        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return
        for dim in DIMENSIONS:
            score = getattr(result, dim)
            bar = self._score_bar(score)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            if justification:
                # Wrap justification at ~60 chars
                for line in self._wrap(justification, 56):
                    print(f" {line}")
        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")

    @staticmethod
    def _score_bar(score: float, width: int = 20) -> str:
        """Render a visual bar for a 0-1 score, e.g. '█████░░░░░'."""
        filled = int(score * width)
        # Bug fix: both glyphs were empty string literals (likely lost to a
        # bad encoding round-trip), so the bar always rendered as "".
        return "█" * filled + "░" * (width - filled)

    @staticmethod
    def _wrap(text: str, width: int) -> list[str]:
        """Simple greedy word wrap; words longer than *width* are not broken."""
        words = text.split()
        lines: list[str] = []
        current = ""
        for word in words:
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return lines