chrysopedia/backend/pipeline/quality/scorer.py
jlightner e740798f7c feat: Added STAGE_CONFIGS registry (stages 2-5) with per-stage rubrics,…
- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/variant_generator.py"

GSD-Task: S04/T01
2026-04-01 09:20:24 +00:00

614 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics.
Supports stages 2-5, each with its own scoring dimensions, rubric, format
markers, fixture key requirements, prompt file name, and output schema.
Run via: python -m pipeline.quality score --file <path>
"""
from __future__ import annotations
import json
import logging
import sys
import time
from dataclasses import dataclass, field
from typing import Any
import openai
from pydantic import BaseModel
from pipeline.llm_client import LLMClient
from pipeline.quality.voice_dial import VoiceDial
logger = logging.getLogger(__name__)
# ── Per-stage configuration registry ─────────────────────────────────────────
class StageConfig:
    """Per-stage scoring configuration bundle.

    One instance carries everything the scorer needs for a single
    pipeline stage: the dimension names to score, the judge rubric,
    format markers expected in stage output, required fixture keys,
    the prompt file name, and the name of the Pydantic output schema.
    """

    def __init__(
        self,
        stage: int,
        dimensions: list[str],
        rubric: str,
        format_markers: list[str],
        fixture_keys: list[str],
        prompt_file: str,
        schema_class: str,
    ) -> None:
        # Everything is stored verbatim; schema resolution is deferred
        # to get_schema() so module import stays cheap.
        self.stage = stage
        self.rubric = rubric
        self.dimensions = dimensions
        self.fixture_keys = fixture_keys
        self.format_markers = format_markers
        self.prompt_file = prompt_file
        self.schema_class = schema_class

    def get_schema(self) -> type[BaseModel]:
        """Resolve and return this stage's Pydantic schema class by name."""
        # Imported lazily to avoid a circular import at module load time.
        from pipeline import schemas

        return getattr(schemas, self.schema_class)
# ── Stage rubrics ────────────────────────────────────────────────────────────
# Stage 2 judge rubric: transcript segmentation quality, 4 dimensions.
# Used verbatim as the judge's system prompt; the trailing JSON template
# defines the exact response shape _parse_scores expects.
_STAGE_2_RUBRIC = """\
You are an expert evaluator of transcript segmentation quality for educational content.
You will be given:
1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary)
2. The source transcript segments used as input
Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0:
**coverage_completeness** — All transcript content accounted for
- 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps
- 0.5-0.7: Minor gaps or overlaps, but most content is covered
- 0.0-0.3: Large gaps — significant transcript segments are not assigned to any topic
**topic_specificity** — Topic labels are descriptive and useful
- 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing")
- 0.5-0.7: Labels are somewhat specific but could be more descriptive
- 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio")
**boundary_accuracy** — Segment boundaries align with actual topic transitions
- 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units
- 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics
- 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content
**summary_quality** — Summaries accurately describe segment content
- 0.9-1.0: Summaries capture the key points of each segment concisely and accurately
- 0.5-0.7: Summaries are acceptable but miss some key points or are too vague
- 0.0-0.3: Summaries are inaccurate, too generic, or missing
Return ONLY a JSON object with this exact structure:
{
"coverage_completeness": <float 0.0-1.0>,
"topic_specificity": <float 0.0-1.0>,
"boundary_accuracy": <float 0.0-1.0>,
"summary_quality": <float 0.0-1.0>,
"justifications": {
"coverage_completeness": "<1-2 sentence justification>",
"topic_specificity": "<1-2 sentence justification>",
"boundary_accuracy": "<1-2 sentence justification>",
"summary_quality": "<1-2 sentence justification>"
}
}
"""
# Stage 3 judge rubric: key moment extraction quality, 5 dimensions.
_STAGE_3_RUBRIC = """\
You are an expert evaluator of key moment extraction quality for educational content.
You will be given:
1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript)
2. The source topic segments used as input
Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0:
**moment_richness** — Extracted moments capture substantial, distinct insights
- 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary
- 0.5-0.7: Moments are valid but some are thin or overlap significantly with others
- 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed
**timestamp_accuracy** — Time ranges are plausible and well-bounded
- 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans
- 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow
- 0.0-0.3: Timestamps appear arbitrary or many are zero/identical
**content_type_correctness** — Content types match the actual moment content
- 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it
- 0.5-0.7: Most are correct but 1-2 are miscategorized
- 0.0-0.3: Content types seem randomly assigned or all the same
**summary_actionability** — Summaries provide actionable, specific information
- 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow
- 0.5-0.7: Summaries describe the topic but lack specific actionable details
- 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information
**plugin_normalization** — Plugin/tool names are correctly identified and normalized
- 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools
- 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed
- 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors
Return ONLY a JSON object with this exact structure:
{
"moment_richness": <float 0.0-1.0>,
"timestamp_accuracy": <float 0.0-1.0>,
"content_type_correctness": <float 0.0-1.0>,
"summary_actionability": <float 0.0-1.0>,
"plugin_normalization": <float 0.0-1.0>,
"justifications": {
"moment_richness": "<1-2 sentence justification>",
"timestamp_accuracy": "<1-2 sentence justification>",
"content_type_correctness": "<1-2 sentence justification>",
"summary_actionability": "<1-2 sentence justification>",
"plugin_normalization": "<1-2 sentence justification>"
}
}
"""
# Stage 4 judge rubric: content classification quality, 4 dimensions.
_STAGE_4_RUBRIC = """\
You are an expert evaluator of content classification quality for educational content.
You will be given:
1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags)
2. The source extracted moments used as input
Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0:
**category_accuracy** — Topic categories are appropriate and meaningful
- 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels
- 0.5-0.7: Most categories are reasonable but some are too broad or slightly off
- 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same
**tag_completeness** — All relevant tags are captured
- 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively
- 0.5-0.7: Main tags are present but secondary concepts or tools are missed
- 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments
**tag_specificity** — Tags are specific enough to be useful for search/filtering
- 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing")
- 0.5-0.7: Mix of specific and generic tags
- 0.0-0.3: Tags are too generic to meaningfully distinguish moments
**coverage** — All moments are classified
- 0.9-1.0: Every moment_index from the input has a corresponding classification entry
- 0.5-0.7: Most moments classified but 1-2 are missing
- 0.0-0.3: Many moments are not classified
Return ONLY a JSON object with this exact structure:
{
"category_accuracy": <float 0.0-1.0>,
"tag_completeness": <float 0.0-1.0>,
"tag_specificity": <float 0.0-1.0>,
"coverage": <float 0.0-1.0>,
"justifications": {
"category_accuracy": "<1-2 sentence justification>",
"tag_completeness": "<1-2 sentence justification>",
"tag_specificity": "<1-2 sentence justification>",
"coverage": "<1-2 sentence justification>"
}
}
"""
# Stage 5 judge rubric: synthesized technique page quality, 5 dimensions.
# Aliased below as SCORING_RUBRIC for backward compatibility.
_STAGE_5_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education.
You will be given:
1. A synthesized technique page (JSON with title, summary, body_sections)
2. The source key moments (transcript excerpts, summaries, tags) used to create it
Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0:
**structural** — Section naming and organization
- 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section
- 0.5-0.7: Acceptable structure but some generic section names or uneven depth
- 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections
**content_specificity** — Concrete technical details
- 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values
- 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings")
- 0.0-0.3: Mostly vague generalities with few concrete values from the source material
**voice_preservation** — Creator's authentic voice
- 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained
- 0.5-0.7: Some paraphrased references to creator's views but few direct quotes
- 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution
**readability** — Synthesis quality and flow
- 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction
- 0.5-0.7: Generally readable but some awkward transitions or minor repetition
- 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages
**factual_fidelity** — Grounded in source material
- 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques
- 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources
Return ONLY a JSON object with this exact structure:
{
"structural": <float 0.0-1.0>,
"content_specificity": <float 0.0-1.0>,
"voice_preservation": <float 0.0-1.0>,
"readability": <float 0.0-1.0>,
"factual_fidelity": <float 0.0-1.0>,
"justifications": {
"structural": "<1-2 sentence justification>",
"content_specificity": "<1-2 sentence justification>",
"voice_preservation": "<1-2 sentence justification>",
"readability": "<1-2 sentence justification>",
"factual_fidelity": "<1-2 sentence justification>"
}
}
"""
# Backward-compat alias used by synthesize_and_score and external references
SCORING_RUBRIC = _STAGE_5_RUBRIC

# Build the stage configs registry
# Maps pipeline stage number (2-5) -> its scoring configuration.
STAGE_CONFIGS: dict[int, StageConfig] = {
    # Stage 2: transcript segmentation into topic segments.
    2: StageConfig(
        stage=2,
        dimensions=["coverage_completeness", "topic_specificity", "boundary_accuracy", "summary_quality"],
        rubric=_STAGE_2_RUBRIC,
        format_markers=["segments", "start_index", "end_index", "topic_label"],
        fixture_keys=["transcript_segments"],
        prompt_file="stage2_segmentation.txt",
        schema_class="SegmentationResult",
    ),
    # Stage 3: key moment extraction from topic segments.
    3: StageConfig(
        stage=3,
        dimensions=["moment_richness", "timestamp_accuracy", "content_type_correctness", "summary_actionability", "plugin_normalization"],
        rubric=_STAGE_3_RUBRIC,
        format_markers=["moments", "content_type", "raw_transcript", "plugins"],
        fixture_keys=["topic_segments"],
        prompt_file="stage3_extraction.txt",
        schema_class="ExtractionResult",
    ),
    # Stage 4: classification of extracted moments.
    4: StageConfig(
        stage=4,
        dimensions=["category_accuracy", "tag_completeness", "tag_specificity", "coverage"],
        rubric=_STAGE_4_RUBRIC,
        format_markers=["classifications", "moment_index", "topic_category", "topic_tags"],
        fixture_keys=["extracted_moments"],
        prompt_file="stage4_classification.txt",
        schema_class="ClassificationResult",
    ),
    # Stage 5: synthesis of technique pages from key moments.
    # Rubric is passed via the SCORING_RUBRIC alias (same object as
    # _STAGE_5_RUBRIC).
    5: StageConfig(
        stage=5,
        dimensions=["structural", "content_specificity", "voice_preservation", "readability", "factual_fidelity"],
        rubric=SCORING_RUBRIC,
        format_markers=["SynthesisResult", '"pages"', "body_sections", "title", "summary"],
        fixture_keys=["key_moments", "creator_name"],
        prompt_file="stage5_synthesis.txt",
        schema_class="SynthesisResult",
    ),
}

# Backward-compatible alias: stage 5 dimensions list
DIMENSIONS = STAGE_CONFIGS[5].dimensions
# ── Result type ──────────────────────────────────────────────────────────────
@dataclass
class ScoreResult:
    """Outcome of scoring a stage output across quality dimensions.

    Scores live in a generic ``scores`` dict keyed by dimension name;
    stage 5's historical named fields (structural, content_specificity,
    ...) remain available as read-only properties for older callers.
    """

    scores: dict[str, float] = field(default_factory=dict)
    composite: float = 0.0
    justifications: dict[str, str] = field(default_factory=dict)
    elapsed_seconds: float = 0.0
    error: str | None = None

    def _dim(self, name: str) -> float:
        """Look up a single dimension score, defaulting to 0.0 if absent."""
        return self.scores.get(name, 0.0)

    # ── Backward-compat read-only views of the stage 5 dimensions ────
    @property
    def structural(self) -> float:
        return self._dim("structural")

    @property
    def content_specificity(self) -> float:
        return self._dim("content_specificity")

    @property
    def voice_preservation(self) -> float:
        return self._dim("voice_preservation")

    @property
    def readability(self) -> float:
        return self._dim("readability")

    @property
    def factual_fidelity(self) -> float:
        return self._dim("factual_fidelity")
# ── Runner ───────────────────────────────────────────────────────────────────
class ScoreRunner:
    """Scores pipeline stage outputs using LLM-as-judge evaluation.

    Errors (unreachable endpoint, malformed judge replies) are reported
    via ``ScoreResult.error`` instead of being raised to callers.
    """

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    # ── Generic stage scorer ─────────────────────────────────────────────
    def score_stage_output(
        self,
        stage: int,
        output_json: dict | list,
        input_json: dict | list,
    ) -> ScoreResult:
        """Score an arbitrary stage's output against its input.

        Parameters
        ----------
        stage:
            Pipeline stage number (2-5).
        output_json:
            The stage output to evaluate (parsed JSON).
        input_json:
            The stage input / source material.

        Returns
        -------
        ScoreResult with per-dimension scores for the requested stage.
        """
        if stage not in STAGE_CONFIGS:
            return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}")
        cfg = STAGE_CONFIGS[stage]
        user_prompt = (
            "## Stage Output\n\n"
            f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n"
            "## Stage Input\n\n"
            f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n"
            f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions."
        )
        t0 = time.monotonic()
        try:
            # NOTE(review): response_model=BaseModel appears to request an
            # unstructured reply (the text is json.loads-ed below) — confirm
            # against LLMClient.complete's contract.
            resp = self.client.complete(
                system_prompt=cfg.rubric,
                user_prompt=user_prompt,
                response_model=BaseModel,
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}",
            )
        raw_text = str(resp).strip()
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed judge response (not JSON): %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
            )
        return self._parse_scores(parsed, elapsed, cfg.dimensions)

    # ── Stage 5 convenience (backward compat) ────────────────────────────
    def score_page(
        self,
        page_json: dict,
        moments: list[dict],
    ) -> ScoreResult:
        """Evaluate a stage 5 technique page against source moments."""
        # Fix: removed an unreachable second `return self._parse_scores(...)`
        # that followed this return and referenced names never bound here.
        return self.score_stage_output(
            stage=5,
            output_json=page_json,
            input_json=moments,
        )

    def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult:
        """Extract and validate scores from a parsed judge JSON response.

        Missing dimensions score 0.0 with a placeholder justification;
        non-numeric values score 0.0; numeric values are clamped to
        [0.0, 1.0]. Composite is the unweighted mean over ``dimensions``
        (defaults to the stage 5 DIMENSIONS list).
        """
        dims = dimensions or DIMENSIONS
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}
        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}
        for dim in dims:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue
            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp to [0, 1]
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue
            justifications[dim] = str(raw_justifications.get(dim, ""))
        composite = sum(scores.values()) / len(dims) if dims else 0.0
        return ScoreResult(
            scores=scores,
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def synthesize_and_score(
        self,
        moments: list[dict],
        creator_name: str,
        voice_level: float,
    ) -> ScoreResult:
        """Re-synthesize from source moments with a voice-dialed prompt, then score.

        Loads the stage 5 synthesis prompt from disk, applies the VoiceDial
        modifier at the given voice_level, calls the LLM to produce a
        SynthesisResult, then scores the first page.

        Parameters
        ----------
        moments:
            Source key moments (dicts with summary, transcript_excerpt, etc.)
        creator_name:
            Creator name to inject into the synthesis prompt.
        voice_level:
            Float 0.0-1.0 controlling voice preservation intensity.

        Returns
        -------
        ScoreResult with per-dimension scores after voice-dialed re-synthesis.
        ``elapsed_seconds`` includes both synthesis and scoring time.
        """
        from pipeline.schemas import SynthesisResult
        from pipeline.stages import _get_stage_config, _load_prompt

        # Load and modify the stage 5 system prompt
        try:
            base_prompt = _load_prompt("stage5_synthesis.txt")
        except FileNotFoundError as exc:
            return ScoreResult(error=f"Prompt file not found: {exc}")
        dial = VoiceDial(base_prompt)
        modified_prompt = dial.modify(voice_level)
        band = dial.band_name(voice_level)
        # Build user prompt in the same format as _synthesize_chunk
        moments_json = json.dumps(moments, indent=2)
        user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_json}\n</moments>"
        model_override, modality = _get_stage_config(5)
        print(f" Re-synthesizing at voice_level={voice_level} (band={band})...")
        t0 = time.monotonic()
        try:
            raw = self.client.complete(
                system_prompt=modified_prompt,
                user_prompt=user_prompt,
                response_model=SynthesisResult,
                modality=modality,
                model_override=model_override,
            )
            elapsed_synth = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed_synth = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )
        # Parse synthesis response. Broad catch is deliberate: parse_response
        # may raise pydantic validation errors as well as JSON/Value errors.
        # (Original listed (JSONDecodeError, ValueError, Exception) — the
        # tuple was redundant since Exception subsumes both.)
        raw_text = str(raw).strip()
        try:
            synthesis = self.client.parse_response(raw_text, SynthesisResult)
        except Exception as exc:
            logger.error("Malformed synthesis response: %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=f"Malformed synthesis response: {exc}. Raw excerpt: {raw_text[:200]}",
            )
        if not synthesis.pages:
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error="Synthesis returned no pages.",
            )
        # Score the first page
        page = synthesis.pages[0]
        page_json = {
            "title": page.title,
            "creator_name": creator_name,
            "summary": page.summary,
            "body_sections": [
                {"heading": heading, "content": content}
                for heading, content in page.body_sections.items()
            ],
        }
        print(f" Synthesis complete ({elapsed_synth}s). Scoring...")
        result = self.score_page(page_json, moments)
        # Include synthesis time in total
        result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
        return result

    def print_report(self, result: ScoreResult, stage: int = 5) -> None:
        """Print a formatted scoring report to stdout."""
        dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys())
        stage_label = f"STAGE {stage}" if stage in STAGE_CONFIGS else "QUALITY"
        print("\n" + "=" * 60)
        print(f" {stage_label} QUALITY SCORE REPORT")
        print("=" * 60)
        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return
        for dim in dims:
            score = result.scores.get(dim, 0.0)
            bar = self._score_bar(score)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            if justification:
                # Wrap justification at ~60 chars
                for line in self._wrap(justification, 56):
                    print(f" {line}")
        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")

    @staticmethod
    def _score_bar(score: float, width: int = 20) -> str:
        """Render a visual bar for a 0-1 score.

        Fix: the filled/empty glyphs had been lost (both literals were
        empty strings, so the bar always rendered as ""). Restored to
        block characters — presumably the original glyphs; confirm
        against the pre-mangling revision.
        """
        filled = int(score * width)
        return "█" * filled + "░" * (width - filled)

    @staticmethod
    def _wrap(text: str, width: int) -> list[str]:
        """Simple greedy word wrap; returns [] for empty/whitespace text."""
        words = text.split()
        lines: list[str] = []
        current = ""
        for word in words:
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return lines