"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions.

Evaluates a synthesized technique page against source moments on:

1. Structural quality — section naming, count, paragraph depth
2. Content specificity — concrete details vs vague generalities
3. Voice preservation — direct quotes, attributed opinions, personality
4. Readability / flow — synthesis quality, logical ordering, no redundancy
5. Factual fidelity — no hallucinated specifics, grounded in source moments

Run via: python -m pipeline.quality score --file <path>
"""

from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass, field

import openai
from pydantic import BaseModel

from pipeline.llm_client import LLMClient

logger = logging.getLogger(__name__)

# ── Scoring rubric (hardcoded for iteration speed) ───────────────────────────
# NOTE(review): the numeric placeholders in the JSON template below
# (`<float 0.0-1.0>`) were stripped in an earlier formatting pass and have been
# restored — confirm against the original prompt if judge output drifts.

SCORING_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education.

You will be given:
1. A synthesized technique page (JSON with title, summary, body_sections)
2. The source key moments (transcript excerpts, summaries, tags) used to create it

Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0:

**structural** — Section naming and organization
- 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section
- 0.5-0.7: Acceptable structure but some generic section names or uneven depth
- 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections

**content_specificity** — Concrete technical details
- 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values
- 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings")
- 0.0-0.3: Mostly vague generalities with few concrete values from the source material

**voice_preservation** — Creator's authentic voice
- 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained
- 0.5-0.7: Some paraphrased references to creator's views but few direct quotes
- 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution

**readability** — Synthesis quality and flow
- 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction
- 0.5-0.7: Generally readable but some awkward transitions or minor repetition
- 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages

**factual_fidelity** — Grounded in source material
- 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques
- 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources

Return ONLY a JSON object with this exact structure:
{
  "structural": <float 0.0-1.0>,
  "content_specificity": <float 0.0-1.0>,
  "voice_preservation": <float 0.0-1.0>,
  "readability": <float 0.0-1.0>,
  "factual_fidelity": <float 0.0-1.0>,
  "justifications": {
    "structural": "<1-2 sentence justification>",
    "content_specificity": "<1-2 sentence justification>",
    "voice_preservation": "<1-2 sentence justification>",
    "readability": "<1-2 sentence justification>",
    "factual_fidelity": "<1-2 sentence justification>"
  }
}
"""

# Canonical dimension names, in report order. _parse_scores and print_report
# both iterate this list, so adding a dimension means updating the rubric,
# this list, and the ScoreResult fields together.
DIMENSIONS = [
    "structural",
    "content_specificity",
    "voice_preservation",
    "readability",
    "factual_fidelity",
]


# ── Result type ──────────────────────────────────────────────────────────────

@dataclass
class ScoreResult:
    """Outcome of scoring a technique page across 5 quality dimensions."""

    # Per-dimension scores, each clamped to [0.0, 1.0].
    structural: float = 0.0
    content_specificity: float = 0.0
    voice_preservation: float = 0.0
    readability: float = 0.0
    factual_fidelity: float = 0.0
    # Unweighted mean of the 5 dimension scores, rounded to 3 places.
    composite: float = 0.0
    # dimension name -> 1-2 sentence judge justification (or a placeholder
    # string when the judge omitted / returned an invalid value).
    justifications: dict[str, str] = field(default_factory=dict)
    # Wall-clock seconds spent in the LLM call, rounded to 2 places.
    elapsed_seconds: float = 0.0
    # Human-readable failure description; None on success. When set, the
    # score fields are all defaults and must not be interpreted.
    error: str | None = None


# ── Runner ───────────────────────────────────────────────────────────────────

class ScoreRunner:
    """Scores a Stage 5 technique page using LLM-as-judge evaluation."""

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    def score_page(
        self,
        page_json: dict,
        moments: list[dict],
    ) -> ScoreResult:
        """Evaluate a technique page against source moments.

        Parameters
        ----------
        page_json:
            Synthesized page dict (title, summary, body_sections).
        moments:
            Source key moments with transcript_excerpt, summary, etc.

        Returns
        -------
        ScoreResult with per-dimension scores and justifications, or with
        ``error`` set when the endpoint is unreachable or the judge response
        cannot be parsed.
        """
        # Build the user prompt with the page and source moments
        user_prompt = (
            "## Synthesized Technique Page\n\n"
            f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n"
            "## Source Key Moments\n\n"
            f"```json\n{json.dumps(moments, indent=2)}\n```\n\n"
            "Score this page across all 5 dimensions."
        )

        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt=SCORING_RUBRIC,
                user_prompt=user_prompt,
                response_model=BaseModel,  # triggers JSON mode
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            # Connectivity failures get a descriptive error rather than a
            # traceback; other API errors (auth, rate limit) still propagate.
            elapsed = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )

        # Parse the LLM judge response. Judges frequently wrap their JSON in
        # a markdown code fence despite instructions — strip it first so a
        # well-formed-but-fenced response does not count as malformed.
        raw_text = self._strip_fences(str(resp))
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed judge response (not JSON): %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
            )
        if not isinstance(parsed, dict):
            # Valid JSON but not an object (e.g. a bare list or string) —
            # previously this crashed _parse_scores with AttributeError.
            logger.error("Judge response is not a JSON object: %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Judge response is not a JSON object. Raw excerpt: {raw_text[:200]}",
            )

        return self._parse_scores(parsed, elapsed)

    @staticmethod
    def _strip_fences(text: str) -> str:
        """Remove one surrounding markdown code fence (``` or ```json), if present."""
        stripped = text.strip()
        if stripped.startswith("```"):
            # Drop the opening fence line (which may carry a language tag).
            stripped = stripped.split("\n", 1)[1] if "\n" in stripped else ""
            if stripped.rstrip().endswith("```"):
                stripped = stripped.rstrip()[:-3]
        return stripped.strip()

    def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult:
        """Extract and validate scores from parsed JSON response.

        Missing or non-numeric dimensions score 0.0 (with a placeholder
        justification) and are still included in the composite average, so a
        partially-valid judge response is penalized rather than rejected.
        """
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}

        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}

        for dim in DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue
            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue
            justifications[dim] = str(raw_justifications.get(dim, ""))

        composite = sum(scores.values()) / len(DIMENSIONS)

        return ScoreResult(
            structural=scores["structural"],
            content_specificity=scores["content_specificity"],
            voice_preservation=scores["voice_preservation"],
            readability=scores["readability"],
            factual_fidelity=scores["factual_fidelity"],
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def print_report(self, result: ScoreResult) -> None:
        """Print a formatted scoring report to stdout."""
        print("\n" + "=" * 60)
        print("  STAGE 5 QUALITY SCORE REPORT")
        print("=" * 60)

        if result.error:
            print(f"\n  ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return

        for dim in DIMENSIONS:
            score = getattr(result, dim)
            bar = self._score_bar(score)
            justification = result.justifications.get(dim, "")
            print(f"\n  {dim.replace('_', ' ').title()}")
            print(f"    Score: {score:.2f}  {bar}")
            if justification:
                # Wrap justification at ~60 chars
                for line in self._wrap(justification, 56):
                    print(f"    {line}")

        print("\n" + "-" * 60)
        print(f"  Composite: {result.composite:.3f}")
        print(f"  Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")

    @staticmethod
    def _score_bar(score: float, width: int = 20) -> str:
        """Render a visual bar for a 0-1 score."""
        filled = int(score * width)
        return "█" * filled + "░" * (width - filled)

    @staticmethod
    def _wrap(text: str, width: int) -> list[str]:
        """Simple word wrap (a word longer than ``width`` stays on its own line)."""
        words = text.split()
        lines: list[str] = []
        current = ""
        for word in words:
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return lines