"""Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics. Supports stages 2-5, each with its own scoring dimensions, rubric, format markers, fixture key requirements, prompt file name, and output schema. Run via: python -m pipeline.quality score --file """ from __future__ import annotations import json import logging import sys import time from dataclasses import dataclass, field from typing import Any import openai from pydantic import BaseModel from pipeline.llm_client import LLMClient from pipeline.quality.voice_dial import VoiceDial logger = logging.getLogger(__name__) # ── Per-stage configuration registry ───────────────────────────────────────── class StageConfig: """Configuration for scoring a specific pipeline stage.""" def __init__( self, stage: int, dimensions: list[str], rubric: str, format_markers: list[str], fixture_keys: list[str], prompt_file: str, schema_class: str, ) -> None: self.stage = stage self.dimensions = dimensions self.rubric = rubric self.format_markers = format_markers self.fixture_keys = fixture_keys self.prompt_file = prompt_file self.schema_class = schema_class def get_schema(self) -> type[BaseModel]: """Import and return the Pydantic schema class for this stage.""" from pipeline import schemas return getattr(schemas, self.schema_class) # ── Stage rubrics ──────────────────────────────────────────────────────────── _STAGE_2_RUBRIC = """\ You are an expert evaluator of transcript segmentation quality for educational content. You will be given: 1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary) 2. The source transcript segments used as input Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0: **coverage_completeness** — All transcript content accounted for - 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps - 0.5-0.7: Minor gaps or overlaps, but most content is covered - 0.0-0.3: Large gaps — significant transcript segments are not assigned to any topic **topic_specificity** — Topic labels are descriptive and useful - 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing") - 0.5-0.7: Labels are somewhat specific but could be more descriptive - 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio") **boundary_accuracy** — Segment boundaries align with actual topic transitions - 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units - 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics - 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content **summary_quality** — Summaries accurately describe segment content - 0.9-1.0: Summaries capture the key points of each segment concisely and accurately - 0.5-0.7: Summaries are acceptable but miss some key points or are too vague - 0.0-0.3: Summaries are inaccurate, too generic, or missing Return ONLY a JSON object with this exact structure: { "coverage_completeness": , "topic_specificity": , "boundary_accuracy": , "summary_quality": , "justifications": { "coverage_completeness": "<1-2 sentence justification>", "topic_specificity": "<1-2 sentence justification>", "boundary_accuracy": "<1-2 sentence justification>", "summary_quality": "<1-2 sentence justification>" } } """ _STAGE_3_RUBRIC = """\ You are an expert evaluator of key moment extraction quality for educational content. You will be given: 1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript) 2. The source topic segments used as input Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0: **moment_richness** — Extracted moments capture substantial, distinct insights - 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary - 0.5-0.7: Moments are valid but some are thin or overlap significantly with others - 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed **timestamp_accuracy** — Time ranges are plausible and well-bounded - 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans - 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow - 0.0-0.3: Timestamps appear arbitrary or many are zero/identical **content_type_correctness** — Content types match the actual moment content - 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it - 0.5-0.7: Most are correct but 1-2 are miscategorized - 0.0-0.3: Content types seem randomly assigned or all the same **summary_actionability** — Summaries provide actionable, specific information - 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow - 0.5-0.7: Summaries describe the topic but lack specific actionable details - 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information **plugin_normalization** — Plugin/tool names are correctly identified and normalized - 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools - 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed - 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors Return ONLY a JSON object with this exact structure: { "moment_richness": , "timestamp_accuracy": , "content_type_correctness": , "summary_actionability": , "plugin_normalization": , "justifications": { "moment_richness": "<1-2 sentence justification>", "timestamp_accuracy": "<1-2 sentence justification>", "content_type_correctness": "<1-2 sentence justification>", "summary_actionability": "<1-2 sentence justification>", "plugin_normalization": "<1-2 sentence justification>" } } """ _STAGE_4_RUBRIC = """\ You are an expert evaluator of content classification quality for educational content. You will be given: 1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags) 2. The source extracted moments used as input Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0: **category_accuracy** — Topic categories are appropriate and meaningful - 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels - 0.5-0.7: Most categories are reasonable but some are too broad or slightly off - 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same **tag_completeness** — All relevant tags are captured - 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively - 0.5-0.7: Main tags are present but secondary concepts or tools are missed - 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments **tag_specificity** — Tags are specific enough to be useful for search/filtering - 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing") - 0.5-0.7: Mix of specific and generic tags - 0.0-0.3: Tags are too generic to meaningfully distinguish moments **coverage** — All moments are classified - 0.9-1.0: Every moment_index from the input has a corresponding classification entry - 0.5-0.7: Most moments classified but 1-2 are missing - 0.0-0.3: Many moments are not classified Return ONLY a JSON object with this exact structure: { "category_accuracy": , "tag_completeness": , "tag_specificity": , "coverage": , "justifications": { "category_accuracy": "<1-2 sentence justification>", "tag_completeness": "<1-2 sentence justification>", "tag_specificity": "<1-2 sentence justification>", "coverage": "<1-2 sentence justification>" } } """ _STAGE_5_RUBRIC = """\ You are an expert evaluator of synthesized technique articles for music production education. You will be given: 1. A synthesized technique page (JSON with title, summary, body_sections) 2. The source key moments (transcript excerpts, summaries, tags) used to create it Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0: **structural** — Section naming and organization - 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section - 0.5-0.7: Acceptable structure but some generic section names or uneven depth - 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections **content_specificity** — Concrete technical details - 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values - 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings") - 0.0-0.3: Mostly vague generalities with few concrete values from the source material **voice_preservation** — Creator's authentic voice - 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained - 0.5-0.7: Some paraphrased references to creator's views but few direct quotes - 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution **readability** — Synthesis quality and flow - 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction - 0.5-0.7: Generally readable but some awkward transitions or minor repetition - 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages **factual_fidelity** — Grounded in source material - 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques - 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources - 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources Return ONLY a JSON object with this exact structure: { "structural": , "content_specificity": , "voice_preservation": , "readability": , "factual_fidelity": , "justifications": { "structural": "<1-2 sentence justification>", "content_specificity": "<1-2 sentence justification>", "voice_preservation": "<1-2 sentence justification>", "readability": "<1-2 sentence justification>", "factual_fidelity": "<1-2 sentence justification>" } } """ # Backward-compat alias used by synthesize_and_score and external references SCORING_RUBRIC = _STAGE_5_RUBRIC # Build the stage configs registry STAGE_CONFIGS: dict[int, StageConfig] = { 2: StageConfig( stage=2, dimensions=["coverage_completeness", "topic_specificity", "boundary_accuracy", "summary_quality"], rubric=_STAGE_2_RUBRIC, format_markers=["segments", "start_index", "end_index", "topic_label"], fixture_keys=["transcript_segments"], prompt_file="stage2_segmentation.txt", schema_class="SegmentationResult", ), 3: StageConfig( stage=3, dimensions=["moment_richness", "timestamp_accuracy", "content_type_correctness", "summary_actionability", "plugin_normalization"], rubric=_STAGE_3_RUBRIC, format_markers=["moments", "content_type", "raw_transcript", "plugins"], fixture_keys=["topic_segments"], prompt_file="stage3_extraction.txt", schema_class="ExtractionResult", ), 4: StageConfig( stage=4, dimensions=["category_accuracy", "tag_completeness", "tag_specificity", "coverage"], rubric=_STAGE_4_RUBRIC, format_markers=["classifications", "moment_index", "topic_category", "topic_tags"], fixture_keys=["extracted_moments"], prompt_file="stage4_classification.txt", schema_class="ClassificationResult", ), 5: StageConfig( stage=5, dimensions=["structural", "content_specificity", "voice_preservation", "readability", "factual_fidelity"], rubric=SCORING_RUBRIC, format_markers=["SynthesisResult", '"pages"', "body_sections", "title", "summary"], fixture_keys=["key_moments", "creator_name"], prompt_file="stage5_synthesis.txt", schema_class="SynthesisResult", ), } # Backward-compatible alias: stage 5 dimensions list DIMENSIONS = STAGE_CONFIGS[5].dimensions # ── Result type ────────────────────────────────────────────────────────────── @dataclass class ScoreResult: """Outcome of scoring a stage output across quality dimensions. Uses a generic ``scores`` dict keyed by dimension name. Stage 5's original named fields (structural, content_specificity, …) are preserved as properties for backward compatibility. """ scores: dict[str, float] = field(default_factory=dict) composite: float = 0.0 justifications: dict[str, str] = field(default_factory=dict) elapsed_seconds: float = 0.0 error: str | None = None # ── Backward-compat properties for stage 5 named dimensions ────── @property def structural(self) -> float: return self.scores.get("structural", 0.0) @property def content_specificity(self) -> float: return self.scores.get("content_specificity", 0.0) @property def voice_preservation(self) -> float: return self.scores.get("voice_preservation", 0.0) @property def readability(self) -> float: return self.scores.get("readability", 0.0) @property def factual_fidelity(self) -> float: return self.scores.get("factual_fidelity", 0.0) # ── Runner ─────────────────────────────────────────────────────────────────── class ScoreRunner: """Scores pipeline stage outputs using LLM-as-judge evaluation.""" def __init__(self, client: LLMClient) -> None: self.client = client # ── Generic stage scorer ───────────────────────────────────────────── def score_stage_output( self, stage: int, output_json: dict | list, input_json: dict | list, ) -> ScoreResult: """Score an arbitrary stage's output against its input. Parameters ---------- stage: Pipeline stage number (2-5). output_json: The stage output to evaluate (parsed JSON). input_json: The stage input / source material. Returns ------- ScoreResult with per-dimension scores for the requested stage. """ if stage not in STAGE_CONFIGS: return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}") cfg = STAGE_CONFIGS[stage] user_prompt = ( "## Stage Output\n\n" f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n" "## Stage Input\n\n" f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n" f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions." ) t0 = time.monotonic() try: resp = self.client.complete( system_prompt=cfg.rubric, user_prompt=user_prompt, response_model=BaseModel, modality="chat", ) elapsed = round(time.monotonic() - t0, 2) except (openai.APIConnectionError, openai.APITimeoutError) as exc: elapsed = round(time.monotonic() - t0, 2) url = self.client.settings.llm_api_url fallback = self.client.settings.llm_fallback_url return ScoreResult( elapsed_seconds=elapsed, error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}", ) raw_text = str(resp).strip() try: parsed = json.loads(raw_text) except json.JSONDecodeError: logger.error("Malformed judge response (not JSON): %.300s", raw_text) return ScoreResult( elapsed_seconds=elapsed, error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}", ) return self._parse_scores(parsed, elapsed, cfg.dimensions) # ── Stage 5 convenience (backward compat) ──────────────────────────── def score_page( self, page_json: dict, moments: list[dict], ) -> ScoreResult: """Evaluate a stage 5 technique page against source moments.""" return self.score_stage_output( stage=5, output_json=page_json, input_json=moments, ) return self._parse_scores(parsed, elapsed) def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult: """Extract and validate scores from parsed JSON response.""" dims = dimensions or DIMENSIONS scores: dict[str, float] = {} justifications: dict[str, str] = {} raw_justifications = parsed.get("justifications", {}) if not isinstance(raw_justifications, dict): raw_justifications = {} for dim in dims: raw = parsed.get(dim) if raw is None: logger.warning("Missing dimension '%s' in judge response", dim) scores[dim] = 0.0 justifications[dim] = "(missing from judge response)" continue try: val = float(raw) scores[dim] = max(0.0, min(1.0, val)) # clamp except (TypeError, ValueError): logger.warning("Invalid value for '%s': %r", dim, raw) scores[dim] = 0.0 justifications[dim] = f"(invalid value: {raw!r})" continue justifications[dim] = str(raw_justifications.get(dim, "")) composite = sum(scores.values()) / len(dims) if dims else 0.0 return ScoreResult( scores=scores, composite=round(composite, 3), justifications=justifications, elapsed_seconds=elapsed, ) def synthesize_and_score( self, moments: list[dict], creator_name: str, voice_level: float, ) -> ScoreResult: """Re-synthesize from source moments with a voice-dialed prompt, then score. Loads the stage 5 synthesis prompt from disk, applies the VoiceDial modifier at the given voice_level, calls the LLM to produce a SynthesisResult, then scores the first page. Parameters ---------- moments: Source key moments (dicts with summary, transcript_excerpt, etc.) creator_name: Creator name to inject into the synthesis prompt. voice_level: Float 0.0–1.0 controlling voice preservation intensity. Returns ------- ScoreResult with per-dimension scores after voice-dialed re-synthesis. """ from pipeline.schemas import SynthesisResult from pipeline.stages import _get_stage_config, _load_prompt # Load and modify the stage 5 system prompt try: base_prompt = _load_prompt("stage5_synthesis.txt") except FileNotFoundError as exc: return ScoreResult(error=f"Prompt file not found: {exc}") dial = VoiceDial(base_prompt) modified_prompt = dial.modify(voice_level) band = dial.band_name(voice_level) # Build user prompt in the same format as _synthesize_chunk moments_json = json.dumps(moments, indent=2) user_prompt = f"{creator_name}\n\n{moments_json}\n" model_override, modality = _get_stage_config(5) print(f" Re-synthesizing at voice_level={voice_level} (band={band})...") t0 = time.monotonic() try: raw = self.client.complete( system_prompt=modified_prompt, user_prompt=user_prompt, response_model=SynthesisResult, modality=modality, model_override=model_override, ) elapsed_synth = round(time.monotonic() - t0, 2) except (openai.APIConnectionError, openai.APITimeoutError) as exc: elapsed_synth = round(time.monotonic() - t0, 2) url = self.client.settings.llm_api_url fallback = self.client.settings.llm_fallback_url return ScoreResult( elapsed_seconds=elapsed_synth, error=( f"Cannot reach LLM endpoint at {url} (fallback {fallback}). " f"Error: {exc}" ), ) # Parse synthesis response raw_text = str(raw).strip() try: synthesis = self.client.parse_response(raw_text, SynthesisResult) except (json.JSONDecodeError, ValueError, Exception) as exc: logger.error("Malformed synthesis response: %.300s", raw_text) return ScoreResult( elapsed_seconds=elapsed_synth, error=f"Malformed synthesis response: {exc}. Raw excerpt: {raw_text[:200]}", ) if not synthesis.pages: return ScoreResult( elapsed_seconds=elapsed_synth, error="Synthesis returned no pages.", ) # Score the first page page = synthesis.pages[0] page_json = { "title": page.title, "creator_name": creator_name, "summary": page.summary, "body_sections": [ {"heading": heading, "content": content} for heading, content in page.body_sections.items() ], } print(f" Synthesis complete ({elapsed_synth}s). Scoring...") result = self.score_page(page_json, moments) # Include synthesis time in total result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2) return result def print_report(self, result: ScoreResult, stage: int = 5) -> None: """Print a formatted scoring report to stdout.""" dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys()) stage_label = f"STAGE {stage}" if stage in STAGE_CONFIGS else "QUALITY" print("\n" + "=" * 60) print(f" {stage_label} QUALITY SCORE REPORT") print("=" * 60) if result.error: print(f"\n ✗ Error: {result.error}\n") print("=" * 60 + "\n") return for dim in dims: score = result.scores.get(dim, 0.0) bar = self._score_bar(score) justification = result.justifications.get(dim, "") print(f"\n {dim.replace('_', ' ').title()}") print(f" Score: {score:.2f} {bar}") if justification: # Wrap justification at ~60 chars for line in self._wrap(justification, 56): print(f" {line}") print("\n" + "-" * 60) print(f" Composite: {result.composite:.3f}") print(f" Time: {result.elapsed_seconds}s") print("=" * 60 + "\n") @staticmethod def _score_bar(score: float, width: int = 20) -> str: """Render a visual bar for a 0-1 score.""" filled = int(score * width) return "█" * filled + "░" * (width - filled) @staticmethod def _wrap(text: str, width: int) -> list[str]: """Simple word wrap.""" words = text.split() lines: list[str] = [] current = "" for word in words: if current and len(current) + len(word) + 1 > width: lines.append(current) current = word else: current = f"{current} {word}" if current else word if current: lines.append(current) return lines