chrysopedia/backend/pipeline/quality/scorer.py
jlightner e740798f7c feat: Added STAGE_CONFIGS registry (stages 2-5) with per-stage rubrics,…
- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/variant_generator.py"

GSD-Task: S04/T01
2026-04-01 09:20:24 +00:00

614 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics.
Supports stages 2-5, each with its own scoring dimensions, rubric, format
markers, fixture key requirements, prompt file name, and output schema.
Run via: python -m pipeline.quality score --file <path>
"""
from __future__ import annotations
import json
import logging
import sys
import time
from dataclasses import dataclass, field
from typing import Any
import openai
from pydantic import BaseModel
from pipeline.llm_client import LLMClient
from pipeline.quality.voice_dial import VoiceDial
logger = logging.getLogger(__name__)
# ── Per-stage configuration registry ─────────────────────────────────────────
class StageConfig:
    """Per-stage scoring configuration bundle.

    One instance carries everything the scorer needs for a single
    pipeline stage: the dimension names to score, the judge rubric,
    format markers expected in stage output, required fixture keys,
    the prompt file name, and the name of the Pydantic output schema.
    """

    def __init__(
        self,
        stage: int,
        dimensions: list[str],
        rubric: str,
        format_markers: list[str],
        fixture_keys: list[str],
        prompt_file: str,
        schema_class: str,
    ) -> None:
        # Everything is stored verbatim; schema resolution is deferred
        # to get_schema() so module import stays cheap.
        self.stage = stage
        self.rubric = rubric
        self.dimensions = dimensions
        self.fixture_keys = fixture_keys
        self.format_markers = format_markers
        self.prompt_file = prompt_file
        self.schema_class = schema_class

    def get_schema(self) -> type[BaseModel]:
        """Resolve and return this stage's Pydantic schema class by name."""
        # Imported lazily to avoid a circular import at module load time.
        from pipeline import schemas

        return getattr(schemas, self.schema_class)
# ── Stage rubrics ────────────────────────────────────────────────────────────
# Stage 2 judge rubric: transcript segmentation quality, 4 dimensions.
# Used verbatim as the judge's system prompt; the trailing JSON template
# defines the exact response shape _parse_scores expects.
_STAGE_2_RUBRIC = """\
You are an expert evaluator of transcript segmentation quality for educational content.
You will be given:
1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary)
2. The source transcript segments used as input
Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0:
**coverage_completeness** — All transcript content accounted for
- 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps
- 0.5-0.7: Minor gaps or overlaps, but most content is covered
- 0.0-0.3: Large gaps — significant transcript segments are not assigned to any topic
**topic_specificity** — Topic labels are descriptive and useful
- 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing")
- 0.5-0.7: Labels are somewhat specific but could be more descriptive
- 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio")
**boundary_accuracy** — Segment boundaries align with actual topic transitions
- 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units
- 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics
- 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content
**summary_quality** — Summaries accurately describe segment content
- 0.9-1.0: Summaries capture the key points of each segment concisely and accurately
- 0.5-0.7: Summaries are acceptable but miss some key points or are too vague
- 0.0-0.3: Summaries are inaccurate, too generic, or missing
Return ONLY a JSON object with this exact structure:
{
"coverage_completeness": <float 0.0-1.0>,
"topic_specificity": <float 0.0-1.0>,
"boundary_accuracy": <float 0.0-1.0>,
"summary_quality": <float 0.0-1.0>,
"justifications": {
"coverage_completeness": "<1-2 sentence justification>",
"topic_specificity": "<1-2 sentence justification>",
"boundary_accuracy": "<1-2 sentence justification>",
"summary_quality": "<1-2 sentence justification>"
}
}
"""
# Stage 3 judge rubric: key moment extraction quality, 5 dimensions.
_STAGE_3_RUBRIC = """\
You are an expert evaluator of key moment extraction quality for educational content.
You will be given:
1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript)
2. The source topic segments used as input
Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0:
**moment_richness** — Extracted moments capture substantial, distinct insights
- 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary
- 0.5-0.7: Moments are valid but some are thin or overlap significantly with others
- 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed
**timestamp_accuracy** — Time ranges are plausible and well-bounded
- 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans
- 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow
- 0.0-0.3: Timestamps appear arbitrary or many are zero/identical
**content_type_correctness** — Content types match the actual moment content
- 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it
- 0.5-0.7: Most are correct but 1-2 are miscategorized
- 0.0-0.3: Content types seem randomly assigned or all the same
**summary_actionability** — Summaries provide actionable, specific information
- 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow
- 0.5-0.7: Summaries describe the topic but lack specific actionable details
- 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information
**plugin_normalization** — Plugin/tool names are correctly identified and normalized
- 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools
- 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed
- 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors
Return ONLY a JSON object with this exact structure:
{
"moment_richness": <float 0.0-1.0>,
"timestamp_accuracy": <float 0.0-1.0>,
"content_type_correctness": <float 0.0-1.0>,
"summary_actionability": <float 0.0-1.0>,
"plugin_normalization": <float 0.0-1.0>,
"justifications": {
"moment_richness": "<1-2 sentence justification>",
"timestamp_accuracy": "<1-2 sentence justification>",
"content_type_correctness": "<1-2 sentence justification>",
"summary_actionability": "<1-2 sentence justification>",
"plugin_normalization": "<1-2 sentence justification>"
}
}
"""
# Stage 4 judge rubric: content classification quality, 4 dimensions.
_STAGE_4_RUBRIC = """\
You are an expert evaluator of content classification quality for educational content.
You will be given:
1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags)
2. The source extracted moments used as input
Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0:
**category_accuracy** — Topic categories are appropriate and meaningful
- 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels
- 0.5-0.7: Most categories are reasonable but some are too broad or slightly off
- 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same
**tag_completeness** — All relevant tags are captured
- 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively
- 0.5-0.7: Main tags are present but secondary concepts or tools are missed
- 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments
**tag_specificity** — Tags are specific enough to be useful for search/filtering
- 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing")
- 0.5-0.7: Mix of specific and generic tags
- 0.0-0.3: Tags are too generic to meaningfully distinguish moments
**coverage** — All moments are classified
- 0.9-1.0: Every moment_index from the input has a corresponding classification entry
- 0.5-0.7: Most moments classified but 1-2 are missing
- 0.0-0.3: Many moments are not classified
Return ONLY a JSON object with this exact structure:
{
"category_accuracy": <float 0.0-1.0>,
"tag_completeness": <float 0.0-1.0>,
"tag_specificity": <float 0.0-1.0>,
"coverage": <float 0.0-1.0>,
"justifications": {
"category_accuracy": "<1-2 sentence justification>",
"tag_completeness": "<1-2 sentence justification>",
"tag_specificity": "<1-2 sentence justification>",
"coverage": "<1-2 sentence justification>"
}
}
"""
# Stage 5 judge rubric: synthesized technique page quality, 5 dimensions.
# Aliased below as SCORING_RUBRIC for backward compatibility.
_STAGE_5_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education.
You will be given:
1. A synthesized technique page (JSON with title, summary, body_sections)
2. The source key moments (transcript excerpts, summaries, tags) used to create it
Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0:
**structural** — Section naming and organization
- 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section
- 0.5-0.7: Acceptable structure but some generic section names or uneven depth
- 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections
**content_specificity** — Concrete technical details
- 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values
- 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings")
- 0.0-0.3: Mostly vague generalities with few concrete values from the source material
**voice_preservation** — Creator's authentic voice
- 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained
- 0.5-0.7: Some paraphrased references to creator's views but few direct quotes
- 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution
**readability** — Synthesis quality and flow
- 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction
- 0.5-0.7: Generally readable but some awkward transitions or minor repetition
- 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages
**factual_fidelity** — Grounded in source material
- 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques
- 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources
Return ONLY a JSON object with this exact structure:
{
"structural": <float 0.0-1.0>,
"content_specificity": <float 0.0-1.0>,
"voice_preservation": <float 0.0-1.0>,
"readability": <float 0.0-1.0>,
"factual_fidelity": <float 0.0-1.0>,
"justifications": {
"structural": "<1-2 sentence justification>",
"content_specificity": "<1-2 sentence justification>",
"voice_preservation": "<1-2 sentence justification>",
"readability": "<1-2 sentence justification>",
"factual_fidelity": "<1-2 sentence justification>"
}
}
"""
# Backward-compat alias used by synthesize_and_score and external references
SCORING_RUBRIC = _STAGE_5_RUBRIC

# Build the stage configs registry
# Maps pipeline stage number (2-5) -> its scoring configuration.
STAGE_CONFIGS: dict[int, StageConfig] = {
    # Stage 2: transcript segmentation into topic segments.
    2: StageConfig(
        stage=2,
        dimensions=["coverage_completeness", "topic_specificity", "boundary_accuracy", "summary_quality"],
        rubric=_STAGE_2_RUBRIC,
        format_markers=["segments", "start_index", "end_index", "topic_label"],
        fixture_keys=["transcript_segments"],
        prompt_file="stage2_segmentation.txt",
        schema_class="SegmentationResult",
    ),
    # Stage 3: key moment extraction from topic segments.
    3: StageConfig(
        stage=3,
        dimensions=["moment_richness", "timestamp_accuracy", "content_type_correctness", "summary_actionability", "plugin_normalization"],
        rubric=_STAGE_3_RUBRIC,
        format_markers=["moments", "content_type", "raw_transcript", "plugins"],
        fixture_keys=["topic_segments"],
        prompt_file="stage3_extraction.txt",
        schema_class="ExtractionResult",
    ),
    # Stage 4: classification of extracted moments.
    4: StageConfig(
        stage=4,
        dimensions=["category_accuracy", "tag_completeness", "tag_specificity", "coverage"],
        rubric=_STAGE_4_RUBRIC,
        format_markers=["classifications", "moment_index", "topic_category", "topic_tags"],
        fixture_keys=["extracted_moments"],
        prompt_file="stage4_classification.txt",
        schema_class="ClassificationResult",
    ),
    # Stage 5: synthesis of technique pages from key moments.
    # Rubric is passed via the SCORING_RUBRIC alias (same object as
    # _STAGE_5_RUBRIC).
    5: StageConfig(
        stage=5,
        dimensions=["structural", "content_specificity", "voice_preservation", "readability", "factual_fidelity"],
        rubric=SCORING_RUBRIC,
        format_markers=["SynthesisResult", '"pages"', "body_sections", "title", "summary"],
        fixture_keys=["key_moments", "creator_name"],
        prompt_file="stage5_synthesis.txt",
        schema_class="SynthesisResult",
    ),
}

# Backward-compatible alias: stage 5 dimensions list
DIMENSIONS = STAGE_CONFIGS[5].dimensions
# ── Result type ──────────────────────────────────────────────────────────────
@dataclass
class ScoreResult:
    """Outcome of scoring a stage output across quality dimensions.

    Scores live in a generic ``scores`` dict keyed by dimension name;
    stage 5's historical named fields (structural, content_specificity,
    ...) remain available as read-only properties for older callers.
    """

    scores: dict[str, float] = field(default_factory=dict)
    composite: float = 0.0
    justifications: dict[str, str] = field(default_factory=dict)
    elapsed_seconds: float = 0.0
    error: str | None = None

    def _dim(self, name: str) -> float:
        """Look up a single dimension score, defaulting to 0.0 if absent."""
        return self.scores.get(name, 0.0)

    # ── Backward-compat read-only views of the stage 5 dimensions ────
    @property
    def structural(self) -> float:
        return self._dim("structural")

    @property
    def content_specificity(self) -> float:
        return self._dim("content_specificity")

    @property
    def voice_preservation(self) -> float:
        return self._dim("voice_preservation")

    @property
    def readability(self) -> float:
        return self._dim("readability")

    @property
    def factual_fidelity(self) -> float:
        return self._dim("factual_fidelity")
# ── Runner ───────────────────────────────────────────────────────────────────
class ScoreRunner:
    """Scores pipeline stage outputs using LLM-as-judge evaluation.

    Errors (unreachable endpoint, malformed judge replies) are reported
    via ``ScoreResult.error`` instead of being raised to callers.
    """

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    # ── Generic stage scorer ─────────────────────────────────────────────
    def score_stage_output(
        self,
        stage: int,
        output_json: dict | list,
        input_json: dict | list,
    ) -> ScoreResult:
        """Score an arbitrary stage's output against its input.

        Parameters
        ----------
        stage:
            Pipeline stage number (2-5).
        output_json:
            The stage output to evaluate (parsed JSON).
        input_json:
            The stage input / source material.

        Returns
        -------
        ScoreResult with per-dimension scores for the requested stage.
        """
        if stage not in STAGE_CONFIGS:
            return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}")
        cfg = STAGE_CONFIGS[stage]
        user_prompt = (
            "## Stage Output\n\n"
            f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n"
            "## Stage Input\n\n"
            f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n"
            f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions."
        )
        t0 = time.monotonic()
        try:
            # NOTE(review): response_model=BaseModel appears to request an
            # unstructured reply (the text is json.loads-ed below) — confirm
            # against LLMClient.complete's contract.
            resp = self.client.complete(
                system_prompt=cfg.rubric,
                user_prompt=user_prompt,
                response_model=BaseModel,
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}",
            )
        raw_text = str(resp).strip()
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed judge response (not JSON): %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
            )
        return self._parse_scores(parsed, elapsed, cfg.dimensions)

    # ── Stage 5 convenience (backward compat) ────────────────────────────
    def score_page(
        self,
        page_json: dict,
        moments: list[dict],
    ) -> ScoreResult:
        """Evaluate a stage 5 technique page against source moments."""
        # Fix: removed an unreachable second `return self._parse_scores(...)`
        # that followed this return and referenced names never bound here.
        return self.score_stage_output(
            stage=5,
            output_json=page_json,
            input_json=moments,
        )

    def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult:
        """Extract and validate scores from a parsed judge JSON response.

        Missing dimensions score 0.0 with a placeholder justification;
        non-numeric values score 0.0; numeric values are clamped to
        [0.0, 1.0]. Composite is the unweighted mean over ``dimensions``
        (defaults to the stage 5 DIMENSIONS list).
        """
        dims = dimensions or DIMENSIONS
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}
        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}
        for dim in dims:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue
            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp to [0, 1]
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue
            justifications[dim] = str(raw_justifications.get(dim, ""))
        composite = sum(scores.values()) / len(dims) if dims else 0.0
        return ScoreResult(
            scores=scores,
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def synthesize_and_score(
        self,
        moments: list[dict],
        creator_name: str,
        voice_level: float,
    ) -> ScoreResult:
        """Re-synthesize from source moments with a voice-dialed prompt, then score.

        Loads the stage 5 synthesis prompt from disk, applies the VoiceDial
        modifier at the given voice_level, calls the LLM to produce a
        SynthesisResult, then scores the first page.

        Parameters
        ----------
        moments:
            Source key moments (dicts with summary, transcript_excerpt, etc.)
        creator_name:
            Creator name to inject into the synthesis prompt.
        voice_level:
            Float 0.0-1.0 controlling voice preservation intensity.

        Returns
        -------
        ScoreResult with per-dimension scores after voice-dialed re-synthesis.
        ``elapsed_seconds`` includes both synthesis and scoring time.
        """
        from pipeline.schemas import SynthesisResult
        from pipeline.stages import _get_stage_config, _load_prompt

        # Load and modify the stage 5 system prompt
        try:
            base_prompt = _load_prompt("stage5_synthesis.txt")
        except FileNotFoundError as exc:
            return ScoreResult(error=f"Prompt file not found: {exc}")
        dial = VoiceDial(base_prompt)
        modified_prompt = dial.modify(voice_level)
        band = dial.band_name(voice_level)
        # Build user prompt in the same format as _synthesize_chunk
        moments_json = json.dumps(moments, indent=2)
        user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_json}\n</moments>"
        model_override, modality = _get_stage_config(5)
        print(f" Re-synthesizing at voice_level={voice_level} (band={band})...")
        t0 = time.monotonic()
        try:
            raw = self.client.complete(
                system_prompt=modified_prompt,
                user_prompt=user_prompt,
                response_model=SynthesisResult,
                modality=modality,
                model_override=model_override,
            )
            elapsed_synth = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed_synth = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )
        # Parse synthesis response. Broad catch is deliberate: parse_response
        # may raise pydantic validation errors as well as JSON/Value errors.
        # (Original listed (JSONDecodeError, ValueError, Exception) — the
        # tuple was redundant since Exception subsumes both.)
        raw_text = str(raw).strip()
        try:
            synthesis = self.client.parse_response(raw_text, SynthesisResult)
        except Exception as exc:
            logger.error("Malformed synthesis response: %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=f"Malformed synthesis response: {exc}. Raw excerpt: {raw_text[:200]}",
            )
        if not synthesis.pages:
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error="Synthesis returned no pages.",
            )
        # Score the first page
        page = synthesis.pages[0]
        page_json = {
            "title": page.title,
            "creator_name": creator_name,
            "summary": page.summary,
            "body_sections": [
                {"heading": heading, "content": content}
                for heading, content in page.body_sections.items()
            ],
        }
        print(f" Synthesis complete ({elapsed_synth}s). Scoring...")
        result = self.score_page(page_json, moments)
        # Include synthesis time in total
        result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
        return result

    def print_report(self, result: ScoreResult, stage: int = 5) -> None:
        """Print a formatted scoring report to stdout."""
        dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys())
        stage_label = f"STAGE {stage}" if stage in STAGE_CONFIGS else "QUALITY"
        print("\n" + "=" * 60)
        print(f" {stage_label} QUALITY SCORE REPORT")
        print("=" * 60)
        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return
        for dim in dims:
            score = result.scores.get(dim, 0.0)
            bar = self._score_bar(score)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            if justification:
                # Wrap justification at ~60 chars
                for line in self._wrap(justification, 56):
                    print(f" {line}")
        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")

    @staticmethod
    def _score_bar(score: float, width: int = 20) -> str:
        """Render a visual bar for a 0-1 score.

        Fix: the filled/empty glyphs had been lost (both literals were
        empty strings, so the bar always rendered as ""). Restored to
        block characters — presumably the original glyphs; confirm
        against the pre-mangling revision.
        """
        filled = int(score * width)
        return "█" * filled + "░" * (width - filled)

    @staticmethod
    def _wrap(text: str, width: int) -> list[str]:
        """Simple greedy word wrap; returns [] for empty/whitespace text."""
        words = text.split()
        lines: list[str] = []
        current = ""
        for word in words:
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return lines