chrysopedia/backend/pipeline/quality/scorer.py
jlightner 15a7afdaff feat: Added VoiceDial class with 3-band prompt modification and ScoreRu…
- "backend/pipeline/quality/voice_dial.py"
- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/__main__.py"

GSD-Task: S02/T02
2026-04-01 08:57:07 +00:00

368 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions.
Evaluates a synthesized technique page against source moments on:
1. Structural quality — section naming, count, paragraph depth
2. Content specificity — concrete details vs vague generalities
3. Voice preservation — direct quotes, attributed opinions, personality
4. Readability / flow — synthesis quality, logical ordering, no redundancy
5. Factual fidelity — no hallucinated specifics, grounded in source moments
Run via: python -m pipeline.quality score --file <path>
"""
from __future__ import annotations
import json
import logging
import sys
import time
from dataclasses import dataclass, field
import openai
from pydantic import BaseModel
from pipeline.llm_client import LLMClient
from pipeline.quality.voice_dial import VoiceDial
logger = logging.getLogger(__name__)
# ── Scoring rubric (hardcoded for iteration speed) ───────────────────────────
# System prompt for the LLM judge. Defines five 0.0-1.0 dimensions with banded
# anchors and demands a strict JSON reply in exactly the shape that
# ScoreRunner._parse_scores expects (five floats + a "justifications" dict).
SCORING_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education.
You will be given:
1. A synthesized technique page (JSON with title, summary, body_sections)
2. The source key moments (transcript excerpts, summaries, tags) used to create it
Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0:
**structural** — Section naming and organization
- 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section
- 0.5-0.7: Acceptable structure but some generic section names or uneven depth
- 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections
**content_specificity** — Concrete technical details
- 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values
- 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings")
- 0.0-0.3: Mostly vague generalities with few concrete values from the source material
**voice_preservation** — Creator's authentic voice
- 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained
- 0.5-0.7: Some paraphrased references to creator's views but few direct quotes
- 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution
**readability** — Synthesis quality and flow
- 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction
- 0.5-0.7: Generally readable but some awkward transitions or minor repetition
- 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages
**factual_fidelity** — Grounded in source material
- 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques
- 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources
Return ONLY a JSON object with this exact structure:
{
"structural": <float 0.0-1.0>,
"content_specificity": <float 0.0-1.0>,
"voice_preservation": <float 0.0-1.0>,
"readability": <float 0.0-1.0>,
"factual_fidelity": <float 0.0-1.0>,
"justifications": {
"structural": "<1-2 sentence justification>",
"content_specificity": "<1-2 sentence justification>",
"voice_preservation": "<1-2 sentence justification>",
"readability": "<1-2 sentence justification>",
"factual_fidelity": "<1-2 sentence justification>"
}
}
"""
# Dimension keys in report order. Must stay in sync with both the rubric's
# JSON schema above and the ScoreResult field names (ScoreRunner.print_report
# reads them via getattr).
DIMENSIONS = [
    "structural",
    "content_specificity",
    "voice_preservation",
    "readability",
    "factual_fidelity",
]
# ── Result type ──────────────────────────────────────────────────────────────
@dataclass
class ScoreResult:
    """Outcome of scoring a technique page across 5 quality dimensions."""

    # Per-dimension scores; ScoreRunner._parse_scores clamps each to [0.0, 1.0]
    # and substitutes 0.0 for missing/invalid values.
    structural: float = 0.0
    content_specificity: float = 0.0
    voice_preservation: float = 0.0
    readability: float = 0.0
    factual_fidelity: float = 0.0
    # Unweighted mean of the five dimension scores, rounded to 3 places.
    composite: float = 0.0
    # Dimension name -> 1-2 sentence judge justification (placeholder text
    # such as "(missing from judge response)" when the judge omitted one).
    justifications: dict[str, str] = field(default_factory=dict)
    # Wall-clock seconds spent on the LLM call(s), rounded to 2 places.
    elapsed_seconds: float = 0.0
    # Set (with scores left at defaults) when the LLM call or parsing failed.
    error: str | None = None
# ── Runner ───────────────────────────────────────────────────────────────────
class ScoreRunner:
    """Scores a Stage 5 technique page using LLM-as-judge evaluation.

    Entry points
    ------------
    score_page:
        Judge an already-synthesized page against its source moments.
    synthesize_and_score:
        Re-synthesize a page with a voice-dialed prompt, then judge it.
    print_report:
        Pretty-print a ScoreResult to stdout.
    """

    def __init__(self, client: LLMClient) -> None:
        # Shared LLM client; its .settings is also read for error reporting.
        self.client = client

    def score_page(
        self,
        page_json: dict,
        moments: list[dict],
    ) -> ScoreResult:
        """Evaluate a technique page against source moments.

        Parameters
        ----------
        page_json:
            Synthesized page dict (title, summary, body_sections).
        moments:
            Source key moments with transcript_excerpt, summary, etc.

        Returns
        -------
        ScoreResult with per-dimension scores and justifications, or with
        ``error`` set when the LLM is unreachable or returns non-JSON.
        """
        # Present page and moments as fenced JSON so the judge sees exactly
        # the two artifacts the rubric's instructions refer to.
        user_prompt = (
            "## Synthesized Technique Page\n\n"
            f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n"
            "## Source Key Moments\n\n"
            f"```json\n{json.dumps(moments, indent=2)}\n```\n\n"
            "Score this page across all 5 dimensions."
        )
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt=SCORING_RUBRIC,
                user_prompt=user_prompt,
                # NOTE(review): the bare pydantic BaseModel appears to be a
                # sentinel that switches LLMClient into JSON mode — confirm
                # against LLMClient.complete before relying on it.
                response_model=BaseModel,  # triggers JSON mode
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )
        # The judge must return bare JSON; anything else becomes an error result.
        raw_text = str(resp).strip()
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed judge response (not JSON): %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
            )
        return self._parse_scores(parsed, elapsed)

    def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult:
        """Extract and validate scores from parsed JSON response.

        Missing or non-numeric dimensions score 0.0 with a placeholder
        justification; valid values are clamped to [0.0, 1.0].
        """
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}
        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            # Judge emitted a non-dict; drop it rather than crash on .get below.
            raw_justifications = {}
        for dim in DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue
            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp to [0, 1]
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue
            justifications[dim] = str(raw_justifications.get(dim, ""))
        # Composite is the unweighted mean across all five dimensions.
        composite = sum(scores.values()) / len(DIMENSIONS)
        return ScoreResult(
            structural=scores["structural"],
            content_specificity=scores["content_specificity"],
            voice_preservation=scores["voice_preservation"],
            readability=scores["readability"],
            factual_fidelity=scores["factual_fidelity"],
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def synthesize_and_score(
        self,
        moments: list[dict],
        creator_name: str,
        voice_level: float,
    ) -> ScoreResult:
        """Re-synthesize from source moments with a voice-dialed prompt, then score.

        Loads the stage 5 synthesis prompt from disk, applies the VoiceDial
        modifier at the given voice_level, calls the LLM to produce a
        SynthesisResult, then scores the first page.

        Parameters
        ----------
        moments:
            Source key moments (dicts with summary, transcript_excerpt, etc.)
        creator_name:
            Creator name to inject into the synthesis prompt.
        voice_level:
            Float 0.0-1.0 controlling voice preservation intensity.

        Returns
        -------
        ScoreResult with per-dimension scores after voice-dialed re-synthesis.
        """
        # Local imports avoid a circular dependency with the stages module.
        from pipeline.schemas import SynthesisResult
        from pipeline.stages import _get_stage_config, _load_prompt

        # Load and modify the stage 5 system prompt
        try:
            base_prompt = _load_prompt("stage5_synthesis.txt")
        except FileNotFoundError as exc:
            return ScoreResult(error=f"Prompt file not found: {exc}")
        dial = VoiceDial(base_prompt)
        modified_prompt = dial.modify(voice_level)
        band = dial.band_name(voice_level)
        # Build user prompt in the same format as _synthesize_chunk
        moments_json = json.dumps(moments, indent=2)
        user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_json}\n</moments>"
        model_override, modality = _get_stage_config(5)
        print(f"  Re-synthesizing at voice_level={voice_level} (band={band})...")
        t0 = time.monotonic()
        try:
            raw = self.client.complete(
                system_prompt=modified_prompt,
                user_prompt=user_prompt,
                response_model=SynthesisResult,
                modality=modality,
                model_override=model_override,
            )
            elapsed_synth = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed_synth = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )
        # Parse synthesis response. Deliberately broad catch: parse_response
        # may raise JSONDecodeError, ValueError, or pydantic validation errors.
        # (The original tuple listed those alongside Exception, which already
        # subsumes them — this is the same behavior, stated honestly.)
        raw_text = str(raw).strip()
        try:
            synthesis = self.client.parse_response(raw_text, SynthesisResult)
        except Exception as exc:
            logger.error("Malformed synthesis response: %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=f"Malformed synthesis response: {exc}. Raw excerpt: {raw_text[:200]}",
            )
        if not synthesis.pages:
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error="Synthesis returned no pages.",
            )
        # Score the first page only; shape mirrors the page JSON that
        # score_page expects (title / creator_name / summary / body_sections).
        page = synthesis.pages[0]
        page_json = {
            "title": page.title,
            "creator_name": creator_name,
            "summary": page.summary,
            "body_sections": [
                {"heading": heading, "content": content}
                for heading, content in page.body_sections.items()
            ],
        }
        print(f"  Synthesis complete ({elapsed_synth}s). Scoring...")
        result = self.score_page(page_json, moments)
        # Include synthesis time in total
        result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
        return result

    def print_report(self, result: ScoreResult) -> None:
        """Print a formatted scoring report to stdout."""
        print("\n" + "=" * 60)
        print("  STAGE 5 QUALITY SCORE REPORT")
        print("=" * 60)
        if result.error:
            print(f"\n  ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return
        for dim in DIMENSIONS:
            score = getattr(result, dim)
            bar = self._score_bar(score)
            justification = result.justifications.get(dim, "")
            print(f"\n  {dim.replace('_', ' ').title()}")
            print(f"    Score: {score:.2f}  {bar}")
            if justification:
                # Wrap justification at ~60 chars
                for line in self._wrap(justification, 56):
                    print(f"    {line}")
        print("\n" + "-" * 60)
        print(f"  Composite: {result.composite:.3f}")
        print(f"  Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")

    @staticmethod
    def _score_bar(score: float, width: int = 20) -> str:
        """Render a visual bar for a 0-1 score.

        NOTE(review): both fill glyphs had been stripped to empty strings by a
        Unicode-scrubbing copy (the bar always rendered zero-width); restored
        to full/light block characters.
        """
        filled = int(score * width)
        return "█" * filled + "░" * (width - filled)

    @staticmethod
    def _wrap(text: str, width: int) -> list[str]:
        """Simple word wrap (words longer than *width* are kept whole)."""
        words = text.split()
        lines: list[str] = []
        current = ""
        for word in words:
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return lines