- "backend/pipeline/quality/voice_dial.py" - "backend/pipeline/quality/scorer.py" - "backend/pipeline/quality/__main__.py" GSD-Task: S02/T02
368 lines
14 KiB
Python
368 lines
14 KiB
Python
"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions.
|
||
|
||
Evaluates a synthesized technique page against source moments on:
|
||
1. Structural quality — section naming, count, paragraph depth
|
||
2. Content specificity — concrete details vs vague generalities
|
||
3. Voice preservation — direct quotes, attributed opinions, personality
|
||
4. Readability / flow — synthesis quality, logical ordering, no redundancy
|
||
5. Factual fidelity — no hallucinated specifics, grounded in source moments
|
||
|
||
Run via: python -m pipeline.quality score --file <path>
|
||
"""
from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass, field

import openai
from pydantic import BaseModel

from pipeline.llm_client import LLMClient
from pipeline.quality.voice_dial import VoiceDial

logger = logging.getLogger(__name__)


# ── Scoring rubric (hardcoded for iteration speed) ───────────────────────────
SCORING_RUBRIC = """\
|
||
You are an expert evaluator of synthesized technique articles for music production education.
|
||
|
||
You will be given:
|
||
1. A synthesized technique page (JSON with title, summary, body_sections)
|
||
2. The source key moments (transcript excerpts, summaries, tags) used to create it
|
||
|
||
Evaluate the page across these 5 dimensions, scoring each 0.0 to 1.0:
|
||
|
||
**structural** — Section naming and organization
|
||
- 0.9-1.0: Well-named specific sections (not generic "Overview"/"Tips"), appropriate count (3-6), 2-5 paragraphs per section
|
||
- 0.5-0.7: Acceptable structure but some generic section names or uneven depth
|
||
- 0.0-0.3: Poor structure — too few/many sections, generic names, single-paragraph sections
|
||
|
||
**content_specificity** — Concrete technical details
|
||
- 0.9-1.0: Rich in frequencies (Hz), time values (ms), ratios, plugin names, specific settings, dB values
|
||
- 0.5-0.7: Some specific details but padded with vague statements ("adjust to taste", "experiment with settings")
|
||
- 0.0-0.3: Mostly vague generalities with few concrete values from the source material
|
||
|
||
**voice_preservation** — Creator's authentic voice
|
||
- 0.9-1.0: Direct quotes preserved, opinions attributed to creator by name, personality and strong views retained
|
||
- 0.5-0.7: Some paraphrased references to creator's views but few direct quotes
|
||
- 0.0-0.3: Encyclopedia style — creator's voice completely smoothed out, no attribution
|
||
|
||
**readability** — Synthesis quality and flow
|
||
- 0.9-1.0: Reads as a cohesive article, related info merged, logical flow, no redundancy or contradiction
|
||
- 0.5-0.7: Generally readable but some awkward transitions or minor repetition
|
||
- 0.0-0.3: Feels like concatenated bullet points, disjointed, redundant passages
|
||
|
||
**factual_fidelity** — Grounded in source material
|
||
- 0.9-1.0: Every claim traceable to source moments, no invented plugin names/settings/techniques
|
||
- 0.5-0.7: Mostly grounded but 1-2 details seem embellished or not directly from sources
|
||
- 0.0-0.3: Contains hallucinated specifics — plugin names, settings, or techniques not in sources
|
||
|
||
Return ONLY a JSON object with this exact structure:
|
||
{
|
||
"structural": <float 0.0-1.0>,
|
||
"content_specificity": <float 0.0-1.0>,
|
||
"voice_preservation": <float 0.0-1.0>,
|
||
"readability": <float 0.0-1.0>,
|
||
"factual_fidelity": <float 0.0-1.0>,
|
||
"justifications": {
|
||
"structural": "<1-2 sentence justification>",
|
||
"content_specificity": "<1-2 sentence justification>",
|
||
"voice_preservation": "<1-2 sentence justification>",
|
||
"readability": "<1-2 sentence justification>",
|
||
"factual_fidelity": "<1-2 sentence justification>"
|
||
}
|
||
}
|
||
"""

DIMENSIONS = [
    "structural",
    "content_specificity",
    "voice_preservation",
    "readability",
    "factual_fidelity",
]
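
# The composite reported by ScoreRunner is the unweighted mean of these five
# dimension scores (see _parse_scores below).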


# ── Result type ──────────────────────────────────────────────────────────────

@dataclass
class ScoreResult:
    """Outcome of scoring a technique page across 5 quality dimensions."""

    structural: float = 0.0
    content_specificity: float = 0.0
    voice_preservation: float = 0.0
    readability: float = 0.0
    factual_fidelity: float = 0.0
    composite: float = 0.0
    justifications: dict[str, str] = field(default_factory=dict)
    elapsed_seconds: float = 0.0
    error: str | None = None
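
# Note: error paths return ScoreResult(error=...) with every score left at its
# 0.0 default, so callers can check result.error before trusting the numbers.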


# ── Runner ───────────────────────────────────────────────────────────────────

class ScoreRunner:
    """Scores a Stage 5 technique page using LLM-as-judge evaluation."""

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    def score_page(
        self,
        page_json: dict,
        moments: list[dict],
    ) -> ScoreResult:
        """Evaluate a technique page against source moments.

        Parameters
        ----------
        page_json:
            Synthesized page dict (title, summary, body_sections).
        moments:
            Source key moments with transcript_excerpt, summary, etc.

        Returns
        -------
        ScoreResult with per-dimension scores and justifications.
        """
        # Build the user prompt with the page and source moments
        user_prompt = (
            "## Synthesized Technique Page\n\n"
            f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n"
            "## Source Key Moments\n\n"
            f"```json\n{json.dumps(moments, indent=2)}\n```\n\n"
            "Score this page across all 5 dimensions."
        )

        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt=SCORING_RUBRIC,
                user_prompt=user_prompt,
                response_model=BaseModel,  # triggers JSON mode
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )

        # Parse the LLM judge response
        raw_text = str(resp).strip()
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed judge response (not JSON): %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
            )

        return self._parse_scores(parsed, elapsed)

    def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult:
        """Extract and validate scores from parsed JSON response."""
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}

        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}

        for dim in DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue

            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue

            justifications[dim] = str(raw_justifications.get(dim, ""))

        composite = sum(scores.values()) / len(DIMENSIONS)

        return ScoreResult(
            structural=scores["structural"],
            content_specificity=scores["content_specificity"],
            voice_preservation=scores["voice_preservation"],
            readability=scores["readability"],
            factual_fidelity=scores["factual_fidelity"],
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )
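
    # Defensive by design: a judge reply of {"structural": 1.4, "readability": "high"}
    # clamps structural to 1.0, zeroes readability as invalid, treats the three
    # missing dimensions as 0.0, and still yields a composite (0.2 here).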

    def synthesize_and_score(
        self,
        moments: list[dict],
        creator_name: str,
        voice_level: float,
    ) -> ScoreResult:
        """Re-synthesize from source moments with a voice-dialed prompt, then score.

        Loads the stage 5 synthesis prompt from disk, applies the VoiceDial
        modifier at the given voice_level, calls the LLM to produce a
        SynthesisResult, then scores the first page.

        Parameters
        ----------
        moments:
            Source key moments (dicts with summary, transcript_excerpt, etc.)
        creator_name:
            Creator name to inject into the synthesis prompt.
        voice_level:
            Float 0.0–1.0 controlling voice preservation intensity.

        Returns
        -------
        ScoreResult with per-dimension scores after voice-dialed re-synthesis.
        """
        from pipeline.schemas import SynthesisResult
        from pipeline.stages import _get_stage_config, _load_prompt

        # Load and modify the stage 5 system prompt
        try:
            base_prompt = _load_prompt("stage5_synthesis.txt")
        except FileNotFoundError as exc:
            return ScoreResult(error=f"Prompt file not found: {exc}")

        dial = VoiceDial(base_prompt)
        modified_prompt = dial.modify(voice_level)
        band = dial.band_name(voice_level)

        # Build user prompt in the same format as _synthesize_chunk
        moments_json = json.dumps(moments, indent=2)
        user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_json}\n</moments>"

        model_override, modality = _get_stage_config(5)

        print(f" Re-synthesizing at voice_level={voice_level} (band={band})...")

        t0 = time.monotonic()
        try:
            raw = self.client.complete(
                system_prompt=modified_prompt,
                user_prompt=user_prompt,
                response_model=SynthesisResult,
                modality=modality,
                model_override=model_override,
            )
            elapsed_synth = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed_synth = round(time.monotonic() - t0, 2)
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=(
                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
                    f"Error: {exc}"
                ),
            )

        # Parse synthesis response
        raw_text = str(raw).strip()
        try:
            synthesis = self.client.parse_response(raw_text, SynthesisResult)
        except Exception as exc:  # JSONDecodeError, ValueError, validation errors
            logger.error("Malformed synthesis response: %.300s", raw_text)
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error=f"Malformed synthesis response: {exc}. Raw excerpt: {raw_text[:200]}",
            )

        if not synthesis.pages:
            return ScoreResult(
                elapsed_seconds=elapsed_synth,
                error="Synthesis returned no pages.",
            )

        # Score the first page
        page = synthesis.pages[0]
        page_json = {
            "title": page.title,
            "creator_name": creator_name,
            "summary": page.summary,
            "body_sections": [
                {"heading": heading, "content": content}
                for heading, content in page.body_sections.items()
            ],
        }

        print(f" Synthesis complete ({elapsed_synth}s). Scoring...")
        result = self.score_page(page_json, moments)
        # Include synthesis time in total
        result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
        return result

    def print_report(self, result: ScoreResult) -> None:
        """Print a formatted scoring report to stdout."""
        print("\n" + "=" * 60)
        print(" STAGE 5 QUALITY SCORE REPORT")
        print("=" * 60)

        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return

        for dim in DIMENSIONS:
            score = getattr(result, dim)
            bar = self._score_bar(score)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            if justification:
                # Wrap justification at ~60 chars
                for line in self._wrap(justification, 56):
                    print(f" {line}")

        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")

    @staticmethod
    def _score_bar(score: float, width: int = 20) -> str:
        """Render a visual bar for a 0-1 score."""
        filled = int(score * width)
        return "█" * filled + "░" * (width - filled)

    @staticmethod
    def _wrap(text: str, width: int) -> list[str]:
        """Simple word wrap."""
        words = text.split()
        lines: list[str] = []
        current = ""
        for word in words:
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return lines
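
    # e.g. _wrap("every claim is traceable to the source moments", 20)
    #      -> ["every claim is", "traceable to the", "source moments"]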