chrysopedia/backend/pipeline/quality/chat_scorer.py
jlightner 846db2aad5 test: Created chat-specific LLM-as-judge scorer (5 dimensions), SSE-par…
- "backend/pipeline/quality/chat_scorer.py"
- "backend/pipeline/quality/chat_eval.py"
- "backend/pipeline/quality/fixtures/chat_test_suite.yaml"
- "backend/pipeline/quality/__main__.py"

GSD-Task: S09/T01
2026-04-04 14:43:52 +00:00

271 lines
11 KiB
Python

"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
Scores chat responses across 5 dimensions:
- citation_accuracy: Are citations real and correctly numbered?
- response_structure: Concise, well-organized, uses appropriate formatting?
- domain_expertise: Music production terminology used naturally?
- source_grounding: Claims backed by provided sources, no fabrication?
- personality_fidelity: At weight>0, response reflects creator voice?
Run via: python -m pipeline.quality chat_eval --suite <path>
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass, field
import openai
from pipeline.llm_client import LLMClient
logger = logging.getLogger(__name__)
# The five chat quality dimensions scored by the LLM judge, in report order.
# Each entry must match a top-level float key in the judge's JSON reply (see
# CHAT_RUBRIC) and has a matching convenience property on ChatScoreResult.
CHAT_DIMENSIONS = [
    "citation_accuracy",
    "response_structure",
    "domain_expertise",
    "source_grounding",
    "personality_fidelity",
]
# System prompt for the LLM judge. Defines the 0.0-1.0 scoring rubric for the
# five CHAT_DIMENSIONS and demands a strict JSON-only reply, which
# ChatScoreRunner._parse_scores then parses. Edit with care: the JSON schema
# spelled out at the bottom must stay in sync with CHAT_DIMENSIONS.
CHAT_RUBRIC = """\
You are an expert evaluator of AI chat response quality for a music production knowledge base.
You will be given:
1. The user's query
2. The assistant's response
3. The numbered source citations that were provided to the assistant
4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
5. The creator_name (if any)
Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
**response_structure** — Response is concise, well-organized, uses appropriate formatting
- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
**domain_expertise** — Music production terminology used naturally and correctly
- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
**source_grounding** — Claims are backed by provided sources, no fabrication
- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
- If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
- If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
- If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
- If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
Return ONLY a JSON object with this exact structure:
{
"citation_accuracy": <float 0.0-1.0>,
"response_structure": <float 0.0-1.0>,
"domain_expertise": <float 0.0-1.0>,
"source_grounding": <float 0.0-1.0>,
"personality_fidelity": <float 0.0-1.0>,
"justifications": {
"citation_accuracy": "<1-2 sentence justification>",
"response_structure": "<1-2 sentence justification>",
"domain_expertise": "<1-2 sentence justification>",
"source_grounding": "<1-2 sentence justification>",
"personality_fidelity": "<1-2 sentence justification>"
}
}
"""
@dataclass
class ChatScoreResult:
    """Outcome of scoring a chat response across quality dimensions.

    A failed judge call is represented by ``error`` being set; in that case
    ``scores`` stays empty and every dimension property reads as 0.0.
    """

    scores: dict[str, float] = field(default_factory=dict)       # per-dimension 0.0-1.0
    composite: float = 0.0                                       # mean of all dimensions
    justifications: dict[str, str] = field(default_factory=dict) # judge's reasoning per dimension
    elapsed_seconds: float = 0.0                                 # wall-clock judge latency
    error: str | None = None                                     # set when scoring failed

    def _dim(self, name: str) -> float:
        """Return the score for one dimension, defaulting to 0.0 when absent."""
        return self.scores.get(name, 0.0)

    # Convenience read-only accessors, one per CHAT_DIMENSIONS entry.
    @property
    def citation_accuracy(self) -> float:
        return self._dim("citation_accuracy")

    @property
    def response_structure(self) -> float:
        return self._dim("response_structure")

    @property
    def domain_expertise(self) -> float:
        return self._dim("domain_expertise")

    @property
    def source_grounding(self) -> float:
        return self._dim("source_grounding")

    @property
    def personality_fidelity(self) -> float:
        return self._dim("personality_fidelity")
class ChatScoreRunner:
    """Scores chat responses using LLM-as-judge evaluation."""

    def __init__(self, client: LLMClient) -> None:
        # Client used to call the judge model (see score_response).
        self.client = client
def score_response(
self,
query: str,
response: str,
sources: list[dict],
personality_weight: float = 0.0,
creator_name: str | None = None,
) -> ChatScoreResult:
"""Score a single chat response against the 5 chat quality dimensions.
Parameters
----------
query:
The user's original query.
response:
The assistant's accumulated response text.
sources:
List of source citation dicts (as emitted by the SSE sources event).
personality_weight:
0.0 = encyclopedic mode, >0 = personality mode.
creator_name:
Creator name, if this was a creator-scoped query.
Returns
-------
ChatScoreResult with per-dimension scores.
"""
sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"
user_prompt = (
f"## User Query\n\n{query}\n\n"
f"## Assistant Response\n\n{response}\n\n"
f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
f"## Metadata\n\n"
f"- personality_weight: {personality_weight}\n"
f"- creator_name: {creator_name or '(none)'}\n\n"
f"Score this chat response across all 5 dimensions."
)
t0 = time.monotonic()
try:
from pydantic import BaseModel as _BM
resp = self.client.complete(
system_prompt=CHAT_RUBRIC,
user_prompt=user_prompt,
response_model=_BM,
modality="chat",
)
elapsed = round(time.monotonic() - t0, 2)
except (openai.APIConnectionError, openai.APITimeoutError) as exc:
elapsed = round(time.monotonic() - t0, 2)
return ChatScoreResult(
elapsed_seconds=elapsed,
error=f"Cannot reach LLM judge. Error: {exc}",
)
raw_text = str(resp).strip()
try:
parsed = json.loads(raw_text)
except json.JSONDecodeError:
logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
return ChatScoreResult(
elapsed_seconds=elapsed,
error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
)
return self._parse_scores(parsed, elapsed)
def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
"""Extract and validate scores from parsed JSON judge response."""
scores: dict[str, float] = {}
justifications: dict[str, str] = {}
raw_justifications = parsed.get("justifications", {})
if not isinstance(raw_justifications, dict):
raw_justifications = {}
for dim in CHAT_DIMENSIONS:
raw = parsed.get(dim)
if raw is None:
logger.warning("Missing dimension '%s' in chat judge response", dim)
scores[dim] = 0.0
justifications[dim] = "(missing from judge response)"
continue
try:
val = float(raw)
scores[dim] = max(0.0, min(1.0, val))
except (TypeError, ValueError):
logger.warning("Invalid value for '%s': %r", dim, raw)
scores[dim] = 0.0
justifications[dim] = f"(invalid value: {raw!r})"
continue
justifications[dim] = str(raw_justifications.get(dim, ""))
composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0
return ChatScoreResult(
scores=scores,
composite=round(composite, 3),
justifications=justifications,
elapsed_seconds=elapsed,
)
def print_report(self, result: ChatScoreResult, query: str = "") -> None:
"""Print a formatted chat scoring report to stdout."""
print("\n" + "=" * 60)
print(" CHAT QUALITY SCORE REPORT")
if query:
print(f" Query: {query[:60]}{'...' if len(query) > 60 else ''}")
print("=" * 60)
if result.error:
print(f"\n ✗ Error: {result.error}\n")
print("=" * 60 + "\n")
return
for dim in CHAT_DIMENSIONS:
score = result.scores.get(dim, 0.0)
filled = int(score * 20)
bar = "" * filled + "" * (20 - filled)
justification = result.justifications.get(dim, "")
print(f"\n {dim.replace('_', ' ').title()}")
print(f" Score: {score:.2f} {bar}")
if justification:
# Simple word wrap at ~56 chars
words = justification.split()
lines: list[str] = []
current = ""
for word in words:
if current and len(current) + len(word) + 1 > 56:
lines.append(current)
current = word
else:
current = f"{current} {word}" if current else word
if current:
lines.append(current)
for line in lines:
print(f" {line}")
print("\n" + "-" * 60)
print(f" Composite: {result.composite:.3f}")
print(f" Time: {result.elapsed_seconds}s")
print("=" * 60 + "\n")