- "backend/pipeline/quality/chat_scorer.py" - "backend/pipeline/quality/chat_eval.py" - "backend/pipeline/quality/fixtures/chat_test_suite.yaml" - "backend/pipeline/quality/__main__.py" GSD-Task: S09/T01
271 lines
11 KiB
Python
271 lines
11 KiB
Python
"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
|
|
|
|
Scores chat responses across 5 dimensions:
|
|
- citation_accuracy: Are citations real and correctly numbered?
|
|
- response_structure: Concise, well-organized, uses appropriate formatting?
|
|
- domain_expertise: Music production terminology used naturally?
|
|
- source_grounding: Claims backed by provided sources, no fabrication?
|
|
- personality_fidelity: At weight>0, response reflects creator voice?
|
|
|
|
Run via: python -m pipeline.quality chat_eval --suite <path>
|
|
"""
|
|
from __future__ import annotations

import json
import logging
import textwrap
import time
from dataclasses import dataclass, field

import openai

from pipeline.llm_client import LLMClient
|
|
|
|
# Module-level logger, namespaced to this module per standard logging practice.
logger = logging.getLogger(__name__)
|
|
|
|
# The five judged quality dimensions. Both score parsing (_parse_scores) and
# report ordering (print_report) iterate this list, and the JSON keys the
# judge must return (see CHAT_RUBRIC) match these names exactly.
CHAT_DIMENSIONS = [
    "citation_accuracy",
    "response_structure",
    "domain_expertise",
    "source_grounding",
    "personality_fidelity",
]
|
|
|
|
CHAT_RUBRIC = """\
|
|
You are an expert evaluator of AI chat response quality for a music production knowledge base.
|
|
|
|
You will be given:
|
|
1. The user's query
|
|
2. The assistant's response
|
|
3. The numbered source citations that were provided to the assistant
|
|
4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
|
|
5. The creator_name (if any)
|
|
|
|
Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
|
|
|
|
**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
|
|
- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
|
|
- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
|
|
- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
|
|
|
|
**response_structure** — Response is concise, well-organized, uses appropriate formatting
|
|
- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
|
|
- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
|
|
- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
|
|
|
|
**domain_expertise** — Music production terminology used naturally and correctly
|
|
- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
|
|
- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
|
|
- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
|
|
|
|
**source_grounding** — Claims are backed by provided sources, no fabrication
|
|
- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
|
|
- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
|
|
- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
|
|
|
|
**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
|
|
- If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
|
|
- If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
|
|
- If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
|
|
- If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
|
|
|
|
Return ONLY a JSON object with this exact structure:
|
|
{
|
|
"citation_accuracy": <float 0.0-1.0>,
|
|
"response_structure": <float 0.0-1.0>,
|
|
"domain_expertise": <float 0.0-1.0>,
|
|
"source_grounding": <float 0.0-1.0>,
|
|
"personality_fidelity": <float 0.0-1.0>,
|
|
"justifications": {
|
|
"citation_accuracy": "<1-2 sentence justification>",
|
|
"response_structure": "<1-2 sentence justification>",
|
|
"domain_expertise": "<1-2 sentence justification>",
|
|
"source_grounding": "<1-2 sentence justification>",
|
|
"personality_fidelity": "<1-2 sentence justification>"
|
|
}
|
|
}
|
|
"""
|
|
|
|
|
|
@dataclass
class ChatScoreResult:
    """Outcome of scoring a chat response across quality dimensions.

    ``scores`` and ``justifications`` are keyed by dimension name; the
    per-dimension properties below are thin read-only accessors that
    default to 0.0 when a dimension is absent.
    """

    scores: dict[str, float] = field(default_factory=dict)
    composite: float = 0.0
    justifications: dict[str, str] = field(default_factory=dict)
    elapsed_seconds: float = 0.0
    error: str | None = None

    def _dim(self, name: str) -> float:
        # Single lookup shared by every convenience property.
        return self.scores.get(name, 0.0)

    @property
    def citation_accuracy(self) -> float:
        """Citation-correctness score (0.0 when unscored)."""
        return self._dim("citation_accuracy")

    @property
    def response_structure(self) -> float:
        """Structure/formatting score (0.0 when unscored)."""
        return self._dim("response_structure")

    @property
    def domain_expertise(self) -> float:
        """Domain-terminology score (0.0 when unscored)."""
        return self._dim("domain_expertise")

    @property
    def source_grounding(self) -> float:
        """Source-grounding score (0.0 when unscored)."""
        return self._dim("source_grounding")

    @property
    def personality_fidelity(self) -> float:
        """Creator-voice fidelity score (0.0 when unscored)."""
        return self._dim("personality_fidelity")
|
|
|
|
|
|
class ChatScoreRunner:
    """Scores chat responses using LLM-as-judge evaluation.

    One LLM call evaluates a response against CHAT_RUBRIC; the judge's JSON
    output is validated and folded into a ChatScoreResult. Failures (network,
    malformed or wrong-shaped JSON) produce a result with ``error`` set rather
    than raising.
    """

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    def score_response(
        self,
        query: str,
        response: str,
        sources: list[dict],
        personality_weight: float = 0.0,
        creator_name: str | None = None,
    ) -> ChatScoreResult:
        """Score a single chat response against the 5 chat quality dimensions.

        Parameters
        ----------
        query:
            The user's original query.
        response:
            The assistant's accumulated response text.
        sources:
            List of source citation dicts (as emitted by the SSE sources event).
        personality_weight:
            0.0 = encyclopedic mode, >0 = personality mode.
        creator_name:
            Creator name, if this was a creator-scoped query.

        Returns
        -------
        ChatScoreResult with per-dimension scores; on judge failure the
        result carries ``error`` and empty scores.
        """
        sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"

        user_prompt = (
            f"## User Query\n\n{query}\n\n"
            f"## Assistant Response\n\n{response}\n\n"
            f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
            f"## Metadata\n\n"
            f"- personality_weight: {personality_weight}\n"
            f"- creator_name: {creator_name or '(none)'}\n\n"
            f"Score this chat response across all 5 dimensions."
        )

        t0 = time.monotonic()
        try:
            # NOTE(review): passing the bare pydantic BaseModel as
            # response_model looks like a "no structured schema" placeholder —
            # confirm against LLMClient.complete's contract.
            from pydantic import BaseModel as _BM

            resp = self.client.complete(
                system_prompt=CHAT_RUBRIC,
                user_prompt=user_prompt,
                response_model=_BM,
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Cannot reach LLM judge. Error: {exc}",
            )

        # Judges frequently wrap JSON in a markdown code fence; tolerate that
        # instead of failing json.loads on the backticks.
        raw_text = self._strip_code_fence(str(resp))
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
            )

        if not isinstance(parsed, dict):
            # Valid JSON but wrong shape (e.g. a bare list or string) would
            # otherwise crash _parse_scores with AttributeError on .get().
            logger.error("Chat judge returned non-object JSON: %.300s", raw_text)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Judge response is not a JSON object. Raw excerpt: {raw_text[:200]}",
            )

        return self._parse_scores(parsed, elapsed)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return *text* with a surrounding markdown code fence removed.

        Handles both ``` and ```json opening fences. Text without a fence
        is returned stripped of surrounding whitespace only.
        """
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped
        lines = stripped.splitlines()
        lines = lines[1:]  # drop opening fence (may carry a language tag)
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]  # drop closing fence
        return "\n".join(lines).strip()

    def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
        """Extract and validate scores from parsed JSON judge response.

        Missing or non-numeric dimensions score 0.0 with a placeholder
        justification; numeric values are clamped to [0.0, 1.0]. The
        composite is the unweighted mean across all dimensions.
        """
        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}

        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}

        for dim in CHAT_DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in chat judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue

            try:
                val = float(raw)
                scores[dim] = max(0.0, min(1.0, val))  # clamp to valid range
            except (TypeError, ValueError):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue

            justifications[dim] = str(raw_justifications.get(dim, ""))

        composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0

        return ChatScoreResult(
            scores=scores,
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def print_report(self, result: ChatScoreResult, query: str = "") -> None:
        """Print a formatted chat scoring report to stdout."""
        print("\n" + "=" * 60)
        print(" CHAT QUALITY SCORE REPORT")
        if query:
            print(f" Query: {query[:60]}{'...' if len(query) > 60 else ''}")
        print("=" * 60)

        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return

        for dim in CHAT_DIMENSIONS:
            score = result.scores.get(dim, 0.0)
            filled = int(score * 20)
            bar = "█" * filled + "░" * (20 - filled)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            # textwrap replaces the hand-rolled wrapper: ~56-char lines,
            # never splitting a word (matching the previous behavior).
            for line in textwrap.wrap(
                justification, width=56, break_long_words=False, break_on_hyphens=False
            ):
                print(f" {line}")

        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")