"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses. Scores chat responses across 5 dimensions: - citation_accuracy: Are citations real and correctly numbered? - response_structure: Concise, well-organized, uses appropriate formatting? - domain_expertise: Music production terminology used naturally? - source_grounding: Claims backed by provided sources, no fabrication? - personality_fidelity: At weight>0, response reflects creator voice? Run via: python -m pipeline.quality chat_eval --suite """ from __future__ import annotations import json import logging import time from dataclasses import dataclass, field import openai from pipeline.llm_client import LLMClient logger = logging.getLogger(__name__) CHAT_DIMENSIONS = [ "citation_accuracy", "response_structure", "domain_expertise", "source_grounding", "personality_fidelity", ] CHAT_RUBRIC = """\ You are an expert evaluator of AI chat response quality for a music production knowledge base. You will be given: 1. The user's query 2. The assistant's response 3. The numbered source citations that were provided to the assistant 4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected) 5. The creator_name (if any) Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0: **citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources - 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations - 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers - 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims **response_structure** — Response is concise, well-organized, uses appropriate formatting - 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded) - 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help - 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan **domain_expertise** — Music production terminology used naturally and correctly - 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer - 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused - 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly **source_grounding** — Claims are backed by provided sources, no fabrication - 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources) - 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources - 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source **personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight - If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5. - If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic. - If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator. - If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona. Return ONLY a JSON object with this exact structure: { "citation_accuracy": , "response_structure": , "domain_expertise": , "source_grounding": , "personality_fidelity": , "justifications": { "citation_accuracy": "<1-2 sentence justification>", "response_structure": "<1-2 sentence justification>", "domain_expertise": "<1-2 sentence justification>", "source_grounding": "<1-2 sentence justification>", "personality_fidelity": "<1-2 sentence justification>" } } """ @dataclass class ChatScoreResult: """Outcome of scoring a chat response across quality dimensions.""" scores: dict[str, float] = field(default_factory=dict) composite: float = 0.0 justifications: dict[str, str] = field(default_factory=dict) elapsed_seconds: float = 0.0 error: str | None = None # Convenience properties @property def citation_accuracy(self) -> float: return self.scores.get("citation_accuracy", 0.0) @property def response_structure(self) -> float: return self.scores.get("response_structure", 0.0) @property def domain_expertise(self) -> float: return self.scores.get("domain_expertise", 0.0) @property def source_grounding(self) -> float: return self.scores.get("source_grounding", 0.0) @property def personality_fidelity(self) -> float: return self.scores.get("personality_fidelity", 0.0) class ChatScoreRunner: """Scores chat responses using LLM-as-judge evaluation.""" def __init__(self, client: LLMClient) -> None: self.client = client def score_response( self, query: str, response: str, sources: list[dict], personality_weight: float = 0.0, creator_name: str | None = None, ) -> ChatScoreResult: """Score a single chat response against the 5 chat quality dimensions. Parameters ---------- query: The user's original query. response: The assistant's accumulated response text. sources: List of source citation dicts (as emitted by the SSE sources event). personality_weight: 0.0 = encyclopedic mode, >0 = personality mode. creator_name: Creator name, if this was a creator-scoped query. Returns ------- ChatScoreResult with per-dimension scores. """ sources_block = json.dumps(sources, indent=2) if sources else "(no sources)" user_prompt = ( f"## User Query\n\n{query}\n\n" f"## Assistant Response\n\n{response}\n\n" f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n" f"## Metadata\n\n" f"- personality_weight: {personality_weight}\n" f"- creator_name: {creator_name or '(none)'}\n\n" f"Score this chat response across all 5 dimensions." ) t0 = time.monotonic() try: from pydantic import BaseModel as _BM resp = self.client.complete( system_prompt=CHAT_RUBRIC, user_prompt=user_prompt, response_model=_BM, modality="chat", ) elapsed = round(time.monotonic() - t0, 2) except (openai.APIConnectionError, openai.APITimeoutError) as exc: elapsed = round(time.monotonic() - t0, 2) return ChatScoreResult( elapsed_seconds=elapsed, error=f"Cannot reach LLM judge. Error: {exc}", ) raw_text = str(resp).strip() try: parsed = json.loads(raw_text) except json.JSONDecodeError: logger.error("Malformed chat judge response (not JSON): %.300s", raw_text) return ChatScoreResult( elapsed_seconds=elapsed, error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}", ) return self._parse_scores(parsed, elapsed) def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult: """Extract and validate scores from parsed JSON judge response.""" scores: dict[str, float] = {} justifications: dict[str, str] = {} raw_justifications = parsed.get("justifications", {}) if not isinstance(raw_justifications, dict): raw_justifications = {} for dim in CHAT_DIMENSIONS: raw = parsed.get(dim) if raw is None: logger.warning("Missing dimension '%s' in chat judge response", dim) scores[dim] = 0.0 justifications[dim] = "(missing from judge response)" continue try: val = float(raw) scores[dim] = max(0.0, min(1.0, val)) except (TypeError, ValueError): logger.warning("Invalid value for '%s': %r", dim, raw) scores[dim] = 0.0 justifications[dim] = f"(invalid value: {raw!r})" continue justifications[dim] = str(raw_justifications.get(dim, "")) composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0 return ChatScoreResult( scores=scores, composite=round(composite, 3), justifications=justifications, elapsed_seconds=elapsed, ) def print_report(self, result: ChatScoreResult, query: str = "") -> None: """Print a formatted chat scoring report to stdout.""" print("\n" + "=" * 60) print(" CHAT QUALITY SCORE REPORT") if query: print(f" Query: {query[:60]}{'...' if len(query) > 60 else ''}") print("=" * 60) if result.error: print(f"\n ✗ Error: {result.error}\n") print("=" * 60 + "\n") return for dim in CHAT_DIMENSIONS: score = result.scores.get(dim, 0.0) filled = int(score * 20) bar = "█" * filled + "░" * (20 - filled) justification = result.justifications.get(dim, "") print(f"\n {dim.replace('_', ' ').title()}") print(f" Score: {score:.2f} {bar}") if justification: # Simple word wrap at ~56 chars words = justification.split() lines: list[str] = [] current = "" for word in words: if current and len(current) + len(word) + 1 > 56: lines.append(current) current = word else: current = f"{current} {word}" if current else word if current: lines.append(current) for line in lines: print(f" {line}") print("\n" + "-" * 60) print(f" Composite: {result.composite:.3f}") print(f" Time: {result.elapsed_seconds}s") print("=" * 60 + "\n")