test: Created chat-specific LLM-as-judge scorer (5 dimensions), SSE-par…

- "backend/pipeline/quality/chat_scorer.py" - "backend/pipeline/quality/chat_eval.py" - "backend/pipeline/quality/fixtures/chat_test_suite.yaml" - "backend/pipeline/quality/__main__.py" GSD-Task: S09/T01
2026-04-04 14:43:52 +00:00 · 2026-04-04 14:43:52 +00:00 · 90bb90e989
commit 90bb90e989
parent 183d852f31
4 changed files with 779 additions and 0 deletions
--- a/backend/pipeline/quality/__main__.py
+++ b/backend/pipeline/quality/__main__.py
@ -18,6 +18,8 @@ from pathlib import Path
 from config import get_settings
 from pipeline.llm_client import LLMClient

+from .chat_eval import ChatEvalRunner
+from .chat_scorer import ChatScoreRunner
 from .fitness import FitnessRunner
 from .optimizer import OptimizationLoop, OptimizationResult
 from .scorer import DIMENSIONS, STAGE_CONFIGS, ScoreRunner
@ -260,6 +262,36 @@ def main() -> int:
        help="Write the winning prompt back to the stage's prompt file (backs up the original first)",
    )

+    # -- chat_eval subcommand --
+    chat_parser = sub.add_parser(
+        "chat_eval",
+        help="Evaluate chat quality across a test suite of queries",
+    )
+    chat_parser.add_argument(
+        "--suite",
+        type=str,
+        required=True,
+        help="Path to a chat test suite YAML/JSON file",
+    )
+    chat_parser.add_argument(
+        "--base-url",
+        type=str,
+        default="http://localhost:8096",
+        help="Chat API base URL (default: http://localhost:8096)",
+    )
+    chat_parser.add_argument(
+        "--output",
+        type=str,
+        default="backend/pipeline/quality/results/",
+        help="Output path for results JSON (default: backend/pipeline/quality/results/)",
+    )
+    chat_parser.add_argument(
+        "--timeout",
+        type=float,
+        default=120.0,
+        help="Request timeout in seconds (default: 120)",
+    )
+
    args = parser.parse_args()

    if args.command is None:
@ -281,6 +313,9 @@ def main() -> int:
    if args.command == "apply":
        return _run_apply(args)

+    if args.command == "chat_eval":
+        return _run_chat_eval(args)
+
    return 0


@ -558,5 +593,54 @@ def _run_apply(args: argparse.Namespace) -> int:
    return 0 if success else 1


+def _run_chat_eval(args: argparse.Namespace) -> int:
+    """Run the ``chat_eval`` subcommand.
+
+    Loads the suite named by ``args.suite``, sends every query to the chat
+    endpoint at ``args.base_url``, prints a summary and writes the results
+    to ``args.output``.
+
+    Returns 0 when at least one query was scored without a request or
+    scoring error, and 1 otherwise (including a missing, unloadable or
+    empty suite).
+    """
+    suite_file = Path(args.suite)
+    if not suite_file.exists():
+        print(f"Error: suite file not found: {args.suite}", file=sys.stderr)
+        return 1
+
+    try:
+        test_cases = ChatEvalRunner.load_suite(suite_file)
+    except Exception as exc:
+        print(f"Error loading test suite: {exc}", file=sys.stderr)
+        return 1
+
+    if not test_cases:
+        print("Error: test suite contains no queries", file=sys.stderr)
+        return 1
+
+    print(f"\n  Chat Evaluation: {len(test_cases)} queries from {suite_file}")
+    print(f"  Endpoint: {args.base_url}")
+
+    # Build the judge client from the application settings, then the runner.
+    runner = ChatEvalRunner(
+        scorer=ChatScoreRunner(LLMClient(get_settings())),
+        base_url=args.base_url,
+        timeout=args.timeout,
+    )
+
+    results = runner.run_suite(test_cases)
+    runner.print_summary(results)
+
+    # A failed write is reported but does not change the exit code.
+    try:
+        written_to = runner.write_results(results, args.output)
+        print(f"  Results written to: {written_to}")
+    except OSError as exc:
+        print(f"  Warning: failed to write results: {exc}", file=sys.stderr)
+
+    # Exit code: 0 if at least one scored, 1 if all errored
+    any_scored = any(
+        r.score and not r.score.error and not r.request_error for r in results
+    )
+    return 0 if any_scored else 1
+
+
 if __name__ == "__main__":
    sys.exit(main())
--- a/backend/pipeline/quality/chat_eval.py
+++ b/backend/pipeline/quality/chat_eval.py
@ -0,0 +1,352 @@
+"""Chat evaluation harness — sends queries to the live chat endpoint, scores responses.
+
+Loads a test suite (YAML or JSON), calls the chat HTTP endpoint for each query,
+parses SSE events to collect response text and sources, then scores each using
+ChatScoreRunner. Writes results to a JSON file.
+
+Usage:
+    python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml
+    python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml --base-url http://ub01:8096
+"""
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+from pipeline.llm_client import LLMClient
+from pipeline.quality.chat_scorer import CHAT_DIMENSIONS, ChatScoreResult, ChatScoreRunner
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_BASE_URL = "http://localhost:8096"
+_CHAT_ENDPOINT = "/api/chat"
+_REQUEST_TIMEOUT = 120.0  # seconds — LLM streaming can be slow
+
+
+@dataclass
+class ChatTestCase:
+    """A single test case from the test suite."""
+
+    # Query text sent to the chat endpoint as the "query" field.
+    query: str
+    # Creator name sent as "creator" when set; a falsy value omits the field.
+    creator: str | None = None
+    # Sent as "personality_weight" only when greater than 0; also passed to the scorer.
+    personality_weight: float = 0.0
+    # Label shown in the progress output and the summary table.
+    category: str = "general"
+    # Free-text note copied into the results file.
+    description: str = ""
+
+
+@dataclass
+class ChatEvalResult:
+    """Result of evaluating a single test case."""
+
+    test_case: ChatTestCase
+    # Text accumulated from the endpoint's "token" events.
+    response: str = ""
+    # Payload of the "sources" event (empty when absent or not a list).
+    sources: list[dict] = field(default_factory=list)
+    # "cascade_tier" value taken from the "done" event, if any.
+    cascade_tier: str = ""
+    # Judge output; stays None when the request failed or the response was empty.
+    score: ChatScoreResult | None = None
+    # Set when the request raised or the endpoint returned no text.
+    request_error: str | None = None
+    # Duration of the endpoint call in seconds (monotonic clock), rounded to 2 decimals.
+    latency_seconds: float = 0.0
+
+
+class ChatEvalRunner:
+    """Runs a chat evaluation suite against a live endpoint."""
+
+    def __init__(
+        self,
+        scorer: ChatScoreRunner,
+        base_url: str = _DEFAULT_BASE_URL,
+        timeout: float = _REQUEST_TIMEOUT,
+    ) -> None:
+        """Store the scorer and the endpoint settings.
+
+        Trailing slashes are removed from ``base_url`` so it can be joined
+        with the endpoint path; ``timeout`` (seconds) is handed to the HTTP
+        client for each request.
+        """
+        self.scorer = scorer
+        self.base_url = base_url.rstrip("/")
+        self.timeout = timeout
+
+    @staticmethod
+    def load_suite(path: str | Path) -> list[ChatTestCase]:
+        """Load test cases from a YAML or JSON file.
+
+        Files ending in ``.yaml``/``.yml`` (any case) are parsed as YAML;
+        everything else is parsed as JSON.
+
+        Expected format (YAML):
+            queries:
+              - query: "How do I sidechain a bass?"
+                creator: null
+                personality_weight: 0.0
+                category: technical
+                description: "Basic sidechain compression question"
+
+        A missing or null ``queries`` key yields an empty list.
+
+        Raises:
+            ImportError: the file is YAML but PyYAML is not installed.
+            ValueError: the document is not a mapping, ``queries`` is not a
+                list, or an entry is not a mapping with a ``query`` key.
+        """
+        filepath = Path(path)
+        text = filepath.read_text(encoding="utf-8")
+
+        if filepath.suffix.lower() in (".yaml", ".yml"):
+            try:
+                import yaml
+            except ImportError as exc:
+                raise ImportError(
+                    "PyYAML is required to load YAML test suites. "
+                    "Install with: pip install pyyaml"
+                ) from exc
+            data = yaml.safe_load(text)
+        else:
+            data = json.loads(text)
+
+        # An empty YAML file parses to None and a JSON file may hold a bare
+        # list; report that clearly instead of failing on ``.get`` below.
+        if not isinstance(data, dict):
+            raise ValueError(
+                "test suite must be a mapping with a 'queries' list, "
+                f"got {type(data).__name__}"
+            )
+
+        queries = data.get("queries")
+        if queries is None:
+            queries = []
+        if not isinstance(queries, list):
+            raise ValueError(
+                f"'queries' must be a list, got {type(queries).__name__}"
+            )
+
+        cases: list[ChatTestCase] = []
+        for index, entry in enumerate(queries, 1):
+            if not isinstance(entry, dict) or "query" not in entry:
+                raise ValueError(
+                    f"query #{index} must be a mapping with a 'query' key"
+                )
+            cases.append(ChatTestCase(
+                query=entry["query"],
+                creator=entry.get("creator"),
+                # ``personality_weight: null`` is read as the default weight.
+                personality_weight=float(entry.get("personality_weight") or 0.0),
+                category=entry.get("category", "general"),
+                description=entry.get("description", ""),
+            ))
+        return cases
+
+    def run_suite(self, cases: list[ChatTestCase]) -> list[ChatEvalResult]:
+        """Run every case in order and return one result per case.
+
+        A progress line is printed before each case and its outcome
+        (request error, scoring error or composite score) after it.
+        """
+        total = len(cases)
+        outcomes: list[ChatEvalResult] = []
+
+        for position, case in enumerate(cases, 1):
+            print(f"\n  [{position}/{total}] {case.category}: {case.query[:60]}...")
+            outcome = self._run_single(case)
+            outcomes.append(outcome)
+
+            if outcome.request_error:
+                print(f"    ✗ Request error: {outcome.request_error}")
+                continue
+
+            verdict = outcome.score
+            if not verdict:
+                continue
+            if verdict.error:
+                print(f"    ✗ Scoring error: {verdict.error}")
+            else:
+                print(f"    ✓ Composite: {verdict.composite:.3f}  "
+                      f"(latency: {outcome.latency_seconds:.1f}s)")
+
+        return outcomes
+
+    def _run_single(self, case: ChatTestCase) -> ChatEvalResult:
+        """Query the endpoint for one case and score the reply.
+
+        Any exception from the request is recorded in ``request_error``
+        rather than raised; an empty reply is recorded the same way and is
+        not sent to the scorer.
+        """
+        outcome = ChatEvalResult(test_case=case)
+
+        started = time.monotonic()
+        try:
+            text, cited, tier = self._call_chat_endpoint(case)
+        except Exception as exc:
+            outcome.latency_seconds = round(time.monotonic() - started, 2)
+            outcome.request_error = str(exc)
+            logger.error("chat_eval_request_error query=%r error=%s", case.query, exc)
+            return outcome
+        outcome.latency_seconds = round(time.monotonic() - started, 2)
+
+        outcome.response = text
+        outcome.sources = cited
+        outcome.cascade_tier = tier
+
+        if text:
+            outcome.score = self.scorer.score_response(
+                query=case.query,
+                response=text,
+                sources=cited,
+                personality_weight=case.personality_weight,
+                creator_name=case.creator,
+            )
+        else:
+            outcome.request_error = "Empty response from chat endpoint"
+
+        return outcome
+
+    def _call_chat_endpoint(
+        self, case: ChatTestCase
+    ) -> tuple[str, list[dict], str]:
+        """Call the chat SSE endpoint and parse the event stream.
+
+        Posts the query (plus ``creator`` and ``personality_weight`` when
+        set) and consumes the streamed events: ``sources`` replaces the
+        source list, ``token`` appends to the response text, ``done``
+        supplies ``cascade_tier`` and ``error`` aborts the call.
+
+        Returns (accumulated_text, sources_list, cascade_tier).
+
+        Raises:
+            httpx.HTTPStatusError: the endpoint answered with an error status.
+            RuntimeError: the stream contained an ``error`` event.
+        """
+        url = f"{self.base_url}{_CHAT_ENDPOINT}"
+        payload: dict[str, Any] = {"query": case.query}
+        if case.creator:
+            payload["creator"] = case.creator
+        if case.personality_weight > 0:
+            payload["personality_weight"] = case.personality_weight
+
+        sources: list[dict] = []
+        token_parts: list[str] = []
+        cascade_tier = ""
+
+        with httpx.Client(timeout=self.timeout) as client:
+            with client.stream("POST", url, json=payload) as resp:
+                resp.raise_for_status()
+
+                buffer = ""
+                for chunk in resp.iter_text():
+                    # SSE lines may end in CRLF. Normalise the joined buffer
+                    # (not the chunk) so a CRLF split across two chunks is
+                    # still collapsed and "\n\n" always marks an event end.
+                    buffer = (buffer + chunk).replace("\r\n", "\n")
+                    while "\n\n" in buffer:
+                        event_block, buffer = buffer.split("\n\n", 1)
+                        event_type, event_data = self._parse_sse_event(event_block)
+
+                        if event_type == "sources":
+                            sources = event_data if isinstance(event_data, list) else []
+                        elif event_type == "token":
+                            token_parts.append(
+                                event_data if isinstance(event_data, str) else str(event_data)
+                            )
+                        elif event_type == "done":
+                            if isinstance(event_data, dict):
+                                cascade_tier = event_data.get("cascade_tier", "")
+                        elif event_type == "error":
+                            msg = event_data.get("message", str(event_data)) if isinstance(event_data, dict) else str(event_data)
+                            raise RuntimeError(f"Chat endpoint returned error: {msg}")
+
+        return "".join(token_parts), sources, cascade_tier
+
+    @staticmethod
+    def _parse_sse_event(block: str) -> tuple[str, Any]:
+        """Parse a single SSE event block into (event_type, data).
+
+        ``event`` and ``data`` fields are read with or without a space after
+        the colon; comment lines (starting with ``:``) and other fields are
+        ignored. Multiple ``data`` lines are joined with newlines. The data
+        is returned JSON-decoded when it is valid JSON and as the raw string
+        otherwise; ``event_type`` is ``""`` when no ``event`` field is present.
+        """
+        event_type = ""
+        data_lines: list[str] = []
+
+        # Split on CR/LF only: str.splitlines() would also break on characters
+        # such as U+2028 that can legitimately appear inside a data value.
+        # The block is not stripped, so trailing spaces in plain-text data
+        # survive.
+        for line in block.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
+            if not line or line.startswith(":"):
+                continue  # blank line or SSE comment / keep-alive
+
+            field_name, _, value = line.partition(":")
+            # Exactly one space after the colon is part of the framing.
+            if value.startswith(" "):
+                value = value[1:]
+
+            if field_name == "event":
+                event_type = value.strip()
+            elif field_name == "data":
+                data_lines.append(value)
+
+        raw_data = "\n".join(data_lines)
+        try:
+            parsed = json.loads(raw_data)
+        except (json.JSONDecodeError, ValueError):
+            parsed = raw_data  # plain text token
+
+        return event_type, parsed
+
+    @staticmethod
+    def write_results(
+        results: list[ChatEvalResult],
+        output_path: str | Path,
+    ) -> str:
+        """Write evaluation results to a JSON file. Returns the path.
+
+        ``output_path`` is treated as a directory when it already is one or
+        when it ends with a path separator (as the CLI default does). The
+        directory is created if needed and the file is named
+        ``chat_eval_<UTC timestamp>.json`` inside it. Any other path is used
+        as the output file itself, after creating its parent directory.
+
+        In the summary a query counts as scored only when it has a composite
+        and no scoring error; every other query is counted under ``errors``.
+
+        Raises:
+            OSError: the directory or file cannot be created or written.
+        """
+        out = Path(output_path)
+        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
+
+        # Creating only the parent would leave a not-yet-existing "results/"
+        # looking like a file name, so decide directory intent first.
+        wants_directory = out.is_dir() or str(output_path).endswith(("/", "\\"))
+        if wants_directory:
+            out.mkdir(parents=True, exist_ok=True)
+            filepath = out / f"chat_eval_{timestamp}.json"
+        else:
+            out.parent.mkdir(parents=True, exist_ok=True)
+            filepath = out
+
+        # Build serializable payload
+        entries: list[dict] = []
+        for r in results:
+            entry: dict[str, Any] = {
+                "query": r.test_case.query,
+                "creator": r.test_case.creator,
+                "personality_weight": r.test_case.personality_weight,
+                "category": r.test_case.category,
+                "description": r.test_case.description,
+                "response_length": len(r.response),
+                "source_count": len(r.sources),
+                "cascade_tier": r.cascade_tier,
+                "latency_seconds": r.latency_seconds,
+            }
+
+            if r.request_error:
+                entry["error"] = r.request_error
+            elif r.score:
+                entry["scores"] = r.score.scores
+                entry["composite"] = r.score.composite
+                entry["justifications"] = r.score.justifications
+                entry["scoring_time"] = r.score.elapsed_seconds
+                if r.score.error:
+                    entry["scoring_error"] = r.score.error
+
+            entries.append(entry)
+
+        # Summary stats. A failed judge call still yields a score object with
+        # composite 0.0 and no dimension scores; it must not be averaged in.
+        scored = [
+            e for e in entries
+            if "composite" in e and "scoring_error" not in e
+        ]
+        avg_composite = (
+            sum(e["composite"] for e in scored) / len(scored) if scored else 0.0
+        )
+        dim_avgs: dict[str, float] = {}
+        for dim in CHAT_DIMENSIONS:
+            vals = [e["scores"][dim] for e in scored if dim in e.get("scores", {})]
+            dim_avgs[dim] = round(sum(vals) / len(vals), 3) if vals else 0.0
+
+        payload = {
+            "timestamp": timestamp,
+            "total_queries": len(results),
+            "scored_queries": len(scored),
+            "errors": len(results) - len(scored),
+            "average_composite": round(avg_composite, 3),
+            "dimension_averages": dim_avgs,
+            "results": entries,
+        }
+
+        filepath.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+        return str(filepath)
+
+    @staticmethod
+    def print_summary(results: list[ChatEvalResult]) -> None:
+        """Print the evaluation summary to stdout.
+
+        Shows one row per successfully scored result, an AVERAGE row and the
+        list of failed queries. When nothing was scored, only the failures
+        are listed.
+        """
+        banner = "=" * 72
+
+        def list_failures(failed: list[ChatEvalResult], heading: str) -> None:
+            # Prints nothing for an empty list, so callers need not check.
+            if not failed:
+                return
+            print(heading)
+            for item in failed:
+                reason = item.request_error or (item.score.error if item.score else "unknown")
+                print(f"    - {item.test_case.query[:50]}: {reason}")
+
+        print("\n" + banner)
+        print("  CHAT EVALUATION SUMMARY")
+        print(banner)
+
+        ok = [r for r in results if r.score and not r.score.error and not r.request_error]
+        failed = [r for r in results if r.request_error or (r.score and r.score.error)]
+
+        if not ok:
+            print("\n  No successfully scored responses.\n")
+            list_failures(failed, f"  Errors: {len(failed)}")
+            print(banner + "\n")
+            return
+
+        # Header
+        print(f"\n  {'Category':<12s} {'Query':<30s} {'Comp':>5s} {'Cite':>5s} {'Struct':>6s} {'Domain':>6s} {'Ground':>6s} {'Person':>6s}")
+        print(f"  {'─'*12} {'─'*30} {'─'*5} {'─'*5} {'─'*6} {'─'*6} {'─'*6} {'─'*6}")
+
+        for r in ok:
+            s = r.score
+            assert s is not None
+            cells = " ".join((
+                f"{s.composite:5.2f}",
+                f"{s.citation_accuracy:5.2f}",
+                f"{s.response_structure:6.2f}",
+                f"{s.domain_expertise:6.2f}",
+                f"{s.source_grounding:6.2f}",
+                f"{s.personality_fidelity:6.2f}",
+            ))
+            print(f"  {r.test_case.category[:12]:<12s} {r.test_case.query[:30]:<30s} {cells}")
+
+        # Averages
+        count = len(ok)
+        mean_composite = sum(r.score.composite for r in ok) / count
+        mean_by_dim = {
+            dim: sum(r.score.scores.get(dim, 0.0) for r in ok) / count
+            for dim in CHAT_DIMENSIONS
+        }
+        average_cells = " ".join((
+            f"{mean_composite:5.2f}",
+            f"{mean_by_dim['citation_accuracy']:5.2f}",
+            f"{mean_by_dim['response_structure']:6.2f}",
+            f"{mean_by_dim['domain_expertise']:6.2f}",
+            f"{mean_by_dim['source_grounding']:6.2f}",
+            f"{mean_by_dim['personality_fidelity']:6.2f}",
+        ))
+        print(f"\n  {'AVERAGE':<12s} {'':30s} {average_cells}")
+
+        list_failures(failed, f"\n  Errors: {len(failed)}")
+
+        print(banner + "\n")
--- a/backend/pipeline/quality/chat_scorer.py
+++ b/backend/pipeline/quality/chat_scorer.py
@ -0,0 +1,271 @@
+"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
+
+Scores chat responses across 5 dimensions:
+- citation_accuracy: Are citations real and correctly numbered?
+- response_structure: Concise, well-organized, uses appropriate formatting?
+- domain_expertise: Music production terminology used naturally?
+- source_grounding: Claims backed by provided sources, no fabrication?
+- personality_fidelity: At weight>0, response reflects creator voice?
+
+Run via: python -m pipeline.quality chat_eval --suite <path>
+"""
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+
+import openai
+
+from pipeline.llm_client import LLMClient
+
+logger = logging.getLogger(__name__)
+
+# Names of the dimensions read from the judge's JSON reply. The composite
+# score is the sum of the parsed scores divided by the length of this list.
+CHAT_DIMENSIONS = [
+    "citation_accuracy",
+    "response_structure",
+    "domain_expertise",
+    "source_grounding",
+    "personality_fidelity",
+]
+
+CHAT_RUBRIC = """\
+You are an expert evaluator of AI chat response quality for a music production knowledge base.
+
+You will be given:
+1. The user's query
+2. The assistant's response
+3. The numbered source citations that were provided to the assistant
+4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
+5. The creator_name (if any)
+
+Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
+
+**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
+- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
+- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
+- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
+
+**response_structure** — Response is concise, well-organized, uses appropriate formatting
+- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
+- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
+- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
+
+**domain_expertise** — Music production terminology used naturally and correctly
+- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
+- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
+- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
+
+**source_grounding** — Claims are backed by provided sources, no fabrication
+- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
+- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
+- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
+
+**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
+- If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
+- If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
+- If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
+- If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
+
+Return ONLY a JSON object with this exact structure:
+{
+  "citation_accuracy": <float 0.0-1.0>,
+  "response_structure": <float 0.0-1.0>,
+  "domain_expertise": <float 0.0-1.0>,
+  "source_grounding": <float 0.0-1.0>,
+  "personality_fidelity": <float 0.0-1.0>,
+  "justifications": {
+    "citation_accuracy": "<1-2 sentence justification>",
+    "response_structure": "<1-2 sentence justification>",
+    "domain_expertise": "<1-2 sentence justification>",
+    "source_grounding": "<1-2 sentence justification>",
+    "personality_fidelity": "<1-2 sentence justification>"
+  }
+}
+"""
+
+
+@dataclass
+class ChatScoreResult:
+    """Outcome of scoring a chat response across quality dimensions."""
+
+    # Per-dimension scores keyed by dimension name; empty when scoring failed.
+    scores: dict[str, float] = field(default_factory=dict)
+    # Average of the dimension scores, rounded to 3 decimals by the scorer.
+    composite: float = 0.0
+    # Judge's explanation per dimension, keyed like ``scores``.
+    justifications: dict[str, str] = field(default_factory=dict)
+    # Duration of the judge call in seconds.
+    elapsed_seconds: float = 0.0
+    # Set when the judge could not be reached or its reply was malformed.
+    error: str | None = None
+
+    # Convenience properties: each returns 0.0 when its dimension is absent
+    # from ``scores``.
+    @property
+    def citation_accuracy(self) -> float:
+        return self.scores.get("citation_accuracy", 0.0)
+
+    @property
+    def response_structure(self) -> float:
+        return self.scores.get("response_structure", 0.0)
+
+    @property
+    def domain_expertise(self) -> float:
+        return self.scores.get("domain_expertise", 0.0)
+
+    @property
+    def source_grounding(self) -> float:
+        return self.scores.get("source_grounding", 0.0)
+
+    @property
+    def personality_fidelity(self) -> float:
+        return self.scores.get("personality_fidelity", 0.0)
+
+
+class ChatScoreRunner:
+    """Scores chat responses using LLM-as-judge evaluation."""
+
+    def __init__(self, client: LLMClient) -> None:
+        """Store the LLM client used to call the judge model."""
+        self.client = client
+
+    def score_response(
+        self,
+        query: str,
+        response: str,
+        sources: list[dict],
+        personality_weight: float = 0.0,
+        creator_name: str | None = None,
+    ) -> ChatScoreResult:
+        """Score a single chat response against the 5 chat quality dimensions.
+
+        Parameters
+        ----------
+        query:
+            The user's original query.
+        response:
+            The assistant's accumulated response text.
+        sources:
+            List of source citation dicts (as emitted by the SSE sources event).
+        personality_weight:
+            0.0 = encyclopedic mode, >0 = personality mode.
+        creator_name:
+            Creator name, if this was a creator-scoped query.
+
+        Returns
+        -------
+        ChatScoreResult with per-dimension scores. When the judge cannot be
+        reached, or its reply is not a JSON object, the result carries an
+        ``error`` message and no scores instead of raising.
+        """
+        sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"
+
+        user_prompt = (
+            f"## User Query\n\n{query}\n\n"
+            f"## Assistant Response\n\n{response}\n\n"
+            f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
+            f"## Metadata\n\n"
+            f"- personality_weight: {personality_weight}\n"
+            f"- creator_name: {creator_name or '(none)'}\n\n"
+            f"Score this chat response across all 5 dimensions."
+        )
+
+        t0 = time.monotonic()
+        try:
+            from pydantic import BaseModel as _BM
+            resp = self.client.complete(
+                system_prompt=CHAT_RUBRIC,
+                user_prompt=user_prompt,
+                response_model=_BM,
+                modality="chat",
+            )
+            elapsed = round(time.monotonic() - t0, 2)
+        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
+            elapsed = round(time.monotonic() - t0, 2)
+            return ChatScoreResult(
+                elapsed_seconds=elapsed,
+                error=f"Cannot reach LLM judge. Error: {exc}",
+            )
+
+        raw_text = str(resp).strip()
+        try:
+            parsed = json.loads(raw_text)
+        except json.JSONDecodeError:
+            logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
+            return ChatScoreResult(
+                elapsed_seconds=elapsed,
+                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
+            )
+
+        # Valid JSON that is not an object (list, number, string, null) has no
+        # dimension keys to read; report it as malformed rather than letting
+        # the ``.get`` calls in _parse_scores raise AttributeError.
+        if not isinstance(parsed, dict):
+            logger.error(
+                "Malformed chat judge response (not a JSON object): %.300s", raw_text
+            )
+            return ChatScoreResult(
+                elapsed_seconds=elapsed,
+                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
+            )
+
+        return self._parse_scores(parsed, elapsed)
+
+    def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
+        """Extract and validate scores from parsed JSON judge response.
+
+        Each dimension in ``CHAT_DIMENSIONS`` is read from ``parsed`` and
+        clamped to the range 0.0-1.0. A dimension that is missing, not
+        numeric, or not finite (NaN/Infinity) scores 0.0 and gets a
+        placeholder justification. The composite is the sum of the scores
+        divided by the number of dimensions, rounded to 3 decimals.
+        """
+        import math
+
+        scores: dict[str, float] = {}
+        justifications: dict[str, str] = {}
+
+        raw_justifications = parsed.get("justifications", {})
+        if not isinstance(raw_justifications, dict):
+            raw_justifications = {}
+
+        for dim in CHAT_DIMENSIONS:
+            raw = parsed.get(dim)
+            if raw is None:
+                logger.warning("Missing dimension '%s' in chat judge response", dim)
+                scores[dim] = 0.0
+                justifications[dim] = "(missing from judge response)"
+                continue
+
+            try:
+                val = float(raw)
+            except (TypeError, ValueError):
+                val = math.nan
+
+            # json.loads accepts NaN and Infinity, and max(0.0, min(1.0, nan))
+            # evaluates to 1.0, so non-finite values must be rejected before
+            # clamping or they would be recorded as a perfect score.
+            if not math.isfinite(val):
+                logger.warning("Invalid value for '%s': %r", dim, raw)
+                scores[dim] = 0.0
+                justifications[dim] = f"(invalid value: {raw!r})"
+                continue
+
+            scores[dim] = max(0.0, min(1.0, val))
+            justifications[dim] = str(raw_justifications.get(dim, ""))
+
+        composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0
+
+        return ChatScoreResult(
+            scores=scores,
+            composite=round(composite, 3),
+            justifications=justifications,
+            elapsed_seconds=elapsed,
+        )
+
+    def print_report(self, result: ChatScoreResult, query: str = "") -> None:
+        """Print a formatted scoring report for one response to stdout.
+
+        Shows the query (cut to 60 characters), then either the error or,
+        for each dimension, its score, a 20-cell bar and the justification
+        wrapped at roughly 56 characters, followed by the composite and the
+        scoring time.
+        """
+        heavy_rule = "=" * 60
+
+        print("\n" + heavy_rule)
+        print("  CHAT QUALITY SCORE REPORT")
+        if query:
+            ellipsis = "..." if len(query) > 60 else ""
+            print(f"  Query: {query[:60]}{ellipsis}")
+        print(heavy_rule)
+
+        if result.error:
+            print(f"\n  ✗ Error: {result.error}\n")
+            print(heavy_rule + "\n")
+            return
+
+        for dim in CHAT_DIMENSIONS:
+            value = result.scores.get(dim, 0.0)
+            filled = int(value * 20)
+            bar = "█" * filled + "░" * (20 - filled)
+            print(f"\n  {dim.replace('_', ' ').title()}")
+            print(f"    Score: {value:.2f}  {bar}")
+
+            note = result.justifications.get(dim, "")
+            if not note:
+                continue
+
+            # Greedy wrap: extend the last line while it stays within ~56 chars.
+            wrapped: list[str] = []
+            for word in note.split():
+                if wrapped and len(wrapped[-1]) + len(word) + 1 <= 56:
+                    wrapped[-1] = f"{wrapped[-1]} {word}"
+                else:
+                    wrapped.append(word)
+            for text_line in wrapped:
+                print(f"    {text_line}")
+
+        print("\n" + "-" * 60)
+        print(f"  Composite: {result.composite:.3f}")
+        print(f"  Time: {result.elapsed_seconds}s")
+        print(heavy_rule + "\n")
--- a/backend/pipeline/quality/fixtures/chat_test_suite.yaml
+++ b/backend/pipeline/quality/fixtures/chat_test_suite.yaml
@ -0,0 +1,72 @@
+# Chat quality evaluation test suite
+# 10 representative queries across 5 categories:
+#   - technical: How-to questions about specific production techniques
+#   - conceptual: Broader understanding questions about audio concepts
+#   - creator_encyclopedic: Creator-scoped queries at personality_weight 0.0
+#   - creator_personality: The same creator-scoped queries at personality_weight 0.7
+#   - cross_creator: Queries spanning multiple creators' approaches
+
+queries:
+  # ── Technical how-to (2) ────────────────────────────────────────────
+  - query: "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?"
+    creator: null
+    personality_weight: 0.0
+    category: technical
+    description: "Common sidechain compression setup — expects specific settings (ratio, attack, release)"
+
+  - query: "What are the best EQ settings for cleaning up a muddy vocal recording?"
+    creator: null
+    personality_weight: 0.0
+    category: technical
+    description: "Vocal EQ technique — expects frequency ranges, Q values, cut/boost guidance"
+
+  # ── Conceptual (2) ─────────────────────────────────────────────────
+  - query: "What is the difference between parallel compression and serial compression, and when should I use each?"
+    creator: null
+    personality_weight: 0.0
+    category: conceptual
+    description: "Conceptual comparison — expects clear definitions, use cases, pros/cons"
+
+  - query: "How does sample rate affect sound quality in music production?"
+    creator: null
+    personality_weight: 0.0
+    category: conceptual
+    description: "Audio fundamentals — expects Nyquist, aliasing, practical guidance"
+
+  # ── Creator-specific: encyclopedic (2) ──────────────────────────────
+  - query: "How does this creator approach sound design for bass sounds?"
+    creator: "KEOTA"
+    personality_weight: 0.0
+    category: creator_encyclopedic
+    description: "Creator-scoped query at weight=0 — should be neutral/encyclopedic about KEOTA's techniques"
+
+  - query: "What mixing techniques does this creator recommend for achieving width in a mix?"
+    creator: "Mr. Bill"
+    personality_weight: 0.0
+    category: creator_encyclopedic
+    description: "Creator-scoped query at weight=0 — neutral tone about Mr. Bill's approach"
+
+  # ── Creator-specific: personality (2) ───────────────────────────────
+  - query: "How does this creator approach sound design for bass sounds?"
+    creator: "KEOTA"
+    personality_weight: 0.7
+    category: creator_personality
+    description: "Same query as above but at weight=0.7 — should reflect KEOTA's voice and teaching style"
+
+  - query: "What mixing techniques does this creator recommend for achieving width in a mix?"
+    creator: "Mr. Bill"
+    personality_weight: 0.7
+    category: creator_personality
+    description: "Same query as above but at weight=0.7 — should reflect Mr. Bill's voice"
+
+  # ── Cross-creator (2) ──────────────────────────────────────────────
+  - query: "What are the different approaches to layering synth sounds across creators?"
+    creator: null
+    personality_weight: 0.0
+    category: cross_creator
+    description: "Cross-creator comparison — should cite multiple creators' techniques"
+
+  - query: "How do different producers approach drum processing and what plugins do they prefer?"
+    creator: null
+    personality_weight: 0.0
+    category: cross_creator
+    description: "Cross-creator comparison on drums — expects multiple perspectives with citations"