test: Created chat-specific LLM-as-judge scorer (5 dimensions), SSE-par…

- "backend/pipeline/quality/chat_scorer.py" - "backend/pipeline/quality/chat_eval.py" - "backend/pipeline/quality/fixtures/chat_test_suite.yaml" - "backend/pipeline/quality/__main__.py" GSD-Task: S09/T01
2026-04-04 14:43:52 +00:00 · 2026-04-04 14:43:52 +00:00 · 90bb90e989
commit 90bb90e989
parent 183d852f31
4 changed files with 779 additions and 0 deletions
--- a/backend/pipeline/quality/__main__.py
+++ b/backend/pipeline/quality/__main__.py
@ -18,6 +18,8 @@ from pathlib import Path
 from config import get_settings
 from pipeline.llm_client import LLMClient
 from .chat_eval import ChatEvalRunner
 from .chat_scorer import ChatScoreRunner
 from .fitness import FitnessRunner
 from .optimizer import OptimizationLoop, OptimizationResult
 from .scorer import DIMENSIONS, STAGE_CONFIGS, ScoreRunner
@ -260,6 +262,36 @@ def main() -> int:
        help="Write the winning prompt back to the stage's prompt file (backs up the original first)",
    )
    # -- chat_eval subcommand --
    chat_parser = sub.add_parser(
        "chat_eval",
        help="Evaluate chat quality across a test suite of queries",
    )
    chat_parser.add_argument(
        "--suite",
        type=str,
        required=True,
        help="Path to a chat test suite YAML/JSON file",
    )
    chat_parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8096",
        help="Chat API base URL (default: http://localhost:8096)",
    )
    chat_parser.add_argument(
        "--output",
        type=str,
        default="backend/pipeline/quality/results/",
        help="Output path for results JSON (default: backend/pipeline/quality/results/)",
    )
    chat_parser.add_argument(
        "--timeout",
        type=float,
        default=120.0,
        help="Request timeout in seconds (default: 120)",
    )
    args = parser.parse_args()
    if args.command is None:
@ -281,6 +313,9 @@ def main() -> int:
    if args.command == "apply":
        return _run_apply(args)
    if args.command == "chat_eval":
        return _run_chat_eval(args)
    return 0
@ -558,5 +593,54 @@ def _run_apply(args: argparse.Namespace) -> int:
    return 0 if success else 1
 def _run_chat_eval(args: argparse.Namespace) -> int:
    """Execute the chat_eval subcommand — evaluate chat quality across a test suite."""
    suite_path = Path(args.suite)
    if not suite_path.exists():
        print(f"Error: suite file not found: {args.suite}", file=sys.stderr)
        return 1
    # Load test cases
    try:
        cases = ChatEvalRunner.load_suite(suite_path)
    except Exception as exc:
        print(f"Error loading test suite: {exc}", file=sys.stderr)
        return 1
    if not cases:
        print("Error: test suite contains no queries", file=sys.stderr)
        return 1
    print(f"\n  Chat Evaluation: {len(cases)} queries from {suite_path}")
    print(f"  Endpoint: {args.base_url}")
    # Build scorer and runner
    settings = get_settings()
    client = LLMClient(settings)
    scorer = ChatScoreRunner(client)
    runner = ChatEvalRunner(
        scorer=scorer,
        base_url=args.base_url,
        timeout=args.timeout,
    )
    # Execute
    results = runner.run_suite(cases)
    # Print summary
    runner.print_summary(results)
    # Write results
    try:
        json_path = runner.write_results(results, args.output)
        print(f"  Results written to: {json_path}")
    except OSError as exc:
        print(f"  Warning: failed to write results: {exc}", file=sys.stderr)
    # Exit code: 0 if at least one scored, 1 if all errored
    scored = [r for r in results if r.score and not r.score.error and not r.request_error]
    return 0 if scored else 1
 if __name__ == "__main__":
    sys.exit(main())
--- a/backend/pipeline/quality/chat_eval.py
+++ b/backend/pipeline/quality/chat_eval.py
@ -0,0 +1,352 @@
 """Chat evaluation harness — sends queries to the live chat endpoint, scores responses.
 Loads a test suite (YAML or JSON), calls the chat HTTP endpoint for each query,
 parses SSE events to collect response text and sources, then scores each using
 ChatScoreRunner. Writes results to a JSON file.
 Usage:
    python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml
    python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml --base-url http://ub01:8096
 """
 from __future__ import annotations
 import json
 import logging
 import time
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 import httpx
 from pipeline.llm_client import LLMClient
 from pipeline.quality.chat_scorer import CHAT_DIMENSIONS, ChatScoreResult, ChatScoreRunner
 # Module-level logger; request failures are reported through it.
 logger = logging.getLogger(__name__)
 # Base URL used by ChatEvalRunner when the caller does not supply one.
 _DEFAULT_BASE_URL = "http://localhost:8096"
 # Path appended to the base URL to reach the streaming chat endpoint.
 _CHAT_ENDPOINT = "/api/chat"
 _REQUEST_TIMEOUT = 120.0  # seconds — LLM streaming can be slow
@dataclass
 class ChatTestCase:
    """A single test case from the test suite."""
    # Query text posted to the chat endpoint and shown to the judge.
    query: str
    # Creator to scope the request to; only sent to the endpoint when truthy.
    creator: str | None = None
    # Only included in the request payload when greater than zero; always
    # passed to the scorer.
    personality_weight: float = 0.0
    # Label used in progress output, the summary table and the JSON report.
    category: str = "general"
    # Free-text note; copied into the JSON report and otherwise unused here.
    description: str = ""
@dataclass
 class ChatEvalResult:
    """Result of evaluating a single test case."""
    # The test case this result belongs to.
    test_case: ChatTestCase
    # Concatenation of all "token" events received from the endpoint.
    response: str = ""
    # Payload of the "sources" event when it was a list, otherwise empty.
    sources: list[dict] = field(default_factory=list)
    # "cascade_tier" value taken from the "done" event, if one was sent.
    cascade_tier: str = ""
    # Judge outcome; stays None when the request failed or the response was empty.
    score: ChatScoreResult | None = None
    # Set when the endpoint call raised or returned no text.
    request_error: str | None = None
    # Wall-clock duration of the endpoint call only (scoring excluded),
    # rounded to two decimals.
    latency_seconds: float = 0.0
 class ChatEvalRunner:
    """Runs a chat evaluation suite against a live endpoint."""
    def __init__(
        self,
        scorer: ChatScoreRunner,
        base_url: str = _DEFAULT_BASE_URL,
        timeout: float = _REQUEST_TIMEOUT,
    ) -> None:
        """Store the scorer and the endpoint settings.

        Args:
            scorer: Judge used to score every non-empty response.
            base_url: Chat API base URL; trailing slashes are removed.
            timeout: Timeout handed to the HTTP client for each request.
        """
        self.scorer = scorer
        # Strip trailing slashes so the endpoint path can be appended verbatim.
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
    @staticmethod
    def load_suite(path: str | Path) -> list[ChatTestCase]:
        """Load test cases from a YAML or JSON file.
        Expected format (YAML):
            queries:
              - query: "How do I sidechain a bass?"
                creator: null
                personality_weight: 0.0
                category: technical
                description: "Basic sidechain compression question"
        """
        filepath = Path(path)
        text = filepath.read_text(encoding="utf-8")
        if filepath.suffix in (".yaml", ".yml"):
            try:
                import yaml
            except ImportError:
                raise ImportError(
                    "PyYAML is required to load YAML test suites. "
                    "Install with: pip install pyyaml"
                )
            data = yaml.safe_load(text)
        else:
            data = json.loads(text)
        queries = data.get("queries", [])
        cases: list[ChatTestCase] = []
        for q in queries:
            cases.append(ChatTestCase(
                query=q["query"],
                creator=q.get("creator"),
                personality_weight=float(q.get("personality_weight", 0.0)),
                category=q.get("category", "general"),
                description=q.get("description", ""),
            ))
        return cases
    def run_suite(self, cases: list[ChatTestCase]) -> list[ChatEvalResult]:
        """Execute all test cases sequentially, scoring each response."""
        results: list[ChatEvalResult] = []
        for i, case in enumerate(cases, 1):
            print(f"\n  [{i}/{len(cases)}] {case.category}: {case.query[:60]}...")
            result = self._run_single(case)
            results.append(result)
            if result.request_error:
                print(f"    ✗ Request error: {result.request_error}")
            elif result.score and result.score.error:
                print(f"    ✗ Scoring error: {result.score.error}")
            elif result.score:
                print(f"    ✓ Composite: {result.score.composite:.3f}  "
                      f"(latency: {result.latency_seconds:.1f}s)")
        return results
    def _run_single(self, case: ChatTestCase) -> ChatEvalResult:
        """Query the endpoint for one test case and score the reply.

        Any exception from the endpoint call is recorded on the result as
        ``request_error`` (and logged) rather than raised. An empty reply is
        also recorded as a request error and is not scored.
        """
        outcome = ChatEvalResult(test_case=case)

        started = time.monotonic()
        try:
            text, cited, tier = self._call_chat_endpoint(case)
        except Exception as exc:
            outcome.latency_seconds = round(time.monotonic() - started, 2)
            outcome.request_error = str(exc)
            logger.error("chat_eval_request_error query=%r error=%s", case.query, exc)
            return outcome
        # Latency covers the endpoint call only, not the judge call below.
        outcome.latency_seconds = round(time.monotonic() - started, 2)

        outcome.response = text
        outcome.sources = cited
        outcome.cascade_tier = tier

        if text:
            outcome.score = self.scorer.score_response(
                query=case.query,
                response=text,
                sources=cited,
                personality_weight=case.personality_weight,
                creator_name=case.creator,
            )
        else:
            outcome.request_error = "Empty response from chat endpoint"
        return outcome
    def _call_chat_endpoint(
        self, case: ChatTestCase
    ) -> tuple[str, list[dict], str]:
        """Call the chat SSE endpoint and parse the event stream.

        Posts ``{"query": ...}`` (plus ``creator`` when set and
        ``personality_weight`` when positive) and consumes the streamed
        response event by event:

        - ``sources``: replaces the source list (non-list payloads become ``[]``)
        - ``token``: appended to the response text
        - ``done``: supplies ``cascade_tier`` when the payload is a mapping
        - ``error``: aborts with ``RuntimeError``

        Returns (accumulated_text, sources_list, cascade_tier).

        Raises:
            RuntimeError: The stream contained an ``error`` event.
            Exception: Whatever the HTTP client raises for transport
                failures or a non-success status (``raise_for_status``).
        """
        url = f"{self.base_url}{_CHAT_ENDPOINT}"
        payload: dict[str, Any] = {"query": case.query}
        if case.creator:
            payload["creator"] = case.creator
        if case.personality_weight > 0:
            payload["personality_weight"] = case.personality_weight

        sources: list[dict] = []
        # Collected in a list and joined once to avoid repeated string copies.
        tokens: list[str] = []
        cascade_tier = ""

        with httpx.Client(timeout=self.timeout) as client:
            with client.stream("POST", url, json=payload) as resp:
                resp.raise_for_status()
                buffer = ""
                for chunk in resp.iter_text():
                    # SSE lines may end in CRLF as well as LF. Normalising the
                    # whole buffer (not just the new chunk) also repairs a
                    # CRLF pair that was split across two chunks, so events
                    # can always be cut on a blank line ("\n\n").
                    buffer = (buffer + chunk).replace("\r\n", "\n")
                    while "\n\n" in buffer:
                        event_block, buffer = buffer.split("\n\n", 1)
                        event_type, event_data = self._parse_sse_event(event_block)
                        if event_type == "sources":
                            sources = event_data if isinstance(event_data, list) else []
                        elif event_type == "token":
                            # NOTE(review): the parser JSON-decodes payloads, so a
                            # plain-text token that is itself valid JSON (e.g. "42")
                            # comes back as a non-string and is re-stringified here.
                            tokens.append(
                                event_data if isinstance(event_data, str) else str(event_data)
                            )
                        elif event_type == "done":
                            if isinstance(event_data, dict):
                                cascade_tier = event_data.get("cascade_tier", "")
                        elif event_type == "error":
                            msg = (
                                event_data.get("message", str(event_data))
                                if isinstance(event_data, dict)
                                else str(event_data)
                            )
                            raise RuntimeError(f"Chat endpoint returned error: {msg}")
        return "".join(tokens), sources, cascade_tier
    @staticmethod
    def _parse_sse_event(block: str) -> tuple[str, Any]:
        """Parse a single SSE event block into (event_type, data)."""
        event_type = ""
        data_lines: list[str] = []
        for line in block.strip().splitlines():
            if line.startswith("event: "):
                event_type = line[7:].strip()
            elif line.startswith("data: "):
                data_lines.append(line[6:])
            elif line.startswith("data:"):
                data_lines.append(line[5:])
        raw_data = "\n".join(data_lines)
        try:
            parsed = json.loads(raw_data)
        except (json.JSONDecodeError, ValueError):
            parsed = raw_data  # plain text token
        return event_type, parsed
    @staticmethod
    def write_results(
        results: list[ChatEvalResult],
        output_path: str | Path,
    ) -> str:
        """Write evaluation results to a JSON file. Returns the path.

        ``output_path`` is treated as a directory when it already is one or
        when it is written with a trailing path separator (for example
        ``"results/"``). The directory is created if needed and the report
        is saved inside it as ``chat_eval_<UTC timestamp>.json``. Any other
        path is used as the report file itself, with its parent directories
        created on demand.

        The report holds one entry per result plus summary statistics.
        Only entries that were scored without a judge error count towards
        ``scored_queries`` and the averages; everything else is counted in
        ``errors``.

        Raises:
            OSError: The directory or file could not be created or written.
        """
        out = Path(output_path)
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

        # Path() discards a trailing separator, so the raw argument is
        # inspected to recognise a directory that does not exist yet.
        # Without this, "results/" would be written as a file named "results".
        wants_directory = out.is_dir() or str(output_path).endswith(("/", "\\"))
        if wants_directory:
            out.mkdir(parents=True, exist_ok=True)
            filepath = out / f"chat_eval_{timestamp}.json"
        else:
            out.parent.mkdir(parents=True, exist_ok=True)
            filepath = out

        # Build serializable payload
        entries: list[dict] = []
        for r in results:
            entry: dict[str, Any] = {
                "query": r.test_case.query,
                "creator": r.test_case.creator,
                "personality_weight": r.test_case.personality_weight,
                "category": r.test_case.category,
                "description": r.test_case.description,
                "response_length": len(r.response),
                "source_count": len(r.sources),
                "cascade_tier": r.cascade_tier,
                "latency_seconds": r.latency_seconds,
            }
            if r.request_error:
                entry["error"] = r.request_error
            elif r.score:
                entry["scores"] = r.score.scores
                entry["composite"] = r.score.composite
                entry["justifications"] = r.score.justifications
                entry["scoring_time"] = r.score.elapsed_seconds
                if r.score.error:
                    entry["scoring_error"] = r.score.error
            entries.append(entry)

        # Summary stats. A failed judge call still carries a (zero)
        # composite, so it must be excluded explicitly or it would drag
        # the averages down and be counted as scored.
        scored = [
            e for e in entries
            if "composite" in e and "scoring_error" not in e
        ]
        avg_composite = (
            sum(e["composite"] for e in scored) / len(scored) if scored else 0.0
        )
        dim_avgs: dict[str, float] = {}
        for dim in CHAT_DIMENSIONS:
            vals = [e["scores"][dim] for e in scored if dim in e.get("scores", {})]
            dim_avgs[dim] = round(sum(vals) / len(vals), 3) if vals else 0.0

        payload = {
            "timestamp": timestamp,
            "total_queries": len(results),
            "scored_queries": len(scored),
            "errors": len(results) - len(scored),
            "average_composite": round(avg_composite, 3),
            "dimension_averages": dim_avgs,
            "results": entries,
        }
        filepath.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return str(filepath)
    @staticmethod
    def print_summary(results: list[ChatEvalResult]) -> None:
        """Print a summary table of evaluation results."""
        print("\n" + "=" * 72)
        print("  CHAT EVALUATION SUMMARY")
        print("=" * 72)
        scored = [r for r in results if r.score and not r.score.error and not r.request_error]
        errored = [r for r in results if r.request_error or (r.score and r.score.error)]
        if not scored:
            print("\n  No successfully scored responses.\n")
            if errored:
                print(f"  Errors: {len(errored)}")
                for r in errored:
                    err = r.request_error or (r.score.error if r.score else "unknown")
                    print(f"    - {r.test_case.query[:50]}: {err}")
            print("=" * 72 + "\n")
            return
        # Header
        print(f"\n  {'Category':<12s} {'Query':<30s} {'Comp':>5s} {'Cite':>5s} {'Struct':>6s} {'Domain':>6s} {'Ground':>6s} {'Person':>6s}")
        print(f"  {'─'*12} {'─'*30} {'─'*5} {'─'*5} {'─'*6} {'─'*6} {'─'*6} {'─'*6}")
        for r in scored:
            s = r.score
            assert s is not None
            q = r.test_case.query[:30]
            cat = r.test_case.category[:12]
            print(
                f"  {cat:<12s} {q:<30s} "
                f"{s.composite:5.2f} "
                f"{s.citation_accuracy:5.2f} "
                f"{s.response_structure:6.2f} "
                f"{s.domain_expertise:6.2f} "
                f"{s.source_grounding:6.2f} "
                f"{s.personality_fidelity:6.2f}"
            )
        # Averages
        avg_comp = sum(r.score.composite for r in scored) / len(scored)
        avg_dims = {}
        for dim in CHAT_DIMENSIONS:
            vals = [r.score.scores.get(dim, 0.0) for r in scored]
            avg_dims[dim] = sum(vals) / len(vals)
        print(f"\n  {'AVERAGE':<12s} {'':30s} "
              f"{avg_comp:5.2f} "
              f"{avg_dims['citation_accuracy']:5.2f} "
              f"{avg_dims['response_structure']:6.2f} "
              f"{avg_dims['domain_expertise']:6.2f} "
              f"{avg_dims['source_grounding']:6.2f} "
              f"{avg_dims['personality_fidelity']:6.2f}")
        if errored:
            print(f"\n  Errors: {len(errored)}")
            for r in errored:
                err = r.request_error or (r.score.error if r.score else "unknown")
                print(f"    - {r.test_case.query[:50]}: {err}")
        print("=" * 72 + "\n")
--- a/backend/pipeline/quality/chat_scorer.py
+++ b/backend/pipeline/quality/chat_scorer.py
@ -0,0 +1,271 @@
 """Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
 Scores chat responses across 5 dimensions:
 - citation_accuracy: Are citations real and correctly numbered?
 - response_structure: Concise, well-organized, uses appropriate formatting?
 - domain_expertise: Music production terminology used naturally?
 - source_grounding: Claims backed by provided sources, no fabrication?
 - personality_fidelity: At weight>0, response reflects creator voice?
 Run via: python -m pipeline.quality chat_eval --suite <path>
 """
 from __future__ import annotations
 import json
 import logging
 import time
 from dataclasses import dataclass, field
 import openai
 from pipeline.llm_client import LLMClient
 logger = logging.getLogger(__name__)
 # Names of the dimensions the judge scores. The score parser reads one value
 # per name from the judge's JSON and the composite is the plain mean over
 # this list, so the order here is also the order used in printed reports.
 CHAT_DIMENSIONS = [
    "citation_accuracy",
    "response_structure",
    "domain_expertise",
    "source_grounding",
    "personality_fidelity",
 ]
 # System prompt sent to the judge LLM. The JSON keys it asks for must stay in
 # sync with CHAT_DIMENSIONS, because scores are looked up by those names.
 CHAT_RUBRIC = """\
 You are an expert evaluator of AI chat response quality for a music production knowledge base.
 You will be given:
 1. The user's query
 2. The assistant's response
 3. The numbered source citations that were provided to the assistant
 4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
 5. The creator_name (if any)
 Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
 **citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
 - 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
 - 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
 - 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
 **response_structure** — Response is concise, well-organized, uses appropriate formatting
 - 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
 - 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
 - 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
 **domain_expertise** — Music production terminology used naturally and correctly
 - 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
 - 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
 - 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
 **source_grounding** — Claims are backed by provided sources, no fabrication
 - 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
 - 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
 - 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
 **personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
 - If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
 - If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
 - If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
 - If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
 Return ONLY a JSON object with this exact structure:
 {
  "citation_accuracy": <float 0.0-1.0>,
  "response_structure": <float 0.0-1.0>,
  "domain_expertise": <float 0.0-1.0>,
  "source_grounding": <float 0.0-1.0>,
  "personality_fidelity": <float 0.0-1.0>,
  "justifications": {
    "citation_accuracy": "<1-2 sentence justification>",
    "response_structure": "<1-2 sentence justification>",
    "domain_expertise": "<1-2 sentence justification>",
    "source_grounding": "<1-2 sentence justification>",
    "personality_fidelity": "<1-2 sentence justification>"
  }
 }
 """
@dataclass
class ChatScoreResult:
    """Scores, justifications and timing for one judged chat response.

    ``error`` is set instead of scores when the judge could not be reached
    or returned something unusable.
    """

    # Per-dimension score, keyed by dimension name.
    scores: dict[str, float] = field(default_factory=dict)
    # Mean of the dimension scores, as computed by the scorer.
    composite: float = 0.0
    # Judge's explanation per dimension, keyed like ``scores``.
    justifications: dict[str, str] = field(default_factory=dict)
    # Duration of the judge call in seconds.
    elapsed_seconds: float = 0.0
    # Human-readable failure description, or None on success.
    error: str | None = None

    def _dimension(self, name: str) -> float:
        """Return the score stored for *name*, or 0.0 when it is absent."""
        return self.scores.get(name, 0.0)

    # Read-only shortcuts, one per dimension.
    @property
    def citation_accuracy(self) -> float:
        return self._dimension("citation_accuracy")

    @property
    def response_structure(self) -> float:
        return self._dimension("response_structure")

    @property
    def domain_expertise(self) -> float:
        return self._dimension("domain_expertise")

    @property
    def source_grounding(self) -> float:
        return self._dimension("source_grounding")

    @property
    def personality_fidelity(self) -> float:
        return self._dimension("personality_fidelity")
 class ChatScoreRunner:
    """Scores chat responses using LLM-as-judge evaluation."""
    def __init__(self, client: LLMClient) -> None:
        """Keep the LLM client used to call the judge model."""
        self.client = client
    def score_response(
        self,
        query: str,
        response: str,
        sources: list[dict],
        personality_weight: float = 0.0,
        creator_name: str | None = None,
    ) -> ChatScoreResult:
        """Score a single chat response against the 5 chat quality dimensions.

        The query, response, sources and metadata are formatted into one
        user prompt and sent to the judge together with ``CHAT_RUBRIC``.
        The reply is expected to be a JSON object; a Markdown code fence
        around it (three backticks, optionally followed by ``json``) is
        tolerated and removed before parsing.

        Parameters
        ----------
        query:
            The user's original query.
        response:
            The assistant's accumulated response text.
        sources:
            List of source citation dicts (as emitted by the SSE sources event).
        personality_weight:
            0.0 = encyclopedic mode, >0 = personality mode.
        creator_name:
            Creator name, if this was a creator-scoped query.

        Returns
        -------
        ChatScoreResult with per-dimension scores. When the judge cannot be
        reached (connection error or timeout) or its reply is not valid
        JSON, the result carries ``error`` and no scores instead.
        """
        sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"

        user_prompt = (
            f"## User Query\n\n{query}\n\n"
            f"## Assistant Response\n\n{response}\n\n"
            f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
            f"## Metadata\n\n"
            f"- personality_weight: {personality_weight}\n"
            f"- creator_name: {creator_name or '(none)'}\n\n"
            f"Score this chat response across all 5 dimensions."
        )

        t0 = time.monotonic()
        try:
            # NOTE(review): the bare pydantic BaseModel is passed as
            # response_model; presumably this makes the client request JSON
            # output — confirm against LLMClient.complete.
            from pydantic import BaseModel as _BM
            resp = self.client.complete(
                system_prompt=CHAT_RUBRIC,
                user_prompt=user_prompt,
                response_model=_BM,
                modality="chat",
            )
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            return ChatScoreResult(
                elapsed_seconds=round(time.monotonic() - t0, 2),
                error=f"Cannot reach LLM judge. Error: {exc}",
            )
        elapsed = round(time.monotonic() - t0, 2)

        raw_text = str(resp).strip()

        # Judges often wrap the JSON in a Markdown fence despite being told
        # to return JSON only. Drop the opening fence line and the closing
        # fence, and keep raw_text untouched for the diagnostics below.
        candidate = raw_text
        if candidate.startswith("```"):
            _, _, fenced = candidate.partition("\n")
            fenced = fenced.rstrip()
            if fenced.endswith("```"):
                fenced = fenced[:-3]
            candidate = fenced.strip()

        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
            )

        return self._parse_scores(parsed, elapsed)
    def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
        """Extract and validate scores from parsed JSON judge response.

        Every name in ``CHAT_DIMENSIONS`` receives a score:

        - a finite number is clamped into ``[0.0, 1.0]``;
        - a missing dimension scores ``0.0`` with a placeholder justification;
        - a value that is not a finite number (wrong type, unparsable
          string, NaN, infinity, or too large for a float) scores ``0.0``
          and the justification records the offending value.

        The composite is the mean over all dimensions, rounded to three
        decimals.

        Parameters
        ----------
        parsed:
            Decoded judge reply. Anything other than a mapping yields an
            error result instead of scores.
        elapsed:
            Duration of the judge call, copied onto the result.
        """
        import math

        if not isinstance(parsed, dict):
            # Valid JSON that is not an object (list, string, number) has no
            # dimension keys to read; report it rather than crash on .get().
            logger.error("Chat judge response is not a JSON object: %r", parsed)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=(
                    "Malformed judge response. Expected a JSON object, "
                    f"got {type(parsed).__name__}"
                ),
            )

        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}

        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}

        for dim in CHAT_DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in chat judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue

            try:
                val = float(raw)
            except (TypeError, ValueError, OverflowError):
                val = math.nan
            # NaN must be rejected before clamping: min(1.0, nan) evaluates
            # to 1.0, which would turn garbage into a perfect score.
            if not math.isfinite(val):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue

            scores[dim] = max(0.0, min(1.0, val))
            justifications[dim] = str(raw_justifications.get(dim, ""))

        composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0

        return ChatScoreResult(
            scores=scores,
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )
    def print_report(self, result: ChatScoreResult, query: str = "") -> None:
        """Print a human-readable report for one scored response.

        Shows the query (cut to 60 characters), then for each dimension its
        score, a 20-cell bar and the word-wrapped justification, followed
        by the composite score and scoring time. When ``result.error`` is
        set only the error is shown.
        """
        heavy_rule = "=" * 60

        def wrap(text: str) -> list[str]:
            # Greedy word wrap: start a new line once adding the next word
            # (plus a space) would exceed 56 characters.
            wrapped: list[str] = []
            pending = ""
            for word in text.split():
                if not pending:
                    pending = word
                elif len(pending) + len(word) + 1 > 56:
                    wrapped.append(pending)
                    pending = word
                else:
                    pending = f"{pending} {word}"
            if pending:
                wrapped.append(pending)
            return wrapped

        print("\n" + heavy_rule)
        print("  CHAT QUALITY SCORE REPORT")
        if query:
            print(f"  Query: {query[:60]}{'...' if len(query) > 60 else ''}")
        print(heavy_rule)

        if result.error:
            print(f"\n  ✗ Error: {result.error}\n")
            print(heavy_rule + "\n")
            return

        for dim in CHAT_DIMENSIONS:
            score = result.scores.get(dim, 0.0)
            cells = int(score * 20)
            bar = "█" * cells + "░" * (20 - cells)
            print(f"\n  {dim.replace('_', ' ').title()}")
            print(f"    Score: {score:.2f}  {bar}")

            justification = result.justifications.get(dim, "")
            if justification:
                for text_line in wrap(justification):
                    print(f"    {text_line}")

        print("\n" + "-" * 60)
        print(f"  Composite: {result.composite:.3f}")
        print(f"  Time: {result.elapsed_seconds}s")
        print(heavy_rule + "\n")
--- a/backend/pipeline/quality/fixtures/chat_test_suite.yaml
+++ b/backend/pipeline/quality/fixtures/chat_test_suite.yaml
@ -0,0 +1,72 @@
 # Chat quality evaluation test suite
 # 10 representative queries across 5 categories:
 #   - technical: How-to questions about specific production techniques
 #   - conceptual: Broader understanding questions about audio concepts
 #   - creator_encyclopedic: Creator-scoped queries at personality_weight 0.0
 #   - creator_personality: The same creator-scoped queries at personality_weight 0.7
 #   - cross_creator: Queries spanning multiple creators' approaches
 queries:
  # ── Technical how-to (2) ────────────────────────────────────────────
  - query: "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?"
    creator: null
    personality_weight: 0.0
    category: technical
    description: "Common sidechain compression setup — expects specific settings (ratio, attack, release)"
  - query: "What are the best EQ settings for cleaning up a muddy vocal recording?"
    creator: null
    personality_weight: 0.0
    category: technical
    description: "Vocal EQ technique — expects frequency ranges, Q values, cut/boost guidance"
  # ── Conceptual (2) ─────────────────────────────────────────────────
  - query: "What is the difference between parallel compression and serial compression, and when should I use each?"
    creator: null
    personality_weight: 0.0
    category: conceptual
    description: "Conceptual comparison — expects clear definitions, use cases, pros/cons"
  - query: "How does sample rate affect sound quality in music production?"
    creator: null
    personality_weight: 0.0
    category: conceptual
    description: "Audio fundamentals — expects Nyquist, aliasing, practical guidance"
  # ── Creator-specific: encyclopedic (2) ──────────────────────────────
  - query: "How does this creator approach sound design for bass sounds?"
    creator: "KEOTA"
    personality_weight: 0.0
    category: creator_encyclopedic
    description: "Creator-scoped query at weight=0 — should be neutral/encyclopedic about KEOTA's techniques"
  - query: "What mixing techniques does this creator recommend for achieving width in a mix?"
    creator: "Mr. Bill"
    personality_weight: 0.0
    category: creator_encyclopedic
    description: "Creator-scoped query at weight=0 — neutral tone about Mr. Bill's approach"
  # ── Creator-specific: personality (2) ───────────────────────────────
  - query: "How does this creator approach sound design for bass sounds?"
    creator: "KEOTA"
    personality_weight: 0.7
    category: creator_personality
    description: "Same query as above but at weight=0.7 — should reflect KEOTA's voice and teaching style"
  - query: "What mixing techniques does this creator recommend for achieving width in a mix?"
    creator: "Mr. Bill"
    personality_weight: 0.7
    category: creator_personality
    description: "Same query as above but at weight=0.7 — should reflect Mr. Bill's voice"
  # ── Cross-creator (2) ──────────────────────────────────────────────
  - query: "What are the different approaches to layering synth sounds across creators?"
    creator: null
    personality_weight: 0.0
    category: cross_creator
    description: "Cross-creator comparison — should cite multiple creators' techniques"
  - query: "How do different producers approach drum processing and what plugins do they prefer?"
    creator: null
    personality_weight: 0.0
    category: cross_creator
    description: "Cross-creator comparison on drums — expects multiple perspectives with citations"