def _run_chat_eval(args: argparse.Namespace) -> int:
    """Run the ``chat_eval`` subcommand and return a process exit code.

    Loads the suite named by ``args.suite``, sends every query to the chat
    endpoint at ``args.base_url`` (per-request timeout ``args.timeout``),
    scores the responses with an LLM judge, prints a summary and writes the
    results to ``args.output``.

    Returns 0 when at least one response was scored without a request or
    scoring error; returns 1 when the suite is missing, unreadable or empty,
    or when every query failed. A failure to write the results file is only
    reported as a warning and does not affect the exit code.
    """
    suite_file = Path(args.suite)
    if not suite_file.exists():
        print(f"Error: suite file not found: {args.suite}", file=sys.stderr)
        return 1

    # Top-level CLI boundary: any loader failure becomes a one-line message.
    try:
        test_cases = ChatEvalRunner.load_suite(suite_file)
    except Exception as exc:
        print(f"Error loading test suite: {exc}", file=sys.stderr)
        return 1

    if not test_cases:
        print("Error: test suite contains no queries", file=sys.stderr)
        return 1

    print(f"\n Chat Evaluation: {len(test_cases)} queries from {suite_file}")
    print(f" Endpoint: {args.base_url}")

    # The judge shares the pipeline's LLM client configuration.
    judge = ChatScoreRunner(LLMClient(get_settings()))
    harness = ChatEvalRunner(
        scorer=judge,
        base_url=args.base_url,
        timeout=args.timeout,
    )

    outcomes = harness.run_suite(test_cases)
    harness.print_summary(outcomes)

    try:
        written_to = harness.write_results(outcomes, args.output)
    except OSError as exc:
        print(f" Warning: failed to write results: {exc}", file=sys.stderr)
    else:
        print(f" Results written to: {written_to}")

    # Exit code: 0 if at least one scored, 1 if all errored
    any_scored = any(
        r.score and not r.score.error and not r.request_error
        for r in outcomes
    )
    return 0 if any_scored else 1
logger = logging.getLogger(__name__)

_DEFAULT_BASE_URL = "http://localhost:8096"
_CHAT_ENDPOINT = "/api/chat"
_REQUEST_TIMEOUT = 120.0  # seconds — LLM streaming can be slow


@dataclass
class ChatTestCase:
    """A single test case from the test suite."""

    query: str  # text sent to the chat endpoint as "query"
    creator: str | None = None  # sent as "creator" only when truthy
    personality_weight: float = 0.0  # sent only when > 0
    category: str = "general"  # free-form label shown in reports
    description: str = ""  # human-readable note copied into the results file


@dataclass
class ChatEvalResult:
    """Result of evaluating a single test case."""

    test_case: ChatTestCase
    response: str = ""  # concatenated "token" event payloads
    sources: list[dict] = field(default_factory=list)  # last "sources" event
    cascade_tier: str = ""  # taken from the "done" event, if any
    score: ChatScoreResult | None = None  # None when the request failed
    request_error: str | None = None  # set on HTTP/stream failure or empty reply
    latency_seconds: float = 0.0  # wall time of the endpoint call only


class ChatEvalRunner:
    """Runs a chat evaluation suite against a live endpoint."""

    def __init__(
        self,
        scorer: ChatScoreRunner,
        base_url: str = _DEFAULT_BASE_URL,
        timeout: float = _REQUEST_TIMEOUT,
    ) -> None:
        """Store the judge, the endpoint base URL and the request timeout.

        A trailing ``/`` on *base_url* is removed so the endpoint path can be
        appended verbatim.
        """
        self.scorer = scorer
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def load_suite(path: str | Path) -> list[ChatTestCase]:
        """Load test cases from a YAML or JSON file.

        Files ending in ``.yaml``/``.yml`` (any letter case) are parsed with
        PyYAML's ``safe_load``; everything else is parsed as JSON.

        Expected format (YAML):
            queries:
              - query: "How do I sidechain a bass?"
                creator: null
                personality_weight: 0.0
                category: technical
                description: "Basic sidechain compression question"

        Only ``query`` is required. Missing or null optional fields fall back
        to the ``ChatTestCase`` defaults. A missing or null ``queries`` key
        yields an empty list.

        Raises:
            ImportError: a YAML suite was given but PyYAML is not installed.
            ValueError: the document is not a mapping, ``queries`` is not a
                list, or an entry is not a mapping containing ``query``.
            OSError: the file cannot be read.
        """
        filepath = Path(path)
        text = filepath.read_text(encoding="utf-8")

        if filepath.suffix.lower() in (".yaml", ".yml"):
            try:
                import yaml
            except ImportError as exc:
                raise ImportError(
                    "PyYAML is required to load YAML test suites. "
                    "Install with: pip install pyyaml"
                ) from exc
            data = yaml.safe_load(text)
        else:
            data = json.loads(text)

        # An empty YAML document loads as None; a bare list or scalar is
        # equally unusable. Fail with a message instead of AttributeError.
        if not isinstance(data, dict):
            raise ValueError(
                f"Test suite {filepath} must be a mapping with a 'queries' "
                f"list, got {type(data).__name__}"
            )

        queries = data.get("queries") or []
        if not isinstance(queries, list):
            raise ValueError(
                f"'queries' in {filepath} must be a list, "
                f"got {type(queries).__name__}"
            )

        cases: list[ChatTestCase] = []
        for index, q in enumerate(queries, 1):
            if not isinstance(q, dict) or "query" not in q:
                raise ValueError(
                    f"Test suite entry #{index} in {filepath} must be a "
                    f"mapping with a 'query' key"
                )
            weight = q.get("personality_weight")
            cases.append(ChatTestCase(
                query=q["query"],
                creator=q.get("creator"),
                personality_weight=float(weight) if weight is not None else 0.0,
                category=q.get("category") or "general",
                description=q.get("description") or "",
            ))
        return cases

    def run_suite(self, cases: list[ChatTestCase]) -> list[ChatEvalResult]:
        """Execute all test cases sequentially, scoring each response.

        Prints one progress line per case plus a one-line outcome, and
        returns the results in suite order.
        """
        results: list[ChatEvalResult] = []
        total = len(cases)

        for i, case in enumerate(cases, 1):
            print(f"\n [{i}/{total}] {case.category}: {case.query[:60]}...")
            result = self._run_single(case)
            results.append(result)

            if result.request_error:
                print(f" ✗ Request error: {result.request_error}")
            elif result.score and result.score.error:
                print(f" ✗ Scoring error: {result.score.error}")
            elif result.score:
                print(f" ✓ Composite: {result.score.composite:.3f} "
                      f"(latency: {result.latency_seconds:.1f}s)")

        return results

    def _run_single(self, case: ChatTestCase) -> ChatEvalResult:
        """Execute a single test case: call endpoint, parse SSE, score.

        Never raises for a per-case failure: request errors are stored in
        ``request_error`` and judge failures in ``score.error``, so one bad
        case cannot abort the rest of the suite.
        """
        eval_result = ChatEvalResult(test_case=case)

        # Call the chat endpoint; latency covers the endpoint call only.
        t0 = time.monotonic()
        try:
            response_text, sources, cascade_tier = self._call_chat_endpoint(case)
        except Exception as exc:
            eval_result.latency_seconds = round(time.monotonic() - t0, 2)
            eval_result.request_error = str(exc)
            logger.error("chat_eval_request_error query=%r error=%s", case.query, exc)
            return eval_result
        eval_result.latency_seconds = round(time.monotonic() - t0, 2)

        eval_result.response = response_text
        eval_result.sources = sources
        eval_result.cascade_tier = cascade_tier

        if not response_text:
            eval_result.request_error = "Empty response from chat endpoint"
            return eval_result

        # Score the response. The scorer itself only handles connection and
        # timeout errors; anything else is recorded here rather than
        # propagated, mirroring how request failures are treated above.
        try:
            eval_result.score = self.scorer.score_response(
                query=case.query,
                response=response_text,
                sources=sources,
                personality_weight=case.personality_weight,
                creator_name=case.creator,
            )
        except Exception as exc:
            logger.error("chat_eval_scoring_error query=%r error=%s", case.query, exc)
            eval_result.score = ChatScoreResult(error=f"Scoring failed: {exc}")

        return eval_result

    def _call_chat_endpoint(
        self, case: ChatTestCase
    ) -> tuple[str, list[dict], str]:
        """Call the chat SSE endpoint and parse the event stream.

        Returns (accumulated_text, sources_list, cascade_tier).

        Raises:
            httpx.HTTPStatusError: the endpoint answered with a 4xx/5xx status.
            RuntimeError: the stream contained an ``error`` event.
        """
        url = f"{self.base_url}{_CHAT_ENDPOINT}"
        payload: dict[str, Any] = {"query": case.query}
        if case.creator:
            payload["creator"] = case.creator
        if case.personality_weight > 0:
            payload["personality_weight"] = case.personality_weight

        sources: list[dict] = []
        tokens: list[str] = []
        cascade_tier = ""

        with httpx.Client(timeout=self.timeout) as client:
            with client.stream("POST", url, json=payload) as resp:
                resp.raise_for_status()

                for block in self._iter_sse_blocks(resp.iter_text()):
                    event_type, raw = self._split_sse_event(block)
                    data = self._decode_sse_data(raw)

                    if event_type == "sources":
                        sources = data if isinstance(data, list) else []
                    elif event_type == "token":
                        # A JSON-encoded string is used decoded. Anything else
                        # is appended as sent, so a plain-text token such as
                        # " 12" or "true" is not rewritten by a JSON round trip.
                        tokens.append(data if isinstance(data, str) else raw)
                    elif event_type == "done":
                        if isinstance(data, dict):
                            cascade_tier = data.get("cascade_tier", "")
                    elif event_type == "error":
                        msg = data.get("message", str(data)) if isinstance(data, dict) else str(data)
                        raise RuntimeError(f"Chat endpoint returned error: {msg}")

        return "".join(tokens), sources, cascade_tier

    @staticmethod
    def _iter_sse_blocks(chunks: Any) -> Any:
        """Yield complete SSE event blocks from an iterable of text chunks.

        Events are delimited by a blank line. CRLF line endings are
        normalised to LF first, so streams using ``\\r\\n\\r\\n`` as the
        delimiter are framed correctly even when a ``\\r\\n`` pair is split
        across two chunks. Text left over after the last blank line is an
        incomplete event and is discarded.
        """
        buffer = ""
        for chunk in chunks:
            # Normalising the whole buffer (not just the chunk) also repairs
            # a "\r" that ended the previous chunk.
            buffer = (buffer + chunk).replace("\r\n", "\n")
            while "\n\n" in buffer:
                block, buffer = buffer.split("\n\n", 1)
                if block.strip():
                    yield block

    @staticmethod
    def _split_sse_event(block: str) -> tuple[str, str]:
        """Split one SSE event block into (event_type, raw_data_text).

        Recognises ``event`` and ``data`` fields with or without a single
        space after the colon; every other line (comments, ``id``, ``retry``)
        is ignored. Multiple ``data`` lines are joined with newlines.
        """
        event_type = ""
        data_lines: list[str] = []

        for line in block.strip().splitlines():
            name, colon, value = line.partition(":")
            if not colon:
                continue
            if value.startswith(" "):
                value = value[1:]  # only one optional space belongs to the syntax
            if name == "event":
                event_type = value.strip()
            elif name == "data":
                data_lines.append(value)

        return event_type, "\n".join(data_lines)

    @staticmethod
    def _decode_sse_data(raw_data: str) -> Any:
        """Return *raw_data* decoded as JSON, or unchanged if it is not JSON."""
        try:
            return json.loads(raw_data)
        except ValueError:  # includes json.JSONDecodeError
            return raw_data  # plain text token

    @staticmethod
    def _parse_sse_event(block: str) -> tuple[str, Any]:
        """Parse a single SSE event block into (event_type, data).

        ``data`` is the JSON-decoded payload when the data lines form valid
        JSON, otherwise the raw text.
        """
        event_type, raw_data = ChatEvalRunner._split_sse_event(block)
        return event_type, ChatEvalRunner._decode_sse_data(raw_data)

    @staticmethod
    def write_results(
        results: list[ChatEvalResult],
        output_path: str | Path,
    ) -> str:
        """Write evaluation results to a JSON file. Returns the path.

        *output_path* is treated as a directory when it already is one or
        when it ends with a path separator; the directory is created if
        needed and a ``chat_eval_<UTC timestamp>.json`` file is written inside
        it. Otherwise *output_path* is the file to write and its parent
        directory is created.

        Entries whose scoring failed keep their ``scores``/``composite`` keys
        but are excluded from ``scored_queries`` and from the averages, and
        are counted under ``errors``.

        Raises:
            OSError: the directory or file cannot be created or written.
        """
        import os

        out = Path(output_path)
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

        # Path() drops a trailing separator, so look at the raw argument to
        # recognise "results/" as a directory that may not exist yet.
        separators = tuple(s for s in (os.sep, os.altsep) if s)
        if out.is_dir() or str(output_path).endswith(separators):
            out.mkdir(parents=True, exist_ok=True)
            filepath = out / f"chat_eval_{timestamp}.json"
        else:
            out.parent.mkdir(parents=True, exist_ok=True)
            filepath = out

        # Build serializable payload
        entries: list[dict] = []
        for r in results:
            entry: dict[str, Any] = {
                "query": r.test_case.query,
                "creator": r.test_case.creator,
                "personality_weight": r.test_case.personality_weight,
                "category": r.test_case.category,
                "description": r.test_case.description,
                "response_length": len(r.response),
                "source_count": len(r.sources),
                "cascade_tier": r.cascade_tier,
                "latency_seconds": r.latency_seconds,
            }

            if r.request_error:
                entry["error"] = r.request_error
            elif r.score:
                entry["scores"] = r.score.scores
                entry["composite"] = r.score.composite
                entry["justifications"] = r.score.justifications
                entry["scoring_time"] = r.score.elapsed_seconds
                if r.score.error:
                    entry["scoring_error"] = r.score.error

            entries.append(entry)

        # Summary stats: a failed scoring run carries composite 0.0 and no
        # scores, so it must not be averaged in as if it were a real zero.
        scored = [
            e for e in entries
            if "composite" in e and "scoring_error" not in e
        ]
        avg_composite = (
            sum(e["composite"] for e in scored) / len(scored)
            if scored else 0.0
        )
        dim_avgs: dict[str, float] = {}
        for dim in CHAT_DIMENSIONS:
            vals = [e["scores"][dim] for e in scored if dim in e.get("scores", {})]
            dim_avgs[dim] = round(sum(vals) / len(vals), 3) if vals else 0.0

        payload = {
            "timestamp": timestamp,
            "total_queries": len(results),
            "scored_queries": len(scored),
            "errors": len(results) - len(scored),
            "average_composite": round(avg_composite, 3),
            "dimension_averages": dim_avgs,
            "results": entries,
        }

        filepath.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return str(filepath)

    @staticmethod
    def print_summary(results: list[ChatEvalResult]) -> None:
        """Print a summary table of evaluation results.

        Lists one row per successfully scored case, a row of averages, and
        then every case that had a request or scoring error.
        """
        rule = "=" * 72
        print("\n" + rule)
        print(" CHAT EVALUATION SUMMARY")
        print(rule)

        scored = [r for r in results if r.score and not r.score.error and not r.request_error]
        errored = [r for r in results if r.request_error or (r.score and r.score.error)]

        def print_errors(heading: str) -> None:
            # Shared by the "nothing scored" and the normal path.
            print(heading)
            for r in errored:
                err = r.request_error or (r.score.error if r.score else "unknown")
                print(f" - {r.test_case.query[:50]}: {err}")

        if not scored:
            print("\n No successfully scored responses.\n")
            if errored:
                print_errors(f" Errors: {len(errored)}")
            print(rule + "\n")
            return

        # Header
        print(f"\n {'Category':<12s} {'Query':<30s} {'Comp':>5s} {'Cite':>5s} {'Struct':>6s} {'Domain':>6s} {'Ground':>6s} {'Person':>6s}")
        print(f" {'─'*12} {'─'*30} {'─'*5} {'─'*5} {'─'*6} {'─'*6} {'─'*6} {'─'*6}")

        # Every member of `scored` has a non-None score by construction.
        rows = [(r.test_case, r.score) for r in scored]
        for case, s in rows:
            q = case.query[:30]
            cat = case.category[:12]
            print(
                f" {cat:<12s} {q:<30s} "
                f"{s.composite:5.2f} "
                f"{s.citation_accuracy:5.2f} "
                f"{s.response_structure:6.2f} "
                f"{s.domain_expertise:6.2f} "
                f"{s.source_grounding:6.2f} "
                f"{s.personality_fidelity:6.2f}"
            )

        # Averages
        avg_comp = sum(s.composite for _, s in rows) / len(rows)
        avg_dims = {
            dim: sum(s.scores.get(dim, 0.0) for _, s in rows) / len(rows)
            for dim in CHAT_DIMENSIONS
        }

        print(f"\n {'AVERAGE':<12s} {'':30s} "
              f"{avg_comp:5.2f} "
              f"{avg_dims['citation_accuracy']:5.2f} "
              f"{avg_dims['response_structure']:6.2f} "
              f"{avg_dims['domain_expertise']:6.2f} "
              f"{avg_dims['source_grounding']:6.2f} "
              f"{avg_dims['personality_fidelity']:6.2f}")

        if errored:
            print_errors(f"\n Errors: {len(errored)}")

        print(rule + "\n")
The personality_weight (0.0 = encyclopedic, >0 = creator voice expected) +5. The creator_name (if any) + +Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0: + +**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources +- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations +- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers +- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims + +**response_structure** — Response is concise, well-organized, uses appropriate formatting +- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded) +- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help +- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan + +**domain_expertise** — Music production terminology used naturally and correctly +- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer +- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused +- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly + +**source_grounding** — Claims are backed by provided sources, no fabrication +- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources) +- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources +- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source + +**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight +- If 
@dataclass
class ChatScoreResult:
    """Outcome of scoring a chat response across quality dimensions."""

    scores: dict[str, float] = field(default_factory=dict)  # dimension -> 0.0..1.0
    composite: float = 0.0  # mean of the dimension scores, rounded to 3 places
    justifications: dict[str, str] = field(default_factory=dict)  # dimension -> judge text
    elapsed_seconds: float = 0.0  # wall time of the judge call
    error: str | None = None  # set when the judge call or its parsing failed

    # Convenience properties: each returns 0.0 when the dimension is absent.
    @property
    def citation_accuracy(self) -> float:
        return self.scores.get("citation_accuracy", 0.0)

    @property
    def response_structure(self) -> float:
        return self.scores.get("response_structure", 0.0)

    @property
    def domain_expertise(self) -> float:
        return self.scores.get("domain_expertise", 0.0)

    @property
    def source_grounding(self) -> float:
        return self.scores.get("source_grounding", 0.0)

    @property
    def personality_fidelity(self) -> float:
        return self.scores.get("personality_fidelity", 0.0)


class ChatScoreRunner:
    """Scores chat responses using LLM-as-judge evaluation."""

    def __init__(self, client: LLMClient) -> None:
        """Keep the LLM client used to call the judge."""
        self.client = client

    def score_response(
        self,
        query: str,
        response: str,
        sources: list[dict],
        personality_weight: float = 0.0,
        creator_name: str | None = None,
    ) -> ChatScoreResult:
        """Score a single chat response against the 5 chat quality dimensions.

        Parameters
        ----------
        query:
            The user's original query.
        response:
            The assistant's accumulated response text.
        sources:
            List of source citation dicts (as emitted by the SSE sources event).
        personality_weight:
            0.0 = encyclopedic mode, >0 = personality mode.
        creator_name:
            Creator name, if this was a creator-scoped query.

        Returns
        -------
        ChatScoreResult with per-dimension scores. Connection and timeout
        failures of the judge call, and judge output that is not a JSON
        object, are reported through ``ChatScoreResult.error`` rather than
        raised. Other exceptions from the client propagate.
        """
        sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"

        user_prompt = (
            f"## User Query\n\n{query}\n\n"
            f"## Assistant Response\n\n{response}\n\n"
            f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
            f"## Metadata\n\n"
            f"- personality_weight: {personality_weight}\n"
            f"- creator_name: {creator_name or '(none)'}\n\n"
            f"Score this chat response across all 5 dimensions."
        )

        t0 = time.monotonic()
        try:
            from pydantic import BaseModel as _BM
            # NOTE(review): response_model is the bare BaseModel and the
            # result is stringified below — presumably LLMClient.complete
            # returns raw text in this mode; confirm against LLMClient.
            resp = self.client.complete(
                system_prompt=CHAT_RUBRIC,
                user_prompt=user_prompt,
                response_model=_BM,
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Cannot reach LLM judge. Error: {exc}",
            )

        # Judges frequently wrap their JSON in a Markdown code fence even
        # when told not to; unwrap it before parsing.
        raw_text = self._strip_code_fence(str(resp))
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
            )

        return self._parse_scores(parsed, elapsed)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return *text* stripped of whitespace and of one enclosing ``` fence.

        An optional language tag on the opening fence line (e.g. ``json``) is
        dropped. Text that is not fenced is returned stripped but otherwise
        unchanged.
        """
        stripped = text.strip()
        fence = "```"
        if len(stripped) < 2 * len(fence):
            return stripped
        if not (stripped.startswith(fence) and stripped.endswith(fence)):
            return stripped

        inner = stripped[len(fence):-len(fence)]
        first_line, newline, rest = inner.partition("\n")
        tag = first_line.strip()
        # Only discard the first line when it is empty or looks like a
        # language tag; otherwise it is already part of the payload.
        if newline and (not tag or tag.isalnum()):
            return rest.strip()
        return inner.strip()

    @staticmethod
    def _coerce_score(raw: object) -> float | None:
        """Convert a judge value to a score in [0.0, 1.0], or None if invalid.

        Booleans, non-numeric values, NaN and infinities are invalid. NaN
        needs the explicit check because ``max(0.0, min(1.0, nan))`` evaluates
        to 1.0.
        """
        import math

        if isinstance(raw, bool):
            return None
        try:
            val = float(raw)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return None
        if not math.isfinite(val):
            return None
        return max(0.0, min(1.0, val))

    def _parse_scores(self, parsed: object, elapsed: float) -> ChatScoreResult:
        """Extract and validate scores from parsed JSON judge response.

        A response that is not a JSON object yields an error result. Within
        an object, a missing or invalid dimension scores 0.0 and gets an
        explanatory justification; valid values are clamped to [0.0, 1.0].
        The composite is the mean over all dimensions.
        """
        if not isinstance(parsed, dict):
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=(
                    "Malformed judge response: expected a JSON object, "
                    f"got {type(parsed).__name__}"
                ),
            )

        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}

        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}

        for dim in CHAT_DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in chat judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue

            val = self._coerce_score(raw)
            if val is None:
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue

            scores[dim] = val
            justifications[dim] = str(raw_justifications.get(dim, ""))

        composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0

        return ChatScoreResult(
            scores=scores,
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def print_report(self, result: ChatScoreResult, query: str = "") -> None:
        """Print a formatted chat scoring report to stdout.

        Shows the (truncated) query when given, then either the error or one
        section per dimension with a 20-cell bar and the wrapped
        justification, followed by the composite score and elapsed time.
        """
        import textwrap

        print("\n" + "=" * 60)
        print(" CHAT QUALITY SCORE REPORT")
        if query:
            print(f" Query: {query[:60]}{'...' if len(query) > 60 else ''}")
        print("=" * 60)

        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return

        for dim in CHAT_DIMENSIONS:
            score = result.scores.get(dim, 0.0)
            filled = int(score * 20)
            bar = "█" * filled + "░" * (20 - filled)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            if justification:
                # Greedy wrap at 56 columns; over-long words stay whole.
                wrapped = textwrap.wrap(
                    justification,
                    width=56,
                    break_long_words=False,
                    break_on_hyphens=False,
                )
                for line in wrapped:
                    print(f" {line}")

        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")
+ creator: null + personality_weight: 0.0 + category: technical + description: "Common sidechain compression setup — expects specific settings (ratio, attack, release)" + + - query: "What are the best EQ settings for cleaning up a muddy vocal recording?" + creator: null + personality_weight: 0.0 + category: technical + description: "Vocal EQ technique — expects frequency ranges, Q values, cut/boost guidance" + + # ── Conceptual (2) ───────────────────────────────────────────────── + - query: "What is the difference between parallel compression and serial compression, and when should I use each?" + creator: null + personality_weight: 0.0 + category: conceptual + description: "Conceptual comparison — expects clear definitions, use cases, pros/cons" + + - query: "How does sample rate affect sound quality in music production?" + creator: null + personality_weight: 0.0 + category: conceptual + description: "Audio fundamentals — expects Nyquist, aliasing, practical guidance" + + # ── Creator-specific: encyclopedic (2) ────────────────────────────── + - query: "How does this creator approach sound design for bass sounds?" + creator: "KEOTA" + personality_weight: 0.0 + category: creator_encyclopedic + description: "Creator-scoped query at weight=0 — should be neutral/encyclopedic about KEOTA's techniques" + + - query: "What mixing techniques does this creator recommend for achieving width in a mix?" + creator: "Mr. Bill" + personality_weight: 0.0 + category: creator_encyclopedic + description: "Creator-scoped query at weight=0 — neutral tone about Mr. Bill's approach" + + # ── Creator-specific: personality (2) ─────────────────────────────── + - query: "How does this creator approach sound design for bass sounds?" 
+ creator: "KEOTA" + personality_weight: 0.7 + category: creator_personality + description: "Same query as above but at weight=0.7 — should reflect KEOTA's voice and teaching style" + + - query: "What mixing techniques does this creator recommend for achieving width in a mix?" + creator: "Mr. Bill" + personality_weight: 0.7 + category: creator_personality + description: "Same query as above but at weight=0.7 — should reflect Mr. Bill's voice" + + # ── Cross-creator (2) ────────────────────────────────────────────── + - query: "What are the different approaches to layering synth sounds across creators?" + creator: null + personality_weight: 0.0 + category: cross_creator + description: "Cross-creator comparison — should cite multiple creators' techniques" + + - query: "How do different producers approach drum processing and what plugins do they prefer?" + creator: null + personality_weight: 0.0 + category: cross_creator + description: "Cross-creator comparison on drums — expects multiple perspectives with citations"