test: Created chat-specific LLM-as-judge scorer (5 dimensions), SSE-par…
- "backend/pipeline/quality/chat_scorer.py" - "backend/pipeline/quality/chat_eval.py" - "backend/pipeline/quality/fixtures/chat_test_suite.yaml" - "backend/pipeline/quality/__main__.py" GSD-Task: S09/T01
This commit is contained in:
parent
183d852f31
commit
90bb90e989
4 changed files with 779 additions and 0 deletions
|
|
@ -18,6 +18,8 @@ from pathlib import Path
|
|||
from config import get_settings
|
||||
from pipeline.llm_client import LLMClient
|
||||
|
||||
from .chat_eval import ChatEvalRunner
|
||||
from .chat_scorer import ChatScoreRunner
|
||||
from .fitness import FitnessRunner
|
||||
from .optimizer import OptimizationLoop, OptimizationResult
|
||||
from .scorer import DIMENSIONS, STAGE_CONFIGS, ScoreRunner
|
||||
|
|
@ -260,6 +262,36 @@ def main() -> int:
|
|||
help="Write the winning prompt back to the stage's prompt file (backs up the original first)",
|
||||
)
|
||||
|
||||
# -- chat_eval subcommand --
|
||||
chat_parser = sub.add_parser(
|
||||
"chat_eval",
|
||||
help="Evaluate chat quality across a test suite of queries",
|
||||
)
|
||||
chat_parser.add_argument(
|
||||
"--suite",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to a chat test suite YAML/JSON file",
|
||||
)
|
||||
chat_parser.add_argument(
|
||||
"--base-url",
|
||||
type=str,
|
||||
default="http://localhost:8096",
|
||||
help="Chat API base URL (default: http://localhost:8096)",
|
||||
)
|
||||
chat_parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="backend/pipeline/quality/results/",
|
||||
help="Output path for results JSON (default: backend/pipeline/quality/results/)",
|
||||
)
|
||||
chat_parser.add_argument(
|
||||
"--timeout",
|
||||
type=float,
|
||||
default=120.0,
|
||||
help="Request timeout in seconds (default: 120)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.command is None:
|
||||
|
|
@ -281,6 +313,9 @@ def main() -> int:
|
|||
if args.command == "apply":
|
||||
return _run_apply(args)
|
||||
|
||||
if args.command == "chat_eval":
|
||||
return _run_chat_eval(args)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
|
|
@ -558,5 +593,54 @@ def _run_apply(args: argparse.Namespace) -> int:
|
|||
return 0 if success else 1
|
||||
|
||||
|
||||
def _run_chat_eval(args: argparse.Namespace) -> int:
|
||||
"""Execute the chat_eval subcommand — evaluate chat quality across a test suite."""
|
||||
suite_path = Path(args.suite)
|
||||
if not suite_path.exists():
|
||||
print(f"Error: suite file not found: {args.suite}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Load test cases
|
||||
try:
|
||||
cases = ChatEvalRunner.load_suite(suite_path)
|
||||
except Exception as exc:
|
||||
print(f"Error loading test suite: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if not cases:
|
||||
print("Error: test suite contains no queries", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
print(f"\n Chat Evaluation: {len(cases)} queries from {suite_path}")
|
||||
print(f" Endpoint: {args.base_url}")
|
||||
|
||||
# Build scorer and runner
|
||||
settings = get_settings()
|
||||
client = LLMClient(settings)
|
||||
scorer = ChatScoreRunner(client)
|
||||
runner = ChatEvalRunner(
|
||||
scorer=scorer,
|
||||
base_url=args.base_url,
|
||||
timeout=args.timeout,
|
||||
)
|
||||
|
||||
# Execute
|
||||
results = runner.run_suite(cases)
|
||||
|
||||
# Print summary
|
||||
runner.print_summary(results)
|
||||
|
||||
# Write results
|
||||
try:
|
||||
json_path = runner.write_results(results, args.output)
|
||||
print(f" Results written to: {json_path}")
|
||||
except OSError as exc:
|
||||
print(f" Warning: failed to write results: {exc}", file=sys.stderr)
|
||||
|
||||
# Exit code: 0 if at least one scored, 1 if all errored
|
||||
scored = [r for r in results if r.score and not r.score.error and not r.request_error]
|
||||
return 0 if scored else 1
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
|
|
|||
352
backend/pipeline/quality/chat_eval.py
Normal file
352
backend/pipeline/quality/chat_eval.py
Normal file
|
|
@ -0,0 +1,352 @@
|
|||
"""Chat evaluation harness — sends queries to the live chat endpoint, scores responses.
|
||||
|
||||
Loads a test suite (YAML or JSON), calls the chat HTTP endpoint for each query,
|
||||
parses SSE events to collect response text and sources, then scores each using
|
||||
ChatScoreRunner. Writes results to a JSON file.
|
||||
|
||||
Usage:
|
||||
python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml
|
||||
python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml --base-url http://ub01:8096
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
from pipeline.llm_client import LLMClient
|
||||
from pipeline.quality.chat_scorer import CHAT_DIMENSIONS, ChatScoreResult, ChatScoreRunner
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Base URL used by ChatEvalRunner when the caller does not pass one.
_DEFAULT_BASE_URL = "http://localhost:8096"
|
||||
# Path of the chat endpoint; appended to the base URL to build the request URL.
_CHAT_ENDPOINT = "/api/chat"
|
||||
# Default timeout handed to the HTTP client for each chat request.
_REQUEST_TIMEOUT = 120.0 # seconds — LLM streaming can be slow
|
||||
|
||||
|
||||
@dataclass
class ChatTestCase:
    """One query of a chat evaluation suite plus the settings sent with it.

    Attributes:
        query: Question text posted to the chat endpoint.
        creator: Creator the query is scoped to; ``None`` for an unscoped query.
        personality_weight: Personality weight requested for the answer
            (0.0 by default).
        category: Free-form label shown next to the query in reports.
        description: Human-readable note about what the case exercises.
    """

    query: str
    creator: str | None = None
    personality_weight: float = 0.0
    category: str = "general"
    description: str = ""
|
||||
|
||||
|
||||
@dataclass
class ChatEvalResult:
    """Everything recorded while evaluating a single test case.

    Attributes:
        test_case: The case that was run.
        response: Response text accumulated from the endpoint's token events.
        sources: Source dicts received from the endpoint.
        cascade_tier: Value of ``cascade_tier`` reported by the endpoint, if any.
        score: Judge result, or ``None`` when the response was never scored.
        request_error: Description of a failed or empty request, else ``None``.
        latency_seconds: Wall-clock duration of the endpoint call.
    """

    test_case: ChatTestCase
    response: str = ""
    sources: list[dict] = field(default_factory=list)
    cascade_tier: str = ""
    score: ChatScoreResult | None = None
    request_error: str | None = None
    latency_seconds: float = 0.0
|
||||
|
||||
|
||||
class ChatEvalRunner:
    """Run a chat evaluation suite against a live chat endpoint.

    For each test case the runner POSTs the query to the chat endpoint,
    reassembles the server-sent event stream into response text, sources and
    cascade tier, and hands the result to the scorer.
    """

    def __init__(
        self,
        scorer: ChatScoreRunner,
        base_url: str = _DEFAULT_BASE_URL,
        timeout: float = _REQUEST_TIMEOUT,
    ) -> None:
        """Store the scorer and the HTTP settings.

        Args:
            scorer: Judge used to score every non-empty response.
            base_url: Chat API base URL; trailing slashes are removed.
            timeout: Timeout in seconds passed to the HTTP client.
        """
        self.scorer = scorer
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def load_suite(path: str | Path) -> list[ChatTestCase]:
        """Load test cases from a YAML or JSON file.

        Files whose suffix is ``.yaml``/``.yml`` (any letter case) are parsed
        with PyYAML's ``safe_load``; every other file is parsed as JSON.

        Expected format (YAML):
            queries:
              - query: "How do I sidechain a bass?"
                creator: null
                personality_weight: 0.0
                category: technical
                description: "Basic sidechain compression question"

        Only ``query`` is required. Optional fields that are missing or
        ``null`` fall back to the ``ChatTestCase`` defaults. An empty document
        or an empty/``null`` ``queries`` entry yields an empty list.

        Raises:
            ImportError: A YAML suite was given but PyYAML is not installed.
            ValueError: The document is not a mapping, ``queries`` is not a
                list, or an entry is not a mapping with a string ``query``.
            OSError: The file cannot be read. Parser errors also propagate.
        """
        filepath = Path(path)
        text = filepath.read_text(encoding="utf-8")

        if filepath.suffix.lower() in (".yaml", ".yml"):
            try:
                import yaml
            except ImportError as exc:
                raise ImportError(
                    "PyYAML is required to load YAML test suites. "
                    "Install with: pip install pyyaml"
                ) from exc
            data = yaml.safe_load(text)
        else:
            data = json.loads(text)

        # An empty YAML document parses to None: that is simply "no queries".
        if data is None:
            return []
        if not isinstance(data, dict):
            raise ValueError(
                "Test suite must be a mapping with a 'queries' list, "
                f"got {type(data).__name__}"
            )

        # `or []` also covers an explicit `queries: null`.
        queries = data.get("queries") or []
        if not isinstance(queries, list):
            raise ValueError(
                f"'queries' must be a list, got {type(queries).__name__}"
            )

        cases: list[ChatTestCase] = []
        for index, q in enumerate(queries, 1):
            query = q.get("query") if isinstance(q, dict) else None
            if not isinstance(query, str):
                raise ValueError(
                    f"Suite entry #{index} must be a mapping with a string 'query' field"
                )
            weight = q.get("personality_weight")
            cases.append(ChatTestCase(
                query=query,
                creator=q.get("creator"),
                # Explicit nulls fall back to the defaults instead of crashing
                # here (float(None)) or later when the value is sliced/printed.
                personality_weight=float(weight) if weight is not None else 0.0,
                category=q.get("category") or "general",
                description=q.get("description") or "",
            ))
        return cases

    def run_suite(self, cases: list[ChatTestCase]) -> list[ChatEvalResult]:
        """Execute all test cases sequentially, scoring each response.

        Prints one progress line per case followed by its outcome, and
        returns the results in the same order as ``cases``.
        """
        results: list[ChatEvalResult] = []
        total = len(cases)

        for i, case in enumerate(cases, 1):
            # Only mark the query as truncated when it actually was.
            ellipsis = "..." if len(case.query) > 60 else ""
            print(f"\n [{i}/{total}] {case.category}: {case.query[:60]}{ellipsis}")
            result = self._run_single(case)
            results.append(result)

            if result.request_error:
                print(f" ✗ Request error: {result.request_error}")
            elif result.score and result.score.error:
                print(f" ✗ Scoring error: {result.score.error}")
            elif result.score:
                print(f" ✓ Composite: {result.score.composite:.3f} "
                      f"(latency: {result.latency_seconds:.1f}s)")

        return results

    def _run_single(self, case: ChatTestCase) -> ChatEvalResult:
        """Execute a single test case: call endpoint, parse SSE, score.

        Failures never propagate: a failed or empty request is recorded in
        ``request_error`` and a failed judge call in ``score.error``, so one
        bad case cannot discard the results already collected for the suite.
        """
        eval_result = ChatEvalResult(test_case=case)

        # Call the chat endpoint; latency is recorded on success and failure.
        t0 = time.monotonic()
        try:
            response_text, sources, cascade_tier = self._call_chat_endpoint(case)
        except Exception as exc:
            eval_result.latency_seconds = round(time.monotonic() - t0, 2)
            eval_result.request_error = str(exc)
            logger.error("chat_eval_request_error query=%r error=%s", case.query, exc)
            return eval_result
        eval_result.latency_seconds = round(time.monotonic() - t0, 2)

        eval_result.response = response_text
        eval_result.sources = sources
        eval_result.cascade_tier = cascade_tier

        if not response_text:
            eval_result.request_error = "Empty response from chat endpoint"
            return eval_result

        # Score the response. The scorer reports connection problems itself;
        # anything else it raises is captured as a scoring error.
        try:
            eval_result.score = self.scorer.score_response(
                query=case.query,
                response=response_text,
                sources=sources,
                personality_weight=case.personality_weight,
                creator_name=case.creator,
            )
        except Exception as exc:
            logger.exception("chat_eval_scoring_error query=%r", case.query)
            eval_result.score = ChatScoreResult(error=f"Scoring failed: {exc}")

        return eval_result

    def _call_chat_endpoint(
        self, case: ChatTestCase
    ) -> tuple[str, list[dict], str]:
        """Call the chat SSE endpoint and parse the event stream.

        The JSON request body always contains ``query``; ``creator`` is added
        when set and ``personality_weight`` when greater than zero.

        Returns (accumulated_text, sources_list, cascade_tier).

        Raises:
            RuntimeError: The stream contained an ``error`` event.
            httpx.HTTPStatusError: The endpoint answered with an error status.
        """
        url = f"{self.base_url}{_CHAT_ENDPOINT}"
        payload: dict[str, Any] = {"query": case.query}
        if case.creator:
            payload["creator"] = case.creator
        if case.personality_weight > 0:
            payload["personality_weight"] = case.personality_weight

        sources: list[dict] = []
        tokens: list[str] = []
        cascade_tier = ""

        with httpx.Client(timeout=self.timeout) as client:
            with client.stream("POST", url, json=payload) as resp:
                resp.raise_for_status()

                buffer = ""
                for chunk in resp.iter_text():
                    # Normalise CRLF so the blank-line event separator is
                    # always "\n\n". Normalising the whole buffer (not just
                    # the chunk) also repairs a "\r\n" split across chunks.
                    buffer = (buffer + chunk).replace("\r\n", "\n")

                    # Consume every complete event; keep the partial tail.
                    while "\n\n" in buffer:
                        event_block, buffer = buffer.split("\n\n", 1)
                        event_type, event_data = self._parse_sse_event(event_block)

                        if event_type == "sources":
                            sources = event_data if isinstance(event_data, list) else []
                        elif event_type == "token":
                            tokens.append(
                                event_data if isinstance(event_data, str) else str(event_data)
                            )
                        elif event_type == "done":
                            if isinstance(event_data, dict):
                                cascade_tier = event_data.get("cascade_tier", "")
                        elif event_type == "error":
                            if isinstance(event_data, dict):
                                msg = event_data.get("message", str(event_data))
                            else:
                                msg = str(event_data)
                            raise RuntimeError(f"Chat endpoint returned error: {msg}")

        return "".join(tokens), sources, cascade_tier

    @staticmethod
    def _parse_sse_event(block: str) -> tuple[str, Any]:
        """Parse a single SSE event block into (event_type, data).

        ``event`` and ``data`` fields are recognised with or without a space
        after the colon; multiple ``data`` lines are joined with newlines.
        The data is JSON-decoded when possible, otherwise returned as text.
        For ``token`` events only a JSON *string* is accepted as decoded
        data, so plain-text tokens that merely look like JSON (``5``,
        ``true``) are returned verbatim.
        """
        event_type = ""
        data_lines: list[str] = []

        # The block is deliberately not stripped: trailing whitespace on a
        # data line can be the payload itself (e.g. a single-space token).
        for line in block.splitlines():
            name, colon, value = line.partition(":")
            if not colon:
                continue
            # A single space after the colon is a separator, not payload.
            if value.startswith(" "):
                value = value[1:]
            if name == "event":
                event_type = value.strip()
            elif name == "data":
                data_lines.append(value)

        raw_data = "\n".join(data_lines)
        try:
            parsed = json.loads(raw_data)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return event_type, raw_data  # plain text token

        if event_type == "token" and not isinstance(parsed, str):
            return event_type, raw_data
        return event_type, parsed

    @staticmethod
    def write_results(
        results: list[ChatEvalResult],
        output_path: str | Path,
    ) -> str:
        """Write evaluation results to a JSON file. Returns the path.

        ``output_path`` is treated as a directory when it already is one or
        when it is given with a trailing path separator; the directory is
        created if needed and a ``chat_eval_<UTC timestamp>.json`` file is
        written into it. Any other path is used as the file name itself.

        Results whose judge call failed are reported with ``scoring_error``
        and are excluded from ``scored_queries`` and from all averages.

        Raises:
            OSError: The directory or file cannot be created or written.
        """
        out = Path(output_path)
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

        # Path() drops a trailing separator, so look at the raw argument to
        # recognise a directory that does not exist yet.
        wants_directory = out.is_dir() or str(output_path).endswith(("/", "\\"))
        if wants_directory:
            out.mkdir(parents=True, exist_ok=True)
            filepath = out / f"chat_eval_{timestamp}.json"
        else:
            out.parent.mkdir(parents=True, exist_ok=True)
            filepath = out

        # Build serializable payload
        entries: list[dict] = []
        for r in results:
            entry: dict[str, Any] = {
                "query": r.test_case.query,
                "creator": r.test_case.creator,
                "personality_weight": r.test_case.personality_weight,
                "category": r.test_case.category,
                "description": r.test_case.description,
                "response_length": len(r.response),
                "source_count": len(r.sources),
                "cascade_tier": r.cascade_tier,
                "latency_seconds": r.latency_seconds,
            }

            if r.request_error:
                entry["error"] = r.request_error
            elif r.score:
                entry["scores"] = r.score.scores
                entry["composite"] = r.score.composite
                entry["justifications"] = r.score.justifications
                entry["scoring_time"] = r.score.elapsed_seconds
                if r.score.error:
                    entry["scoring_error"] = r.score.error

            entries.append(entry)

        # Summary stats: only cleanly scored entries count, matching the
        # definition of "scored" used by print_summary.
        scored = [
            e for e in entries
            if "composite" in e and "scoring_error" not in e
        ]
        avg_composite = (
            sum(e["composite"] for e in scored) / len(scored) if scored else 0.0
        )
        dim_avgs: dict[str, float] = {}
        for dim in CHAT_DIMENSIONS:
            vals = [e["scores"][dim] for e in scored if dim in e.get("scores", {})]
            dim_avgs[dim] = round(sum(vals) / len(vals), 3) if vals else 0.0

        payload = {
            "timestamp": timestamp,
            "total_queries": len(results),
            "scored_queries": len(scored),
            "errors": len(results) - len(scored),
            "average_composite": round(avg_composite, 3),
            "dimension_averages": dim_avgs,
            "results": entries,
        }

        filepath.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return str(filepath)

    @staticmethod
    def _print_errors(errored: list[ChatEvalResult], heading_prefix: str = "") -> None:
        """Print the error count and one line per failed result, if any."""
        if not errored:
            return
        print(f"{heading_prefix} Errors: {len(errored)}")
        for r in errored:
            err = r.request_error or (r.score.error if r.score else "unknown")
            print(f" - {r.test_case.query[:50]}: {err}")

    @staticmethod
    def print_summary(results: list[ChatEvalResult]) -> None:
        """Print a summary table of evaluation results.

        One row per cleanly scored result, then the column averages, then
        the list of request and scoring errors.
        """
        print("\n" + "=" * 72)
        print(" CHAT EVALUATION SUMMARY")
        print("=" * 72)

        scored = [r for r in results if r.score and not r.score.error and not r.request_error]
        errored = [r for r in results if r.request_error or (r.score and r.score.error)]

        if not scored:
            print("\n No successfully scored responses.\n")
            ChatEvalRunner._print_errors(errored)
            print("=" * 72 + "\n")
            return

        # Header
        print(f"\n {'Category':<12s} {'Query':<30s} {'Comp':>5s} {'Cite':>5s} {'Struct':>6s} {'Domain':>6s} {'Ground':>6s} {'Person':>6s}")
        print(f" {'─'*12} {'─'*30} {'─'*5} {'─'*5} {'─'*6} {'─'*6} {'─'*6} {'─'*6}")

        for r in scored:
            s = r.score
            assert s is not None  # guaranteed by the `scored` filter; narrows the type
            q = r.test_case.query[:30]
            cat = r.test_case.category[:12]
            print(
                f" {cat:<12s} {q:<30s} "
                f"{s.composite:5.2f} "
                f"{s.citation_accuracy:5.2f} "
                f"{s.response_structure:6.2f} "
                f"{s.domain_expertise:6.2f} "
                f"{s.source_grounding:6.2f} "
                f"{s.personality_fidelity:6.2f}"
            )

        # Averages
        avg_comp = sum(r.score.composite for r in scored) / len(scored)
        avg_dims = {
            dim: sum(r.score.scores.get(dim, 0.0) for r in scored) / len(scored)
            for dim in CHAT_DIMENSIONS
        }

        print(f"\n {'AVERAGE':<12s} {'':30s} "
              f"{avg_comp:5.2f} "
              f"{avg_dims['citation_accuracy']:5.2f} "
              f"{avg_dims['response_structure']:6.2f} "
              f"{avg_dims['domain_expertise']:6.2f} "
              f"{avg_dims['source_grounding']:6.2f} "
              f"{avg_dims['personality_fidelity']:6.2f}")

        ChatEvalRunner._print_errors(errored, heading_prefix="\n")

        print("=" * 72 + "\n")
|
||||
271
backend/pipeline/quality/chat_scorer.py
Normal file
271
backend/pipeline/quality/chat_scorer.py
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
|
||||
|
||||
Scores chat responses across 5 dimensions:
|
||||
- citation_accuracy: Are citations real and correctly numbered?
|
||||
- response_structure: Concise, well-organized, uses appropriate formatting?
|
||||
- domain_expertise: Music production terminology used naturally?
|
||||
- source_grounding: Claims backed by provided sources, no fabrication?
|
||||
- personality_fidelity: At weight>0, response reflects creator voice?
|
||||
|
||||
Run via: python -m pipeline.quality chat_eval --suite <path>
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import openai
|
||||
|
||||
from pipeline.llm_client import LLMClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Names of the scoring dimensions. The judge response is read once per name,
# and the composite score is the sum of the per-dimension scores divided by
# the length of this list.
CHAT_DIMENSIONS = [
|
||||
"citation_accuracy",
|
||||
"response_structure",
|
||||
"domain_expertise",
|
||||
"source_grounding",
|
||||
"personality_fidelity",
|
||||
]
|
||||
|
||||
CHAT_RUBRIC = """\
|
||||
You are an expert evaluator of AI chat response quality for a music production knowledge base.
|
||||
|
||||
You will be given:
|
||||
1. The user's query
|
||||
2. The assistant's response
|
||||
3. The numbered source citations that were provided to the assistant
|
||||
4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
|
||||
5. The creator_name (if any)
|
||||
|
||||
Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
|
||||
|
||||
**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
|
||||
- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
|
||||
- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
|
||||
- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
|
||||
|
||||
**response_structure** — Response is concise, well-organized, uses appropriate formatting
|
||||
- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
|
||||
- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
|
||||
- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
|
||||
|
||||
**domain_expertise** — Music production terminology used naturally and correctly
|
||||
- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
|
||||
- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
|
||||
- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
|
||||
|
||||
**source_grounding** — Claims are backed by provided sources, no fabrication
|
||||
- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
|
||||
- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
|
||||
- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
|
||||
|
||||
**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
|
||||
- If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
|
||||
- If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
|
||||
- If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
|
||||
- If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
|
||||
|
||||
Return ONLY a JSON object with this exact structure:
|
||||
{
|
||||
"citation_accuracy": <float 0.0-1.0>,
|
||||
"response_structure": <float 0.0-1.0>,
|
||||
"domain_expertise": <float 0.0-1.0>,
|
||||
"source_grounding": <float 0.0-1.0>,
|
||||
"personality_fidelity": <float 0.0-1.0>,
|
||||
"justifications": {
|
||||
"citation_accuracy": "<1-2 sentence justification>",
|
||||
"response_structure": "<1-2 sentence justification>",
|
||||
"domain_expertise": "<1-2 sentence justification>",
|
||||
"source_grounding": "<1-2 sentence justification>",
|
||||
"personality_fidelity": "<1-2 sentence justification>"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
class ChatScoreResult:
    """Judge verdict for one chat response.

    Attributes:
        scores: Score per dimension name.
        composite: Overall score for the response.
        justifications: Judge explanation per dimension name.
        elapsed_seconds: Time spent on the judge call.
        error: Description of a failed judge call, ``None`` on success.
    """

    scores: dict[str, float] = field(default_factory=dict)
    composite: float = 0.0
    justifications: dict[str, str] = field(default_factory=dict)
    elapsed_seconds: float = 0.0
    error: str | None = None

    def _dimension(self, name: str) -> float:
        """Return the score stored under ``name``, or 0.0 when absent."""
        return self.scores.get(name, 0.0)

    # Read-only shortcuts, one per scoring dimension.
    @property
    def citation_accuracy(self) -> float:
        return self._dimension("citation_accuracy")

    @property
    def response_structure(self) -> float:
        return self._dimension("response_structure")

    @property
    def domain_expertise(self) -> float:
        return self._dimension("domain_expertise")

    @property
    def source_grounding(self) -> float:
        return self._dimension("source_grounding")

    @property
    def personality_fidelity(self) -> float:
        return self._dimension("personality_fidelity")
|
||||
|
||||
|
||||
class ChatScoreRunner:
    """Scores chat responses using LLM-as-judge evaluation."""

    def __init__(self, client: LLMClient) -> None:
        """Keep the LLM client used to call the judge."""
        self.client = client

    def score_response(
        self,
        query: str,
        response: str,
        sources: list[dict],
        personality_weight: float = 0.0,
        creator_name: str | None = None,
    ) -> ChatScoreResult:
        """Score a single chat response against the 5 chat quality dimensions.

        Parameters
        ----------
        query:
            The user's original query.
        response:
            The assistant's accumulated response text.
        sources:
            List of source citation dicts (as emitted by the SSE sources event).
        personality_weight:
            0.0 = encyclopedic mode, >0 = personality mode.
        creator_name:
            Creator name, if this was a creator-scoped query.

        Returns
        -------
        ChatScoreResult with per-dimension scores. When the judge cannot be
        reached, or its reply is not a JSON object, the result carries only
        ``elapsed_seconds`` and ``error``.
        """
        sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"

        user_prompt = (
            f"## User Query\n\n{query}\n\n"
            f"## Assistant Response\n\n{response}\n\n"
            f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
            f"## Metadata\n\n"
            f"- personality_weight: {personality_weight}\n"
            f"- creator_name: {creator_name or '(none)'}\n\n"
            f"Score this chat response across all 5 dimensions."
        )

        t0 = time.monotonic()
        try:
            from pydantic import BaseModel as _BM
            resp = self.client.complete(
                system_prompt=CHAT_RUBRIC,
                user_prompt=user_prompt,
                response_model=_BM,
                modality="chat",
            )
            elapsed = round(time.monotonic() - t0, 2)
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            elapsed = round(time.monotonic() - t0, 2)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Cannot reach LLM judge. Error: {exc}",
            )

        raw_text = str(resp).strip()
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
            )

        # Valid JSON that is not an object (a list, number, string, ...)
        # cannot hold the dimensions; report it instead of crashing on .get().
        if not isinstance(parsed, dict):
            logger.error("Malformed chat judge response (not a JSON object): %.300s", raw_text)
            return ChatScoreResult(
                elapsed_seconds=elapsed,
                error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
            )

        return self._parse_scores(parsed, elapsed)

    def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
        """Extract and validate scores from parsed JSON judge response.

        Each dimension value is converted to float and clamped to
        [0.0, 1.0]. A missing, non-numeric or non-finite (NaN/infinite)
        value scores 0.0 and gets a placeholder justification. The composite
        is the mean over all dimensions, rounded to three decimals.
        """
        import math

        scores: dict[str, float] = {}
        justifications: dict[str, str] = {}

        raw_justifications = parsed.get("justifications", {})
        if not isinstance(raw_justifications, dict):
            raw_justifications = {}

        for dim in CHAT_DIMENSIONS:
            raw = parsed.get(dim)
            if raw is None:
                logger.warning("Missing dimension '%s' in chat judge response", dim)
                scores[dim] = 0.0
                justifications[dim] = "(missing from judge response)"
                continue

            try:
                val = float(raw)
            except (TypeError, ValueError):
                val = math.nan

            # NaN must be rejected explicitly: min(1.0, nan) evaluates to 1.0,
            # so clamping alone would turn it into a perfect score.
            if not math.isfinite(val):
                logger.warning("Invalid value for '%s': %r", dim, raw)
                scores[dim] = 0.0
                justifications[dim] = f"(invalid value: {raw!r})"
                continue

            scores[dim] = max(0.0, min(1.0, val))
            justifications[dim] = str(raw_justifications.get(dim, ""))

        composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0

        return ChatScoreResult(
            scores=scores,
            composite=round(composite, 3),
            justifications=justifications,
            elapsed_seconds=elapsed,
        )

    def print_report(self, result: ChatScoreResult, query: str = "") -> None:
        """Print a formatted chat scoring report to stdout.

        Shows the error when the result carries one; otherwise one section
        per dimension (score, 20-cell bar, wrapped justification) followed
        by the composite score and the scoring time.
        """
        print("\n" + "=" * 60)
        print(" CHAT QUALITY SCORE REPORT")
        if query:
            print(f" Query: {query[:60]}{'...' if len(query) > 60 else ''}")
        print("=" * 60)

        if result.error:
            print(f"\n ✗ Error: {result.error}\n")
            print("=" * 60 + "\n")
            return

        for dim in CHAT_DIMENSIONS:
            score = result.scores.get(dim, 0.0)
            filled = int(score * 20)
            bar = "█" * filled + "░" * (20 - filled)
            justification = result.justifications.get(dim, "")
            print(f"\n {dim.replace('_', ' ').title()}")
            print(f" Score: {score:.2f} {bar}")
            if justification:
                # Simple word wrap at ~56 chars
                words = justification.split()
                lines: list[str] = []
                current = ""
                for word in words:
                    if current and len(current) + len(word) + 1 > 56:
                        lines.append(current)
                        current = word
                    else:
                        current = f"{current} {word}" if current else word
                if current:
                    lines.append(current)
                for line in lines:
                    print(f" {line}")

        print("\n" + "-" * 60)
        print(f" Composite: {result.composite:.3f}")
        print(f" Time: {result.elapsed_seconds}s")
        print("=" * 60 + "\n")
|
||||
72
backend/pipeline/quality/fixtures/chat_test_suite.yaml
Normal file
72
backend/pipeline/quality/fixtures/chat_test_suite.yaml
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
# Chat quality evaluation test suite
|
||||
# 10 representative queries across 5 categories (2 each):
|
||||
# - technical: How-to questions about specific production techniques
|
||||
# - conceptual: Broader understanding questions about audio concepts
|
||||
# - creator_encyclopedic / creator_personality: Creator-scoped queries at personality weight 0.0 and 0.7
|
||||
# - cross_creator: Queries spanning multiple creators' approaches
|
||||
|
||||
queries:
|
||||
# ── Technical how-to (2) ────────────────────────────────────────────
|
||||
- query: "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?"
|
||||
creator: null
|
||||
personality_weight: 0.0
|
||||
category: technical
|
||||
description: "Common sidechain compression setup — expects specific settings (ratio, attack, release)"
|
||||
|
||||
- query: "What are the best EQ settings for cleaning up a muddy vocal recording?"
|
||||
creator: null
|
||||
personality_weight: 0.0
|
||||
category: technical
|
||||
description: "Vocal EQ technique — expects frequency ranges, Q values, cut/boost guidance"
|
||||
|
||||
# ── Conceptual (2) ─────────────────────────────────────────────────
|
||||
- query: "What is the difference between parallel compression and serial compression, and when should I use each?"
|
||||
creator: null
|
||||
personality_weight: 0.0
|
||||
category: conceptual
|
||||
description: "Conceptual comparison — expects clear definitions, use cases, pros/cons"
|
||||
|
||||
- query: "How does sample rate affect sound quality in music production?"
|
||||
creator: null
|
||||
personality_weight: 0.0
|
||||
category: conceptual
|
||||
description: "Audio fundamentals — expects Nyquist, aliasing, practical guidance"
|
||||
|
||||
# ── Creator-specific: encyclopedic (2) ──────────────────────────────
|
||||
- query: "How does this creator approach sound design for bass sounds?"
|
||||
creator: "KEOTA"
|
||||
personality_weight: 0.0
|
||||
category: creator_encyclopedic
|
||||
description: "Creator-scoped query at weight=0 — should be neutral/encyclopedic about KEOTA's techniques"
|
||||
|
||||
- query: "What mixing techniques does this creator recommend for achieving width in a mix?"
|
||||
creator: "Mr. Bill"
|
||||
personality_weight: 0.0
|
||||
category: creator_encyclopedic
|
||||
description: "Creator-scoped query at weight=0 — neutral tone about Mr. Bill's approach"
|
||||
|
||||
# ── Creator-specific: personality (2) ───────────────────────────────
|
||||
- query: "How does this creator approach sound design for bass sounds?"
|
||||
creator: "KEOTA"
|
||||
personality_weight: 0.7
|
||||
category: creator_personality
|
||||
description: "Same query as above but at weight=0.7 — should reflect KEOTA's voice and teaching style"
|
||||
|
||||
- query: "What mixing techniques does this creator recommend for achieving width in a mix?"
|
||||
creator: "Mr. Bill"
|
||||
personality_weight: 0.7
|
||||
category: creator_personality
|
||||
description: "Same query as above but at weight=0.7 — should reflect Mr. Bill's voice"
|
||||
|
||||
# ── Cross-creator (2) ──────────────────────────────────────────────
|
||||
- query: "What are the different approaches to layering synth sounds across creators?"
|
||||
creator: null
|
||||
personality_weight: 0.0
|
||||
category: cross_creator
|
||||
description: "Cross-creator comparison — should cite multiple creators' techniques"
|
||||
|
||||
- query: "How do different producers approach drum processing and what plugins do they prefer?"
|
||||
creator: null
|
||||
personality_weight: 0.0
|
||||
category: cross_creator
|
||||
description: "Cross-creator comparison on drums — expects multiple perspectives with citations"
|
||||
Loading…
Add table
Reference in a new issue