test: Created chat-specific LLM-as-judge scorer (5 dimensions), SSE-par…
- "backend/pipeline/quality/chat_scorer.py" - "backend/pipeline/quality/chat_eval.py" - "backend/pipeline/quality/fixtures/chat_test_suite.yaml" - "backend/pipeline/quality/__main__.py" GSD-Task: S09/T01
This commit is contained in:
parent
183d852f31
commit
90bb90e989
4 changed files with 779 additions and 0 deletions
|
|
@ -18,6 +18,8 @@ from pathlib import Path
|
||||||
from config import get_settings
|
from config import get_settings
|
||||||
from pipeline.llm_client import LLMClient
|
from pipeline.llm_client import LLMClient
|
||||||
|
|
||||||
|
from .chat_eval import ChatEvalRunner
|
||||||
|
from .chat_scorer import ChatScoreRunner
|
||||||
from .fitness import FitnessRunner
|
from .fitness import FitnessRunner
|
||||||
from .optimizer import OptimizationLoop, OptimizationResult
|
from .optimizer import OptimizationLoop, OptimizationResult
|
||||||
from .scorer import DIMENSIONS, STAGE_CONFIGS, ScoreRunner
|
from .scorer import DIMENSIONS, STAGE_CONFIGS, ScoreRunner
|
||||||
|
|
@ -260,6 +262,36 @@ def main() -> int:
|
||||||
help="Write the winning prompt back to the stage's prompt file (backs up the original first)",
|
help="Write the winning prompt back to the stage's prompt file (backs up the original first)",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# -- chat_eval subcommand --
|
||||||
|
chat_parser = sub.add_parser(
|
||||||
|
"chat_eval",
|
||||||
|
help="Evaluate chat quality across a test suite of queries",
|
||||||
|
)
|
||||||
|
chat_parser.add_argument(
|
||||||
|
"--suite",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Path to a chat test suite YAML/JSON file",
|
||||||
|
)
|
||||||
|
chat_parser.add_argument(
|
||||||
|
"--base-url",
|
||||||
|
type=str,
|
||||||
|
default="http://localhost:8096",
|
||||||
|
help="Chat API base URL (default: http://localhost:8096)",
|
||||||
|
)
|
||||||
|
chat_parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
type=str,
|
||||||
|
default="backend/pipeline/quality/results/",
|
||||||
|
help="Output path for results JSON (default: backend/pipeline/quality/results/)",
|
||||||
|
)
|
||||||
|
chat_parser.add_argument(
|
||||||
|
"--timeout",
|
||||||
|
type=float,
|
||||||
|
default=120.0,
|
||||||
|
help="Request timeout in seconds (default: 120)",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.command is None:
|
if args.command is None:
|
||||||
|
|
@ -281,6 +313,9 @@ def main() -> int:
|
||||||
if args.command == "apply":
|
if args.command == "apply":
|
||||||
return _run_apply(args)
|
return _run_apply(args)
|
||||||
|
|
||||||
|
if args.command == "chat_eval":
|
||||||
|
return _run_chat_eval(args)
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -558,5 +593,54 @@ def _run_apply(args: argparse.Namespace) -> int:
|
||||||
return 0 if success else 1
|
return 0 if success else 1
|
||||||
|
|
||||||
|
|
||||||
|
def _run_chat_eval(args: argparse.Namespace) -> int:
|
||||||
|
"""Execute the chat_eval subcommand — evaluate chat quality across a test suite."""
|
||||||
|
suite_path = Path(args.suite)
|
||||||
|
if not suite_path.exists():
|
||||||
|
print(f"Error: suite file not found: {args.suite}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Load test cases
|
||||||
|
try:
|
||||||
|
cases = ChatEvalRunner.load_suite(suite_path)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"Error loading test suite: {exc}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if not cases:
|
||||||
|
print("Error: test suite contains no queries", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
print(f"\n Chat Evaluation: {len(cases)} queries from {suite_path}")
|
||||||
|
print(f" Endpoint: {args.base_url}")
|
||||||
|
|
||||||
|
# Build scorer and runner
|
||||||
|
settings = get_settings()
|
||||||
|
client = LLMClient(settings)
|
||||||
|
scorer = ChatScoreRunner(client)
|
||||||
|
runner = ChatEvalRunner(
|
||||||
|
scorer=scorer,
|
||||||
|
base_url=args.base_url,
|
||||||
|
timeout=args.timeout,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Execute
|
||||||
|
results = runner.run_suite(cases)
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
runner.print_summary(results)
|
||||||
|
|
||||||
|
# Write results
|
||||||
|
try:
|
||||||
|
json_path = runner.write_results(results, args.output)
|
||||||
|
print(f" Results written to: {json_path}")
|
||||||
|
except OSError as exc:
|
||||||
|
print(f" Warning: failed to write results: {exc}", file=sys.stderr)
|
||||||
|
|
||||||
|
# Exit code: 0 if at least one scored, 1 if all errored
|
||||||
|
scored = [r for r in results if r.score and not r.score.error and not r.request_error]
|
||||||
|
return 0 if scored else 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
|
||||||
352
backend/pipeline/quality/chat_eval.py
Normal file
352
backend/pipeline/quality/chat_eval.py
Normal file
|
|
@ -0,0 +1,352 @@
|
||||||
|
"""Chat evaluation harness — sends queries to the live chat endpoint, scores responses.
|
||||||
|
|
||||||
|
Loads a test suite (YAML or JSON), calls the chat HTTP endpoint for each query,
|
||||||
|
parses SSE events to collect response text and sources, then scores each using
|
||||||
|
ChatScoreRunner. Writes results to a JSON file.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml
|
||||||
|
python -m pipeline.quality chat_eval --suite fixtures/chat_test_suite.yaml --base-url http://ub01:8096
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from pipeline.llm_client import LLMClient
|
||||||
|
from pipeline.quality.chat_scorer import CHAT_DIMENSIONS, ChatScoreResult, ChatScoreRunner
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_DEFAULT_BASE_URL = "http://localhost:8096"
|
||||||
|
_CHAT_ENDPOINT = "/api/chat"
|
||||||
|
_REQUEST_TIMEOUT = 120.0 # seconds — LLM streaming can be slow
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChatTestCase:
    """A single test case from the test suite."""

    # Question text sent to the chat endpoint as the "query" field.
    query: str
    # Creator to scope the query to; only sent in the request when truthy.
    creator: str | None = None
    # Only sent in the request when greater than 0; always passed to the scorer.
    personality_weight: float = 0.0
    # Free-form label shown in progress output and the summary table.
    category: str = "general"
    # Human-readable note about the case; copied into the results JSON.
    description: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChatEvalResult:
    """Result of evaluating a single test case."""

    # The test case this result belongs to.
    test_case: ChatTestCase
    # Response text accumulated from the endpoint's "token" events.
    response: str = ""
    # Payload of the endpoint's "sources" event (empty when none was received).
    sources: list[dict] = field(default_factory=list)
    # "cascade_tier" value taken from the endpoint's "done" event, if any.
    cascade_tier: str = ""
    # Judge result; stays None when the request failed or returned no text.
    score: ChatScoreResult | None = None
    # Set when calling the endpoint raised or the response text was empty.
    request_error: str | None = None
    # Time spent in the endpoint call, in seconds, rounded to 2 decimals.
    latency_seconds: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class ChatEvalRunner:
|
||||||
|
"""Runs a chat evaluation suite against a live endpoint."""
|
||||||
|
|
||||||
|
def __init__(
    self,
    scorer: ChatScoreRunner,
    base_url: str = _DEFAULT_BASE_URL,
    timeout: float = _REQUEST_TIMEOUT,
) -> None:
    """Configure the runner.

    ``scorer`` judges each response. ``base_url`` is the root of the chat
    API; trailing slashes are removed so the endpoint path can be appended
    directly. ``timeout`` is passed to the HTTP client.
    """
    normalized_url = base_url.rstrip("/")
    self.timeout = timeout
    self.base_url = normalized_url
    self.scorer = scorer
|
||||||
|
|
||||||
|
@staticmethod
def load_suite(path: str | Path) -> list[ChatTestCase]:
    """Load test cases from a YAML or JSON file.

    Files ending in ``.yaml``/``.yml`` (any letter case) are parsed with
    PyYAML's ``safe_load``; everything else is parsed as JSON.

    Expected format (YAML):
        queries:
          - query: "How do I sidechain a bass?"
            creator: null
            personality_weight: 0.0
            category: technical
            description: "Basic sidechain compression question"

    Only ``query`` is required. Optional keys that are absent or null fall
    back to the ``ChatTestCase`` defaults. An empty document, or one with
    no ``queries``, yields an empty list.

    Raises:
        ImportError: a YAML suite was requested but PyYAML is not installed.
        ValueError: the document root is not a mapping, ``queries`` is not
            a list, or an entry is not a mapping.
        KeyError: an entry has no ``query`` key.
    """
    filepath = Path(path)
    text = filepath.read_text(encoding="utf-8")

    if filepath.suffix.lower() in (".yaml", ".yml"):
        try:
            import yaml
        except ImportError as exc:
            raise ImportError(
                "PyYAML is required to load YAML test suites. "
                "Install with: pip install pyyaml"
            ) from exc
        data = yaml.safe_load(text)
    else:
        data = json.loads(text)

    # An empty YAML file (or a JSON ``null``) parses to None: no queries.
    if data is None:
        return []
    if not isinstance(data, dict):
        raise ValueError(
            f"Test suite root must be a mapping with a 'queries' key, "
            f"got {type(data).__name__}"
        )

    # ``queries:`` with nothing under it parses to None as well.
    queries = data.get("queries") or []
    if not isinstance(queries, list):
        raise ValueError(
            f"'queries' must be a list, got {type(queries).__name__}"
        )

    cases: list[ChatTestCase] = []
    for index, q in enumerate(queries, 1):
        if not isinstance(q, dict):
            raise ValueError(
                f"Query #{index} must be a mapping, got {type(q).__name__}"
            )
        cases.append(ChatTestCase(
            query=q["query"],
            creator=q.get("creator"),
            # ``or`` also covers explicit nulls, which .get() defaults do not.
            personality_weight=float(q.get("personality_weight") or 0.0),
            category=q.get("category") or "general",
            description=q.get("description") or "",
        ))
    return cases
|
||||||
|
|
||||||
|
def run_suite(self, cases: list[ChatTestCase]) -> list[ChatEvalResult]:
|
||||||
|
"""Execute all test cases sequentially, scoring each response."""
|
||||||
|
results: list[ChatEvalResult] = []
|
||||||
|
|
||||||
|
for i, case in enumerate(cases, 1):
|
||||||
|
print(f"\n [{i}/{len(cases)}] {case.category}: {case.query[:60]}...")
|
||||||
|
result = self._run_single(case)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
if result.request_error:
|
||||||
|
print(f" ✗ Request error: {result.request_error}")
|
||||||
|
elif result.score and result.score.error:
|
||||||
|
print(f" ✗ Scoring error: {result.score.error}")
|
||||||
|
elif result.score:
|
||||||
|
print(f" ✓ Composite: {result.score.composite:.3f} "
|
||||||
|
f"(latency: {result.latency_seconds:.1f}s)")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def _run_single(self, case: ChatTestCase) -> ChatEvalResult:
    """Run one test case end to end: query the endpoint, then score.

    Any exception raised while calling the endpoint is logged and recorded
    in ``request_error``, so this method returns a result instead of
    raising for request failures. An empty response text is recorded as a
    request error too and is not sent to the scorer.
    """
    outcome = ChatEvalResult(test_case=case)

    started = time.monotonic()
    try:
        text, cited, tier = self._call_chat_endpoint(case)
    except Exception as exc:
        outcome.latency_seconds = round(time.monotonic() - started, 2)
        outcome.request_error = str(exc)
        logger.error("chat_eval_request_error query=%r error=%s", case.query, exc)
        return outcome
    outcome.latency_seconds = round(time.monotonic() - started, 2)

    outcome.response, outcome.sources, outcome.cascade_tier = text, cited, tier

    if text:
        # Hand the response to the judge together with the case metadata.
        outcome.score = self.scorer.score_response(
            query=case.query,
            response=text,
            sources=cited,
            personality_weight=case.personality_weight,
            creator_name=case.creator,
        )
    else:
        outcome.request_error = "Empty response from chat endpoint"

    return outcome
|
||||||
|
|
||||||
|
def _call_chat_endpoint(
    self, case: ChatTestCase
) -> tuple[str, list[dict], str]:
    """Call the chat SSE endpoint and parse the event stream.

    POSTs the query (plus ``creator`` when set and ``personality_weight``
    when greater than 0) and consumes the streamed events:

    * ``sources`` - replaces the source list (non-list payloads become []).
    * ``token``   - appended to the response text.
    * ``done``    - supplies ``cascade_tier`` and ends reading.
    * ``error``   - raises ``RuntimeError`` with the reported message.

    Returns (accumulated_text, sources_list, cascade_tier).

    Raises:
        RuntimeError: the stream contained an ``error`` event.
        httpx.HTTPError: the request failed or returned an error status
            (raised by ``raise_for_status`` and the client itself).
    """
    url = f"{self.base_url}{_CHAT_ENDPOINT}"
    payload: dict[str, Any] = {"query": case.query}
    if case.creator:
        payload["creator"] = case.creator
    if case.personality_weight > 0:
        payload["personality_weight"] = case.personality_weight

    sources: list[dict] = []
    tokens: list[str] = []  # joined once at the end instead of += per token
    cascade_tier = ""
    finished = False

    with httpx.Client(timeout=self.timeout) as client:
        with client.stream("POST", url, json=payload) as resp:
            resp.raise_for_status()

            buffer = ""
            for chunk in resp.iter_text():
                # SSE allows CRLF line endings; normalise them so the
                # blank-line event separator is always "\n\n". Doing this
                # on the whole buffer also handles a CRLF pair that was
                # split across two chunks.
                buffer = (buffer + chunk).replace("\r\n", "\n")

                # Parse every complete event currently in the buffer
                while "\n\n" in buffer:
                    event_block, buffer = buffer.split("\n\n", 1)
                    event_type, event_data = self._parse_sse_event(event_block)

                    if event_type == "sources":
                        sources = event_data if isinstance(event_data, list) else []
                    elif event_type == "token":
                        # A bare token such as ``true`` or ``1e3`` is decoded
                        # as JSON by the parser; json.dumps renders it as
                        # JSON text again rather than as a Python repr.
                        tokens.append(
                            event_data if isinstance(event_data, str)
                            else json.dumps(event_data)
                        )
                    elif event_type == "done":
                        if isinstance(event_data, dict):
                            cascade_tier = event_data.get("cascade_tier", "")
                        # Stop here rather than waiting for the server to
                        # close the connection (which could hit the timeout).
                        finished = True
                        break
                    elif event_type == "error":
                        msg = event_data.get("message", str(event_data)) if isinstance(event_data, dict) else str(event_data)
                        raise RuntimeError(f"Chat endpoint returned error: {msg}")

                if finished:
                    break

    return "".join(tokens), sources, cascade_tier
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_sse_event(block: str) -> tuple[str, Any]:
|
||||||
|
"""Parse a single SSE event block into (event_type, data)."""
|
||||||
|
event_type = ""
|
||||||
|
data_lines: list[str] = []
|
||||||
|
|
||||||
|
for line in block.strip().splitlines():
|
||||||
|
if line.startswith("event: "):
|
||||||
|
event_type = line[7:].strip()
|
||||||
|
elif line.startswith("data: "):
|
||||||
|
data_lines.append(line[6:])
|
||||||
|
elif line.startswith("data:"):
|
||||||
|
data_lines.append(line[5:])
|
||||||
|
|
||||||
|
raw_data = "\n".join(data_lines)
|
||||||
|
try:
|
||||||
|
parsed = json.loads(raw_data)
|
||||||
|
except (json.JSONDecodeError, ValueError):
|
||||||
|
parsed = raw_data # plain text token
|
||||||
|
|
||||||
|
return event_type, parsed
|
||||||
|
|
||||||
|
@staticmethod
def write_results(
    results: list[ChatEvalResult],
    output_path: str | Path,
) -> str:
    """Write evaluation results to a JSON file. Returns the path.

    ``output_path`` is treated as a directory when it is an existing
    directory or is spelled with a trailing path separator; the file is
    then named ``chat_eval_<UTC timestamp>.json`` inside it. Otherwise it
    is used as the file path itself. Missing directories are created.

    The payload holds one entry per result plus summary statistics. Only
    results that were scored without a scoring error count as "scored" and
    contribute to the averages; everything else is counted under "errors".

    Raises:
        OSError: a directory could not be created or the file not written.
    """
    out = Path(output_path)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Path() drops a trailing separator, so look at the raw argument too.
    # Without this, a not-yet-existing "results/" would be written as a
    # plain file called "results".
    wants_directory = out.is_dir() or str(output_path).endswith(("/", "\\"))
    if wants_directory:
        out.mkdir(parents=True, exist_ok=True)
        filepath = out / f"chat_eval_{timestamp}.json"
    else:
        out.parent.mkdir(parents=True, exist_ok=True)
        filepath = out

    # Build serializable payload
    entries: list[dict] = []
    for r in results:
        entry: dict[str, Any] = {
            "query": r.test_case.query,
            "creator": r.test_case.creator,
            "personality_weight": r.test_case.personality_weight,
            "category": r.test_case.category,
            "description": r.test_case.description,
            "response_length": len(r.response),
            "source_count": len(r.sources),
            "cascade_tier": r.cascade_tier,
            "latency_seconds": r.latency_seconds,
        }

        if r.request_error:
            entry["error"] = r.request_error
        elif r.score:
            entry["scores"] = r.score.scores
            entry["composite"] = r.score.composite
            entry["justifications"] = r.score.justifications
            entry["scoring_time"] = r.score.elapsed_seconds
            if r.score.error:
                entry["scoring_error"] = r.score.error

        entries.append(entry)

    # Summary stats. A failed judge call still carries a placeholder
    # composite, so it must be excluded explicitly or it would be counted
    # as scored and drag the averages down.
    scored = [
        e for e in entries
        if "composite" in e and "scoring_error" not in e
    ]
    avg_composite = (
        sum(e["composite"] for e in scored) / len(scored) if scored else 0.0
    )
    dim_avgs: dict[str, float] = {}
    for dim in CHAT_DIMENSIONS:
        vals = [e["scores"][dim] for e in scored if dim in e.get("scores", {})]
        dim_avgs[dim] = round(sum(vals) / len(vals), 3) if vals else 0.0

    payload = {
        "timestamp": timestamp,
        "total_queries": len(results),
        "scored_queries": len(scored),
        "errors": len(results) - len(scored),
        "average_composite": round(avg_composite, 3),
        "dimension_averages": dim_avgs,
        "results": entries,
    }

    filepath.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return str(filepath)
|
||||||
|
|
||||||
|
@staticmethod
def print_summary(results: list[ChatEvalResult]) -> None:
    """Print the evaluation summary to stdout.

    Shows one table row per successfully scored result (composite plus the
    five dimension scores), an AVERAGE row, and a list of failed queries
    with their request or scoring error. When nothing was scored, only the
    failures are listed.
    """
    rule = "=" * 72
    print("\n" + rule)
    print(" CHAT EVALUATION SUMMARY")
    print(rule)

    ok_rows = [
        r for r in results
        if r.score and not r.score.error and not r.request_error
    ]
    failed = [
        r for r in results
        if r.request_error or (r.score and r.score.error)
    ]

    def _list_failures(heading: str) -> None:
        # Shared by both layouts; only the heading differs between them.
        print(heading)
        for item in failed:
            reason = item.request_error or (item.score.error if item.score else "unknown")
            print(f" - {item.test_case.query[:50]}: {reason}")

    if not ok_rows:
        print("\n No successfully scored responses.\n")
        if failed:
            _list_failures(f" Errors: {len(failed)}")
        print(rule + "\n")
        return

    # Header
    print(f"\n {'Category':<12s} {'Query':<30s} {'Comp':>5s} {'Cite':>5s} {'Struct':>6s} {'Domain':>6s} {'Ground':>6s} {'Person':>6s}")
    print(f" {'─'*12} {'─'*30} {'─'*5} {'─'*5} {'─'*6} {'─'*6} {'─'*6} {'─'*6}")

    for row in ok_rows:
        sc = row.score
        label = row.test_case.category[:12]
        snippet = row.test_case.query[:30]
        print(
            f" {label:<12s} {snippet:<30s} {sc.composite:5.2f} "
            f"{sc.citation_accuracy:5.2f} {sc.response_structure:6.2f} "
            f"{sc.domain_expertise:6.2f} {sc.source_grounding:6.2f} "
            f"{sc.personality_fidelity:6.2f}"
        )

    # Averages
    count = len(ok_rows)
    mean_composite = sum(row.score.composite for row in ok_rows) / count
    mean_by_dim = {
        dim: sum(row.score.scores.get(dim, 0.0) for row in ok_rows) / count
        for dim in CHAT_DIMENSIONS
    }

    print(
        f"\n {'AVERAGE':<12s} {'':30s} {mean_composite:5.2f} "
        f"{mean_by_dim['citation_accuracy']:5.2f} "
        f"{mean_by_dim['response_structure']:6.2f} "
        f"{mean_by_dim['domain_expertise']:6.2f} "
        f"{mean_by_dim['source_grounding']:6.2f} "
        f"{mean_by_dim['personality_fidelity']:6.2f}"
    )

    if failed:
        _list_failures(f"\n Errors: {len(failed)}")

    print(rule + "\n")
|
||||||
271
backend/pipeline/quality/chat_scorer.py
Normal file
271
backend/pipeline/quality/chat_scorer.py
Normal file
|
|
@ -0,0 +1,271 @@
|
||||||
|
"""Chat-specific quality scorer — LLM-as-judge evaluation for chat responses.
|
||||||
|
|
||||||
|
Scores chat responses across 5 dimensions:
|
||||||
|
- citation_accuracy: Are citations real and correctly numbered?
|
||||||
|
- response_structure: Concise, well-organized, uses appropriate formatting?
|
||||||
|
- domain_expertise: Music production terminology used naturally?
|
||||||
|
- source_grounding: Claims backed by provided sources, no fabrication?
|
||||||
|
- personality_fidelity: At weight>0, response reflects creator voice?
|
||||||
|
|
||||||
|
Run via: python -m pipeline.quality chat_eval --suite <path>
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
import openai
|
||||||
|
|
||||||
|
from pipeline.llm_client import LLMClient
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
CHAT_DIMENSIONS = [
|
||||||
|
"citation_accuracy",
|
||||||
|
"response_structure",
|
||||||
|
"domain_expertise",
|
||||||
|
"source_grounding",
|
||||||
|
"personality_fidelity",
|
||||||
|
]
|
||||||
|
|
||||||
|
CHAT_RUBRIC = """\
|
||||||
|
You are an expert evaluator of AI chat response quality for a music production knowledge base.
|
||||||
|
|
||||||
|
You will be given:
|
||||||
|
1. The user's query
|
||||||
|
2. The assistant's response
|
||||||
|
3. The numbered source citations that were provided to the assistant
|
||||||
|
4. The personality_weight (0.0 = encyclopedic, >0 = creator voice expected)
|
||||||
|
5. The creator_name (if any)
|
||||||
|
|
||||||
|
Evaluate the response across these 5 dimensions, scoring each 0.0 to 1.0:
|
||||||
|
|
||||||
|
**citation_accuracy** — Citations are real, correctly numbered, and point to relevant sources
|
||||||
|
- 0.9-1.0: Every [N] citation references a real source number, citations are placed next to the claim they support, no phantom citations
|
||||||
|
- 0.5-0.7: Most citations are valid but some are misplaced or reference non-existent source numbers
|
||||||
|
- 0.0-0.3: Many phantom citations, wrong numbers, or citations placed randomly without connection to claims
|
||||||
|
|
||||||
|
**response_structure** — Response is concise, well-organized, uses appropriate formatting
|
||||||
|
- 0.9-1.0: Clear paragraphs, uses bullet lists for steps/lists, bold for key terms, appropriate length (not padded)
|
||||||
|
- 0.5-0.7: Readable but could be better organized — wall of text, missing formatting where it would help
|
||||||
|
- 0.0-0.3: Disorganized, excessively long or too terse, no formatting, hard to scan
|
||||||
|
|
||||||
|
**domain_expertise** — Music production terminology used naturally and correctly
|
||||||
|
- 0.9-1.0: Uses correct audio/synth/mixing terminology, explains technical terms when appropriate, sounds like a knowledgeable producer
|
||||||
|
- 0.5-0.7: Generally correct but some terminology is vague ("adjust the sound" vs "shape the transient") or misused
|
||||||
|
- 0.0-0.3: Generic language, avoids domain terminology, or uses terms incorrectly
|
||||||
|
|
||||||
|
**source_grounding** — Claims are backed by provided sources, no fabrication
|
||||||
|
- 0.9-1.0: Every factual claim traces to a provided source, no invented details (plugin names, settings, frequencies not in sources)
|
||||||
|
- 0.5-0.7: Mostly grounded but 1-2 claims seem embellished or not directly from sources
|
||||||
|
- 0.0-0.3: Contains hallucinated specifics — settings, plugin names, or techniques not present in any source
|
||||||
|
|
||||||
|
**personality_fidelity** — When personality_weight > 0, response reflects the creator's voice proportional to the weight
|
||||||
|
- If personality_weight == 0: Score based on neutral encyclopedic tone (should NOT show personality). Neutral, informative = 1.0. Forced personality = 0.5.
|
||||||
|
- If personality_weight > 0 and personality_weight < 0.5: Subtle personality hints expected. Score higher if tone is lightly flavored but still mainly encyclopedic.
|
||||||
|
- If personality_weight >= 0.5: Clear creator voice expected. Score higher for signature phrases, teaching style, energy matching the named creator.
|
||||||
|
- If no creator_name is provided: Score 1.0 if response is neutral/encyclopedic, lower if it adopts an unexplained persona.
|
||||||
|
|
||||||
|
Return ONLY a JSON object with this exact structure:
|
||||||
|
{
|
||||||
|
"citation_accuracy": <float 0.0-1.0>,
|
||||||
|
"response_structure": <float 0.0-1.0>,
|
||||||
|
"domain_expertise": <float 0.0-1.0>,
|
||||||
|
"source_grounding": <float 0.0-1.0>,
|
||||||
|
"personality_fidelity": <float 0.0-1.0>,
|
||||||
|
"justifications": {
|
||||||
|
"citation_accuracy": "<1-2 sentence justification>",
|
||||||
|
"response_structure": "<1-2 sentence justification>",
|
||||||
|
"domain_expertise": "<1-2 sentence justification>",
|
||||||
|
"source_grounding": "<1-2 sentence justification>",
|
||||||
|
"personality_fidelity": "<1-2 sentence justification>"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChatScoreResult:
    """Scores, justifications and timing for one judged chat response.

    ``error`` is None on success; when set, ``scores`` and
    ``justifications`` are whatever the producer filled in (empty by
    default). The per-dimension properties read from ``scores`` and fall
    back to 0.0 when a dimension is absent.
    """

    scores: dict[str, float] = field(default_factory=dict)
    composite: float = 0.0
    justifications: dict[str, str] = field(default_factory=dict)
    elapsed_seconds: float = 0.0
    error: str | None = None

    def _dimension(self, name: str) -> float:
        # Single lookup path for every convenience property below.
        return self.scores.get(name, 0.0)

    # Convenience properties
    @property
    def citation_accuracy(self) -> float:
        return self._dimension("citation_accuracy")

    @property
    def response_structure(self) -> float:
        return self._dimension("response_structure")

    @property
    def domain_expertise(self) -> float:
        return self._dimension("domain_expertise")

    @property
    def source_grounding(self) -> float:
        return self._dimension("source_grounding")

    @property
    def personality_fidelity(self) -> float:
        return self._dimension("personality_fidelity")
|
||||||
|
|
||||||
|
|
||||||
|
class ChatScoreRunner:
|
||||||
|
"""Scores chat responses using LLM-as-judge evaluation."""
|
||||||
|
|
||||||
|
def __init__(self, client: LLMClient) -> None:
    """Store the LLM client whose ``complete`` method is used for judging."""
    self.client = client
|
||||||
|
|
||||||
|
def score_response(
    self,
    query: str,
    response: str,
    sources: list[dict],
    personality_weight: float = 0.0,
    creator_name: str | None = None,
) -> ChatScoreResult:
    """Score a single chat response against the 5 chat quality dimensions.

    Failures of the judge call or of parsing its reply are reported through
    ``ChatScoreResult.error`` rather than raised, so one bad judgement does
    not abort a whole evaluation run.

    Parameters
    ----------
    query:
        The user's original query.
    response:
        The assistant's accumulated response text.
    sources:
        List of source citation dicts (as emitted by the SSE sources event).
    personality_weight:
        0.0 = encyclopedic mode, >0 = personality mode.
    creator_name:
        Creator name, if this was a creator-scoped query.

    Returns
    -------
    ChatScoreResult with per-dimension scores, or with ``error`` set when
    the judge could not be reached, the API call failed, or the reply was
    not a JSON object.
    """
    sources_block = json.dumps(sources, indent=2) if sources else "(no sources)"

    user_prompt = (
        f"## User Query\n\n{query}\n\n"
        f"## Assistant Response\n\n{response}\n\n"
        f"## Sources Provided\n\n```json\n{sources_block}\n```\n\n"
        f"## Metadata\n\n"
        f"- personality_weight: {personality_weight}\n"
        f"- creator_name: {creator_name or '(none)'}\n\n"
        f"Score this chat response across all 5 dimensions."
    )

    t0 = time.monotonic()
    try:
        from pydantic import BaseModel as _BM
        resp = self.client.complete(
            system_prompt=CHAT_RUBRIC,
            user_prompt=user_prompt,
            response_model=_BM,
            modality="chat",
        )
        elapsed = round(time.monotonic() - t0, 2)
    except (openai.APIConnectionError, openai.APITimeoutError) as exc:
        elapsed = round(time.monotonic() - t0, 2)
        return ChatScoreResult(
            elapsed_seconds=elapsed,
            error=f"Cannot reach LLM judge. Error: {exc}",
        )
    except openai.APIError as exc:
        # Any other API failure (rate limit, HTTP error status, ...) is
        # recorded for this response instead of aborting the whole run.
        elapsed = round(time.monotonic() - t0, 2)
        return ChatScoreResult(
            elapsed_seconds=elapsed,
            error=f"LLM judge request failed. Error: {exc}",
        )

    raw_text = str(resp).strip()

    # Judges often wrap the object in a Markdown fence despite the rubric's
    # "ONLY a JSON object" instruction; unwrap it before parsing.
    candidate = raw_text
    if candidate.startswith("```"):
        _, _, candidate = candidate.partition("\n")  # drop ``` / ```json line
        candidate = candidate.rsplit("```", 1)[0].strip()

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        logger.error("Malformed chat judge response (not JSON): %.300s", raw_text)
        return ChatScoreResult(
            elapsed_seconds=elapsed,
            error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
        )

    # Valid JSON that is not an object (list, number, ...) has no scores.
    if not isinstance(parsed, dict):
        logger.error("Malformed chat judge response (not an object): %.300s", raw_text)
        return ChatScoreResult(
            elapsed_seconds=elapsed,
            error=f"Malformed judge response. Raw excerpt: {raw_text[:200]}",
        )

    return self._parse_scores(parsed, elapsed)
|
||||||
|
|
||||||
|
def _parse_scores(self, parsed: dict, elapsed: float) -> ChatScoreResult:
    """Extract and validate scores from parsed JSON judge response.

    Every dimension in ``CHAT_DIMENSIONS`` gets an entry. A missing value,
    or one that is not a finite number (including booleans, NaN and
    infinities), is scored 0.0 with an explanatory justification; valid
    values are clamped to [0.0, 1.0]. The composite is the mean over all
    dimensions, rounded to 3 decimals.

    If ``parsed`` is not a dict, a result with ``error`` set is returned.
    """
    import math

    if not isinstance(parsed, dict):
        return ChatScoreResult(
            elapsed_seconds=elapsed,
            error="Malformed judge response: expected a JSON object",
        )

    scores: dict[str, float] = {}
    justifications: dict[str, str] = {}

    raw_justifications = parsed.get("justifications", {})
    if not isinstance(raw_justifications, dict):
        raw_justifications = {}

    for dim in CHAT_DIMENSIONS:
        raw = parsed.get(dim)
        if raw is None:
            logger.warning("Missing dimension '%s' in chat judge response", dim)
            scores[dim] = 0.0
            justifications[dim] = "(missing from judge response)"
            continue

        val: float | None = None
        # bool is an int subclass; ``true`` must not become a 1.0 score.
        if not isinstance(raw, bool):
            try:
                val = float(raw)
            except (TypeError, ValueError, OverflowError):
                val = None

        # NaN would slip through the clamp below as 1.0 because every
        # comparison with NaN is false, so reject non-finite values first.
        if val is None or not math.isfinite(val):
            logger.warning("Invalid value for '%s': %r", dim, raw)
            scores[dim] = 0.0
            justifications[dim] = f"(invalid value: {raw!r})"
            continue

        scores[dim] = max(0.0, min(1.0, val))
        justifications[dim] = str(raw_justifications.get(dim, ""))

    composite = sum(scores.values()) / len(CHAT_DIMENSIONS) if CHAT_DIMENSIONS else 0.0

    return ChatScoreResult(
        scores=scores,
        composite=round(composite, 3),
        justifications=justifications,
        elapsed_seconds=elapsed,
    )
|
||||||
|
|
||||||
|
def print_report(self, result: ChatScoreResult, query: str = "") -> None:
    """Print a scoring report for one result to stdout.

    Shows the (truncated) query when given, then either the error or, per
    dimension, the score with a 20-cell bar and the justification wrapped
    at roughly 56 characters, followed by the composite and elapsed time.
    """
    heavy_rule = "=" * 60
    print("\n" + heavy_rule)
    print(" CHAT QUALITY SCORE REPORT")
    if query:
        ellipsis = "..." if len(query) > 60 else ""
        print(f" Query: {query[:60]}{ellipsis}")
    print(heavy_rule)

    if result.error:
        print(f"\n ✗ Error: {result.error}\n")
        print(heavy_rule + "\n")
        return

    for dim in CHAT_DIMENSIONS:
        score = result.scores.get(dim, 0.0)
        cells = int(score * 20)
        bar = "█" * cells + "░" * (20 - cells)
        print(f"\n {dim.replace('_', ' ').title()}")
        print(f" Score: {score:.2f} {bar}")

        # Greedy word wrap: emit a line as soon as the next word would
        # push it past 56 characters.
        pending = ""
        for word in result.justifications.get(dim, "").split():
            if not pending:
                pending = word
            elif len(pending) + len(word) + 1 > 56:
                print(f" {pending}")
                pending = word
            else:
                pending = f"{pending} {word}"
        if pending:
            print(f" {pending}")

    print("\n" + "-" * 60)
    print(f" Composite: {result.composite:.3f}")
    print(f" Time: {result.elapsed_seconds}s")
    print(heavy_rule + "\n")
|
||||||
72
backend/pipeline/quality/fixtures/chat_test_suite.yaml
Normal file
72
backend/pipeline/quality/fixtures/chat_test_suite.yaml
Normal file
|
|
@ -0,0 +1,72 @@
|
||||||
|
# Chat quality evaluation test suite
|
||||||
|
# 10 representative queries across 4 groups (5 distinct `category` values):
# - technical: How-to questions about specific production techniques
# - conceptual: Broader understanding questions about audio concepts
# - creator_encyclopedic / creator_personality: Creator-scoped queries at personality_weight 0.0 and 0.7
# - cross_creator: Queries spanning multiple creators' approaches
|
||||||
|
|
||||||
|
queries:
|
||||||
|
# ── Technical how-to (2) ────────────────────────────────────────────
|
||||||
|
- query: "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?"
|
||||||
|
creator: null
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: technical
|
||||||
|
description: "Common sidechain compression setup — expects specific settings (ratio, attack, release)"
|
||||||
|
|
||||||
|
- query: "What are the best EQ settings for cleaning up a muddy vocal recording?"
|
||||||
|
creator: null
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: technical
|
||||||
|
description: "Vocal EQ technique — expects frequency ranges, Q values, cut/boost guidance"
|
||||||
|
|
||||||
|
# ── Conceptual (2) ─────────────────────────────────────────────────
|
||||||
|
- query: "What is the difference between parallel compression and serial compression, and when should I use each?"
|
||||||
|
creator: null
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: conceptual
|
||||||
|
description: "Conceptual comparison — expects clear definitions, use cases, pros/cons"
|
||||||
|
|
||||||
|
- query: "How does sample rate affect sound quality in music production?"
|
||||||
|
creator: null
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: conceptual
|
||||||
|
description: "Audio fundamentals — expects Nyquist, aliasing, practical guidance"
|
||||||
|
|
||||||
|
# ── Creator-specific: encyclopedic (2) ──────────────────────────────
|
||||||
|
- query: "How does this creator approach sound design for bass sounds?"
|
||||||
|
creator: "KEOTA"
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: creator_encyclopedic
|
||||||
|
description: "Creator-scoped query at weight=0 — should be neutral/encyclopedic about KEOTA's techniques"
|
||||||
|
|
||||||
|
- query: "What mixing techniques does this creator recommend for achieving width in a mix?"
|
||||||
|
creator: "Mr. Bill"
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: creator_encyclopedic
|
||||||
|
description: "Creator-scoped query at weight=0 — neutral tone about Mr. Bill's approach"
|
||||||
|
|
||||||
|
# ── Creator-specific: personality (2) ───────────────────────────────
|
||||||
|
- query: "How does this creator approach sound design for bass sounds?"
|
||||||
|
creator: "KEOTA"
|
||||||
|
personality_weight: 0.7
|
||||||
|
category: creator_personality
|
||||||
|
description: "Same query as above but at weight=0.7 — should reflect KEOTA's voice and teaching style"
|
||||||
|
|
||||||
|
- query: "What mixing techniques does this creator recommend for achieving width in a mix?"
|
||||||
|
creator: "Mr. Bill"
|
||||||
|
personality_weight: 0.7
|
||||||
|
category: creator_personality
|
||||||
|
description: "Same query as above but at weight=0.7 — should reflect Mr. Bill's voice"
|
||||||
|
|
||||||
|
# ── Cross-creator (2) ──────────────────────────────────────────────
|
||||||
|
- query: "What are the different approaches to layering synth sounds across creators?"
|
||||||
|
creator: null
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: cross_creator
|
||||||
|
description: "Cross-creator comparison — should cite multiple creators' techniques"
|
||||||
|
|
||||||
|
- query: "How do different producers approach drum processing and what plugins do they prefer?"
|
||||||
|
creator: null
|
||||||
|
personality_weight: 0.0
|
||||||
|
category: cross_creator
|
||||||
|
description: "Cross-creator comparison on drums — expects multiple perspectives with citations"
|
||||||
Loading…
Add table
Reference in a new issue