#!/usr/bin/env python3
"""A/B comparison of Chrysopedia's Qdrant search vs LightRAG retrieval.
Runs a set of queries against both backends and produces a scored comparison
report. Designed to run inside the chrysopedia-api container (has network
access to both services) or via tunneled URLs.
Usage:
# Dry run — show query set without executing
python3 /app/scripts/compare_search.py --dry-run
# Run first 5 queries
python3 /app/scripts/compare_search.py --limit 5
# Full comparison
python3 /app/scripts/compare_search.py
# Custom URLs
python3 /app/scripts/compare_search.py --api-url http://localhost:8000 --lightrag-url http://localhost:9621
"""
from __future__ import annotations

import argparse
import json
import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import httpx

logger = logging.getLogger("compare_search")

# ── Query set ────────────────────────────────────────────────────────────────

# Real user queries (from search_log)
USER_QUERIES = [
    "squelch",
    "keota snare",
    "reverb",
    "how does keota snare",
    "bass",
    "groove",
    "drums",
    "fx",
    "textures",
    "daw setup",
    "synthesis",
    "how does keota",
    "over-leveling snare to control compression behavior",
]

# Curated domain queries, chosen to exercise different retrieval patterns
CURATED_QUERIES = [
    # Broad topic queries
    "bass design techniques",
    "reverb chains and spatial effects",
    "how to layer drums",
    # Cross-entity synthesis (LightRAG strength)
    "what plugins are commonly used for bass sounds",
    "compare different approaches to snare layering",
    "how do different producers approach sound design",
    # Exact lookup (Qdrant strength)
    "COPYCATT",
    "Emperor arrangement",
    # How-to / procedural
    "how to create tension in a buildup",
    "step by step resampling workflow",
    # Concept queries
    "frequency spectrum balance",
    "signal chain for drums",
]
ALL_QUERIES = USER_QUERIES + CURATED_QUERIES

# ── Data structures ──────────────────────────────────────────────────────────


@dataclass
class SearchResult:
    title: str
    score: float
    snippet: str
    result_type: str = ""
    creator: str = ""
    slug: str = ""


@dataclass
class QdrantSearchResponse:
    query: str
    results: list[SearchResult] = field(default_factory=list)
    partial_matches: list[SearchResult] = field(default_factory=list)
    total: int = 0
    latency_ms: float = 0.0
    error: str = ""


@dataclass
class LightRAGResponse:
    query: str
    response_text: str = ""
    references: list[dict[str, Any]] = field(default_factory=list)
    latency_ms: float = 0.0
    error: str = ""


@dataclass
class QueryComparison:
    query: str
    query_type: str  # "user" or "curated"
    qdrant: QdrantSearchResponse | None = None
    lightrag: LightRAGResponse | None = None
    # Scores (populated by the scoring phase)
    qdrant_relevance: float = 0.0
    qdrant_coverage: int = 0
    qdrant_diversity: int = 0
    lightrag_relevance: float = 0.0
    lightrag_coverage: int = 0
    lightrag_answer_quality: float = 0.0
    winner: str = ""  # "qdrant", "lightrag", or "tie"

# ── Qdrant search client ────────────────────────────────────────────────────


def query_qdrant_search(api_url: str, query: str, limit: int = 20) -> QdrantSearchResponse:
    """Query the Chrysopedia search API (Qdrant + keyword)."""
    url = f"{api_url}/api/v1/search"
    params = {"q": query, "scope": "all", "limit": limit}
    start = time.monotonic()
    try:
        resp = httpx.get(url, params=params, timeout=15)
        latency = (time.monotonic() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as e:  # ValueError covers malformed JSON
        latency = (time.monotonic() - start) * 1000
        return QdrantSearchResponse(query=query, latency_ms=latency, error=str(e))

    def _to_result(item: dict[str, Any]) -> SearchResult:
        return SearchResult(
            title=item.get("title", ""),
            score=item.get("score", 0.0),
            snippet=item.get("summary", "")[:200],
            result_type=item.get("type", ""),
            creator=item.get("creator_name", ""),
            slug=item.get("slug", ""),
        )

    return QdrantSearchResponse(
        query=query,
        results=[_to_result(item) for item in data.get("items", [])],
        partial_matches=[_to_result(item) for item in data.get("partial_matches", [])],
        total=data.get("total", 0),
        latency_ms=latency,
    )
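
# Illustrative /api/v1/search payload shape, inferred from the parsing above
# (field values are invented; only the keys are assumed):
#   {
#     "total": 2,
#     "items": [{"title": "Snare Layering", "score": 0.83, "summary": "...",
#                "type": "technique", "creator_name": "Keota", "slug": "snare-layering"}],
#     "partial_matches": []
#   }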

# ── LightRAG client ─────────────────────────────────────────────────────────


def query_lightrag(lightrag_url: str, query: str, mode: str = "hybrid") -> LightRAGResponse:
    """Query the LightRAG API."""
    url = f"{lightrag_url}/query"
    payload = {"query": query, "mode": mode}
    start = time.monotonic()
    try:
        # LightRAG queries involve LLM inference; each can take 2-4 minutes.
        resp = httpx.post(url, json=payload, timeout=300)
        latency = (time.monotonic() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as e:  # ValueError covers malformed JSON
        latency = (time.monotonic() - start) * 1000
        return LightRAGResponse(query=query, latency_ms=latency, error=str(e))
    return LightRAGResponse(
        query=query,
        response_text=data.get("response", ""),
        references=[
            {"id": ref.get("reference_id", ""), "file_path": ref.get("file_path", "")}
            for ref in data.get("references", [])
        ],
        latency_ms=latency,
    )
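
# Illustrative LightRAG /query response shape, inferred from the parsing above
# (values invented for illustration; only the keys are assumed):
#   {
#     "response": "Producers typically layer snares by ...",
#     "references": [{"reference_id": "1", "file_path": "snare-layering.md"}]
#   }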

# ── Scoring ──────────────────────────────────────────────────────────────────


def _token_overlap(query: str, text: str) -> float:
    """Fraction of query tokens (longer than two characters) found in text, case-insensitive."""
    if not text:
        return 0.0
    query_tokens = {t.lower() for t in query.split() if len(t) > 2}
    if not query_tokens:
        return 0.0
    text_lower = text.lower()
    matched = sum(1 for t in query_tokens if t in text_lower)
    return matched / len(query_tokens)
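
# Worked example of the overlap heuristic (tokens of one or two characters are
# ignored):
#   _token_overlap("keota snare", "Keota's approach to snare layering")
#   -> both tokens {"keota", "snare"} appear in the lowercased text -> 1.0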

def score_qdrant_results(comp: QueryComparison) -> None:
    """Score Qdrant results on relevance, coverage, and diversity."""
    if not comp.qdrant or comp.qdrant.error:
        return
    results = comp.qdrant.results
    if not results:
        # Fall back to partial matches
        results = comp.qdrant.partial_matches
    if not results:
        comp.qdrant_relevance = 0.0
        comp.qdrant_coverage = 0
        comp.qdrant_diversity = 0
        return
    # Relevance: average token overlap across the top-5 results, scaled to 0-5
    overlaps = []
    for r in results[:5]:
        combined = f"{r.title} {r.snippet} {r.creator}"
        overlaps.append(_token_overlap(comp.query, combined))
    comp.qdrant_relevance = round((sum(overlaps) / len(overlaps)) * 5, 2) if overlaps else 0.0
    # Coverage: unique technique pages
    slugs = {r.slug for r in results if r.slug}
    comp.qdrant_coverage = len(slugs)
    # Diversity: unique creators
    creators = {r.creator for r in results if r.creator}
    comp.qdrant_diversity = len(creators)
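
# Example of the relevance scaling: with top-5 overlaps [1.0, 0.5, 0.5, 0.0, 0.0],
# the average is 0.4, so relevance is 0.4 * 5 = 2.0 on the 0-5 scale.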

def score_lightrag_results(comp: QueryComparison) -> None:
    """Score LightRAG results on relevance, coverage, and answer quality."""
    if not comp.lightrag or comp.lightrag.error:
        return
    text = comp.lightrag.response_text
    refs = comp.lightrag.references
    if not text:
        comp.lightrag_relevance = 0.0
        comp.lightrag_coverage = 0
        comp.lightrag_answer_quality = 0.0
        return
    # Relevance: token overlap between query and response, scaled to 0-5
    comp.lightrag_relevance = round(_token_overlap(comp.query, text) * 5, 2)
    # Coverage: unique technique pages referenced
    unique_sources = {r["file_path"] for r in refs if r.get("file_path")}
    comp.lightrag_coverage = len(unique_sources)
    # Answer quality (0-5 composite):
    quality = 0.0
    # Length: longer synthesized answers are generally better, up to a point
    word_count = len(text.split())
    if word_count > 20:
        quality += 1.0
    if word_count > 100:
        quality += 0.5
    if word_count > 200:
        quality += 0.5
    # References: more cross-page references indicate better synthesis
    if len(unique_sources) >= 2:
        quality += 1.0
    if len(unique_sources) >= 4:
        quality += 0.5
    # Structure: headings, bold text, bullet points, or numbered lists
    if "**" in text or "##" in text:
        quality += 0.5
    if "- " in text or "* " in text:
        quality += 0.5
    # Penalize non-answers ("no information available" and similar)
    negative_phrases = ["no information", "not mentioned", "no data", "cannot find"]
    if any(phrase in text.lower() for phrase in negative_phrases):
        quality -= 1.0
    else:
        quality += 0.5
    # Clamp to the documented 0-5 range
    comp.lightrag_answer_quality = round(min(max(quality, 0.0), 5.0), 2)
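
# Worked quality example under this rubric: a 150-word answer with bold
# headings and bullet points, 3 unique references, and no negative phrasing
# scores 1.0 + 0.5 (length) + 1.0 (refs) + 0.5 + 0.5 (structure)
# + 0.5 (no negatives) = 4.0.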

def determine_winner(comp: QueryComparison) -> None:
    """Determine which backend wins for this query."""
    # Composite score: relevance weight 0.4, coverage 0.3, quality/diversity 0.3
    qdrant_score = (
        comp.qdrant_relevance * 0.4
        + min(comp.qdrant_coverage, 5) * 0.3
        + min(comp.qdrant_diversity, 3) * 0.3
    )
    lightrag_score = (
        comp.lightrag_relevance * 0.4
        + min(comp.lightrag_coverage, 5) * 0.3
        + comp.lightrag_answer_quality * 0.3
    )
    if abs(qdrant_score - lightrag_score) < 0.5:
        comp.winner = "tie"
    elif qdrant_score > lightrag_score:
        comp.winner = "qdrant"
    else:
        comp.winner = "lightrag"

# ── Report generation ────────────────────────────────────────────────────────


def generate_markdown_report(comparisons: list[QueryComparison], output_dir: Path) -> Path:
    """Generate a human-readable markdown comparison report."""
    lines: list[str] = []
    lines.append("# Search A/B Comparison: Qdrant vs LightRAG")
    lines.append(f"\n_Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}_")
    lines.append(f"\n**Queries evaluated:** {len(comparisons)}")
    # Aggregate stats
    wins = {"qdrant": 0, "lightrag": 0, "tie": 0}
    qdrant_latencies = []
    lightrag_latencies = []
    for c in comparisons:
        wins[c.winner] += 1
        if c.qdrant and not c.qdrant.error:
            qdrant_latencies.append(c.qdrant.latency_ms)
        if c.lightrag and not c.lightrag.error:
            lightrag_latencies.append(c.lightrag.latency_ms)
    lines.append("\n## Aggregate Results\n")
    lines.append("| Metric | Qdrant Search | LightRAG |")
    lines.append("|--------|:-------------:|:--------:|")
    lines.append(f"| **Wins** | {wins['qdrant']} | {wins['lightrag']} |")
    lines.append(f"| **Ties** | {wins['tie']} | {wins['tie']} |")
    avg_q_str = f"{sum(qdrant_latencies) / len(qdrant_latencies):.0f}ms" if qdrant_latencies else "N/A"
    avg_l_str = f"{sum(lightrag_latencies) / len(lightrag_latencies):.0f}ms" if lightrag_latencies else "N/A"
    lines.append(f"| **Avg latency** | {avg_q_str} | {avg_l_str} |")
    avg_qr = sum(c.qdrant_relevance for c in comparisons) / len(comparisons) if comparisons else 0
    avg_lr = sum(c.lightrag_relevance for c in comparisons) / len(comparisons) if comparisons else 0
    lines.append(f"| **Avg relevance** | {avg_qr:.2f}/5 | {avg_lr:.2f}/5 |")
    avg_qc = sum(c.qdrant_coverage for c in comparisons) / len(comparisons) if comparisons else 0
    avg_lc = sum(c.lightrag_coverage for c in comparisons) / len(comparisons) if comparisons else 0
    lines.append(f"| **Avg coverage** | {avg_qc:.1f} pages | {avg_lc:.1f} refs |")
    # Per-query detail
    lines.append("\n## Per-Query Comparison\n")
    lines.append("| # | Query | Type | Qdrant Rel | LR Rel | Qdrant Cov | LR Cov | LR Quality | Winner |")
    lines.append("|---|-------|------|:----------:|:------:|:----------:|:------:|:----------:|:------:|")
    for i, c in enumerate(comparisons, 1):
        q_display = c.query[:45] + "…" if len(c.query) > 45 else c.query
        winner_emoji = {"qdrant": "🔵", "lightrag": "🟢", "tie": "⚪"}[c.winner]
        lines.append(
            f"| {i} | {q_display} | {c.query_type} | {c.qdrant_relevance:.1f} | "
            f"{c.lightrag_relevance:.1f} | {c.qdrant_coverage} | {c.lightrag_coverage} | "
            f"{c.lightrag_answer_quality:.1f} | {winner_emoji} {c.winner} |"
        )
    # Detailed results for interesting queries
    lines.append("\n## Notable Comparisons\n")
    # Show the first five queries with a clear (non-tie) winner
    notable = [c for c in comparisons if c.winner != "tie"][:5]
    for c in notable:
        lines.append(f'### Query: "{c.query}"\n')
        lines.append(f"**Winner: {c.winner}**\n")
        if c.qdrant and c.qdrant.results:
            lines.append("**Qdrant results:**")
            for r in c.qdrant.results[:3]:
                lines.append(f"- {r.title} (by {r.creator}, score: {r.score:.2f})")
            lines.append("")
        if c.lightrag and c.lightrag.response_text:
            # Show the first 300 characters of the LightRAG response
            preview = c.lightrag.response_text[:300]
            if len(c.lightrag.response_text) > 300:
                preview += "…"
            lines.append("**LightRAG response preview:**")
            lines.append(f"> {preview}\n")
            if c.lightrag.references:
                ref_slugs = [r["file_path"] for r in c.lightrag.references[:5]]
                lines.append(f"References: {', '.join(ref_slugs)}\n")
    # Data coverage note
    lines.append("\n## Data Coverage Note\n")
    lines.append(
        "LightRAG currently has 18 of 93 technique pages indexed. "
        "Results may improve significantly after a full reindex. "
        "Qdrant has all 93 pages embedded."
    )
    report_path = output_dir / "comparison_report.md"
    report_path.write_text("\n".join(lines), encoding="utf-8")
    return report_path

def generate_json_report(comparisons: list[QueryComparison], output_dir: Path) -> Path:
    """Write the full structured comparison data to JSON."""

    def _serialize(obj):
        if hasattr(obj, "__dict__"):
            return {k: _serialize(v) for k, v in obj.__dict__.items()}
        if isinstance(obj, list):
            return [_serialize(i) for i in obj]
        if isinstance(obj, dict):
            return {k: _serialize(v) for k, v in obj.items()}
        return obj

    data = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "query_count": len(comparisons),
        "comparisons": [_serialize(c) for c in comparisons],
    }
    report_path = output_dir / "comparison_report.json"
    report_path.write_text(json.dumps(data, indent=2, default=str), encoding="utf-8")
    return report_path

# ── Main ─────────────────────────────────────────────────────────────────────


def main():
    parser = argparse.ArgumentParser(description="A/B compare Qdrant search vs LightRAG")
    parser.add_argument(
        "--api-url",
        default=os.environ.get("API_URL", "http://127.0.0.1:8000"),
        help="Chrysopedia API base URL (default: http://127.0.0.1:8000)",
    )
    parser.add_argument(
        "--lightrag-url",
        default=os.environ.get("LIGHTRAG_URL", "http://chrysopedia-lightrag:9621"),
        help="LightRAG API base URL (default: http://chrysopedia-lightrag:9621)",
    )
    parser.add_argument(
        "--output-dir",
        default=os.environ.get("OUTPUT_DIR", "/app/scripts/output"),
        help="Output directory for reports",
    )
    parser.add_argument("--limit", type=int, default=None, help="Process only the first N queries")
    parser.add_argument("--dry-run", action="store_true", help="Show the query set without executing")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    queries = ALL_QUERIES[:args.limit] if args.limit else ALL_QUERIES
    if args.dry_run:
        print(f"Query set ({len(queries)} queries):")
        for i, q in enumerate(queries, 1):
            qtype = "user" if q in USER_QUERIES else "curated"
            print(f"  {i:2d}. [{qtype:>7s}] {q}")
        return

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    comparisons: list[QueryComparison] = []
    for i, query in enumerate(queries, 1):
        qtype = "user" if query in USER_QUERIES else "curated"
        logger.info("[%d/%d] Query: %r (%s)", i, len(queries), query, qtype)
        # Query both backends
        qdrant_resp = query_qdrant_search(args.api_url, query)
        lightrag_resp = query_lightrag(args.lightrag_url, query)
        if qdrant_resp.error:
            logger.warning("  Qdrant error: %s", qdrant_resp.error)
        else:
            logger.info("  Qdrant: %d results in %.0fms", qdrant_resp.total, qdrant_resp.latency_ms)
        if lightrag_resp.error:
            logger.warning("  LightRAG error: %s", lightrag_resp.error)
        else:
            ref_count = len(lightrag_resp.references)
            word_count = len(lightrag_resp.response_text.split())
            logger.info("  LightRAG: %d words, %d refs in %.0fms", word_count, ref_count, lightrag_resp.latency_ms)
        comp = QueryComparison(query=query, query_type=qtype, qdrant=qdrant_resp, lightrag=lightrag_resp)
        # Score both sides and pick a winner
        score_qdrant_results(comp)
        score_lightrag_results(comp)
        determine_winner(comp)
        logger.info(
            "  Scores → Qdrant: rel=%.1f cov=%d div=%d | LightRAG: rel=%.1f cov=%d qual=%.1f | Winner: %s",
            comp.qdrant_relevance, comp.qdrant_coverage, comp.qdrant_diversity,
            comp.lightrag_relevance, comp.lightrag_coverage, comp.lightrag_answer_quality,
            comp.winner,
        )
        comparisons.append(comp)

    # Generate reports
    logger.info("Generating reports...")
    md_path = generate_markdown_report(comparisons, output_dir)
    json_path = generate_json_report(comparisons, output_dir)

    # Summary
    wins = {"qdrant": 0, "lightrag": 0, "tie": 0}
    for c in comparisons:
        wins[c.winner] += 1
    print(f"\n{'=' * 60}")
    print(f"Comparison complete: {len(comparisons)} queries")
    print(f"  Qdrant wins:   {wins['qdrant']}")
    print(f"  LightRAG wins: {wins['lightrag']}")
    print(f"  Ties:          {wins['tie']}")
    print("\nReports:")
    print(f"  {md_path}")
    print(f"  {json_path}")
    print(f"{'=' * 60}")

if __name__ == "__main__":
    main()