547 lines
20 KiB
Python
547 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""A/B comparison of Chrysopedia's Qdrant search vs LightRAG retrieval.
|
|
|
|
Runs a set of queries against both backends and produces a scored comparison
|
|
report. Designed to run inside the chrysopedia-api container (has network
|
|
access to both services) or via tunneled URLs.
|
|
|
|
Usage:
|
|
# Dry run — show query set without executing
|
|
python3 /app/scripts/compare_search.py --dry-run
|
|
|
|
# Run first 5 queries
|
|
python3 /app/scripts/compare_search.py --limit 5
|
|
|
|
# Full comparison
|
|
python3 /app/scripts/compare_search.py
|
|
|
|
# Custom URLs
|
|
python3 /app/scripts/compare_search.py --api-url http://localhost:8000 --lightrag-url http://localhost:9621
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger("compare_search")

# ── Query set ────────────────────────────────────────────────────────────────

# Real user queries (from search_log)
# NOTE: membership in USER_QUERIES is also used later (in main) to tag each
# query as "user" vs "curated", so keep the two lists disjoint.
USER_QUERIES = [
    "squelch",
    "keota snare",
    "reverb",
    "how does keota snare",
    "bass",
    "groove",
    "drums",
    "fx",
    "textures",
    "daw setup",
    "synthesis",
    "how does keota",
    "over-leveling snare to control compression behavior",
]

# Curated domain queries — test different retrieval patterns
CURATED_QUERIES = [
    # Broad topic queries
    "bass design techniques",
    "reverb chains and spatial effects",
    "how to layer drums",
    # Cross-entity synthesis (LightRAG strength)
    "what plugins are commonly used for bass sounds",
    "compare different approaches to snare layering",
    "how do different producers approach sound design",
    # Exact lookup (Qdrant strength)
    "COPYCATT",
    "Emperor arrangement",
    # How-to / procedural
    "how to create tension in a buildup",
    "step by step resampling workflow",
    # Concept queries
    "frequency spectrum balance",
    "signal chain for drums",
]

# Full evaluation set: user queries first, then curated ones.
ALL_QUERIES = USER_QUERIES + CURATED_QUERIES
|
|
|
|
|
|
# ── Data structures ──────────────────────────────────────────────────────────
|
|
|
|
@dataclass
class SearchResult:
    """One normalized hit from the Chrysopedia (Qdrant + keyword) search API."""

    title: str
    score: float
    snippet: str  # first 200 chars of the item's "summary" field
    result_type: str = ""  # API "type" field
    creator: str = ""  # API "creator_name" field
    slug: str = ""  # page identifier; used for coverage scoring
|
|
|
|
@dataclass
class QdrantSearchResponse:
    """Outcome of one Qdrant search call, including timing and error info."""

    query: str
    results: list[SearchResult] = field(default_factory=list)
    partial_matches: list[SearchResult] = field(default_factory=list)  # fallback hits
    total: int = 0  # server-reported total match count
    latency_ms: float = 0.0  # round-trip time (recorded even on failure)
    error: str = ""  # non-empty when the HTTP request failed
|
|
|
|
@dataclass
class LightRAGResponse:
    """Outcome of one LightRAG query, including timing and error info."""

    query: str
    response_text: str = ""  # synthesized LLM answer
    references: list[dict[str, Any]] = field(default_factory=list)  # {"id", "file_path"} dicts
    latency_ms: float = 0.0  # round-trip time (recorded even on failure)
    error: str = ""  # non-empty when the HTTP request failed
|
|
|
|
@dataclass
class QueryComparison:
    """Side-by-side result of running one query against both backends."""

    query: str
    query_type: str  # "user" or "curated"
    qdrant: QdrantSearchResponse | None = None
    lightrag: LightRAGResponse | None = None
    # Scores (populated by scoring phase)
    qdrant_relevance: float = 0.0  # 0-5, token-overlap based
    qdrant_coverage: int = 0  # unique result slugs
    qdrant_diversity: int = 0  # unique creators
    lightrag_relevance: float = 0.0  # 0-5, token-overlap based
    lightrag_coverage: int = 0  # unique referenced file paths
    lightrag_answer_quality: float = 0.0  # 0-5 composite heuristic
    winner: str = ""  # "qdrant", "lightrag", "tie"
|
|
|
|
|
|
# ── Qdrant search client ────────────────────────────────────────────────────
|
|
|
|
def query_qdrant_search(api_url: str, query: str, limit: int = 20) -> QdrantSearchResponse:
    """Query the Chrysopedia search API (Qdrant + keyword).

    Args:
        api_url: Base URL of the Chrysopedia API (no trailing slash).
        query: Free-text search query.
        limit: Maximum number of results to request.

    Returns:
        A QdrantSearchResponse. On failure ``error`` is set, the result
        lists are empty, and ``latency_ms`` still reflects the failed
        round trip.
    """
    url = f"{api_url}/api/v1/search"
    params = {"q": query, "scope": "all", "limit": limit}

    start = time.monotonic()
    try:
        resp = httpx.get(url, params=params, timeout=15)
        latency = (time.monotonic() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as e:
        # ValueError covers json.JSONDecodeError from resp.json() when the
        # server replies 200 with a non-JSON body (e.g. an HTML error page);
        # previously that exception escaped and aborted the whole run.
        latency = (time.monotonic() - start) * 1000
        return QdrantSearchResponse(query=query, latency_ms=latency, error=str(e))

    def _to_result(item: dict[str, Any]) -> SearchResult:
        # Shared parser for both full and partial matches (previously duplicated).
        return SearchResult(
            title=item.get("title", ""),
            score=item.get("score", 0.0),
            snippet=item.get("summary", "")[:200],
            result_type=item.get("type", ""),
            creator=item.get("creator_name", ""),
            slug=item.get("slug", ""),
        )

    return QdrantSearchResponse(
        query=query,
        results=[_to_result(item) for item in data.get("items", [])],
        partial_matches=[_to_result(item) for item in data.get("partial_matches", [])],
        total=data.get("total", 0),
        latency_ms=latency,
    )
|
|
|
|
|
|
# ── LightRAG client ─────────────────────────────────────────────────────────
|
|
|
|
def query_lightrag(lightrag_url: str, query: str, mode: str = "hybrid") -> LightRAGResponse:
    """Query the LightRAG API.

    Args:
        lightrag_url: Base URL of the LightRAG service (no trailing slash).
        query: Free-text query.
        mode: LightRAG retrieval mode (passed through verbatim).

    Returns:
        A LightRAGResponse. On failure ``error`` is set, the response text
        is empty, and ``latency_ms`` still reflects the failed round trip.
    """
    url = f"{lightrag_url}/query"
    payload = {"query": query, "mode": mode}

    start = time.monotonic()
    try:
        # LightRAG queries involve LLM inference — can take 2-4 minutes each
        resp = httpx.post(url, json=payload, timeout=300)
        latency = (time.monotonic() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as e:
        # ValueError covers json.JSONDecodeError from resp.json() when the
        # server replies 200 with a non-JSON body; previously that exception
        # escaped the httpx.HTTPError handler and aborted the whole run.
        latency = (time.monotonic() - start) * 1000
        return LightRAGResponse(query=query, latency_ms=latency, error=str(e))

    return LightRAGResponse(
        query=query,
        response_text=data.get("response", ""),
        references=[
            {"id": ref.get("reference_id", ""), "file_path": ref.get("file_path", "")}
            for ref in data.get("references", [])
        ],
        latency_ms=latency,
    )
|
|
|
|
|
|
# ── Scoring ──────────────────────────────────────────────────────────────────
|
|
|
|
def _token_overlap(query: str, text: str) -> float:
|
|
"""Fraction of query tokens found in text (case-insensitive)."""
|
|
if not text:
|
|
return 0.0
|
|
query_tokens = {t.lower() for t in query.split() if len(t) > 2}
|
|
if not query_tokens:
|
|
return 0.0
|
|
text_lower = text.lower()
|
|
matched = sum(1 for t in query_tokens if t in text_lower)
|
|
return matched / len(query_tokens)
|
|
|
|
|
|
def score_qdrant_results(comp: QueryComparison) -> None:
    """Score Qdrant results on relevance, coverage, and diversity.

    Mutates ``comp`` in place; no-op when the Qdrant call failed.
    """
    if not comp.qdrant or comp.qdrant.error:
        return

    # Fall back to partial matches when there are no primary hits.
    pool = comp.qdrant.results or comp.qdrant.partial_matches
    if not pool:
        comp.qdrant_relevance = 0.0
        comp.qdrant_coverage = 0
        comp.qdrant_diversity = 0
        return

    # Relevance: mean token overlap across the top-5 hits, scaled to 0-5.
    top = pool[:5]
    total_overlap = sum(
        _token_overlap(comp.query, f"{hit.title} {hit.snippet} {hit.creator}")
        for hit in top
    )
    comp.qdrant_relevance = round((total_overlap / len(top)) * 5, 2)

    # Coverage: number of distinct technique pages (by slug).
    comp.qdrant_coverage = len({hit.slug for hit in pool if hit.slug})

    # Diversity: number of distinct creators represented.
    comp.qdrant_diversity = len({hit.creator for hit in pool if hit.creator})
|
|
|
|
|
|
def score_lightrag_results(comp: QueryComparison) -> None:
    """Score LightRAG results on relevance, coverage, and answer quality.

    Mutates ``comp`` in place; no-op when the LightRAG call failed.
    All three scores are zeroed when the response text is empty.
    """
    if not comp.lightrag or comp.lightrag.error:
        return

    text = comp.lightrag.response_text
    refs = comp.lightrag.references

    if not text:
        comp.lightrag_relevance = 0.0
        comp.lightrag_coverage = 0
        comp.lightrag_answer_quality = 0.0
        return

    # Relevance: token overlap between query and response, scaled to 0-5.
    comp.lightrag_relevance = round(_token_overlap(comp.query, text) * 5, 2)

    # Coverage: unique technique pages referenced.
    unique_sources = {r["file_path"] for r in refs if r.get("file_path")}
    comp.lightrag_coverage = len(unique_sources)

    # Answer quality (0-5 composite):
    quality = 0.0

    # Length: longer synthesized answers are generally better (up to a point).
    word_count = len(text.split())
    if word_count > 20:
        quality += 1.0
    if word_count > 100:
        quality += 0.5
    if word_count > 200:
        quality += 0.5

    # References: more cross-page references = better synthesis.
    if len(unique_sources) >= 2:
        quality += 1.0
    if len(unique_sources) >= 4:
        quality += 0.5

    # Structure: has headings, bullet points, or numbered lists.
    if "**" in text or "##" in text:
        quality += 0.5
    if "- " in text or "* " in text:
        quality += 0.5

    # Penalize "no information available"-style answers.
    negative_phrases = ["no information", "not mentioned", "no data", "cannot find"]
    has_negative = any(phrase in text.lower() for phrase in negative_phrases)
    if not has_negative:
        quality += 0.5
    else:
        quality -= 1.0

    # Clamp to the documented 0-5 range: without the lower bound the
    # negative-answer penalty could push the composite below zero
    # (e.g. a short "no data" reply scored -1.0).
    comp.lightrag_answer_quality = round(min(max(quality, 0.0), 5.0), 2)
|
|
|
|
|
|
def determine_winner(comp: QueryComparison) -> None:
    """Determine which backend wins for this query; sets ``comp.winner``.

    Composite score weights: relevance 0.4, coverage 0.3,
    quality/diversity 0.3. Scores within 0.5 of each other are a tie.
    """
    weights = (0.4, 0.3, 0.3)
    qdrant_parts = (
        comp.qdrant_relevance,
        min(comp.qdrant_coverage, 5),  # coverage capped at 5
        min(comp.qdrant_diversity, 3),  # diversity capped at 3
    )
    lightrag_parts = (
        comp.lightrag_relevance,
        min(comp.lightrag_coverage, 5),  # coverage capped at 5
        comp.lightrag_answer_quality,
    )
    qdrant_total = sum(part * w for part, w in zip(qdrant_parts, weights))
    lightrag_total = sum(part * w for part, w in zip(lightrag_parts, weights))

    margin = qdrant_total - lightrag_total
    if abs(margin) < 0.5:
        comp.winner = "tie"
    elif margin > 0:
        comp.winner = "qdrant"
    else:
        comp.winner = "lightrag"
|
|
|
|
|
|
# ── Report generation ────────────────────────────────────────────────────────
|
|
|
|
def generate_markdown_report(comparisons: list[QueryComparison], output_dir: Path) -> Path:
    """Generate a human-readable markdown comparison report.

    Writes ``comparison_report.md`` into *output_dir* and returns its path.
    """
    report: list[str] = []
    emit = report.append

    emit("# Search A/B Comparison: Qdrant vs LightRAG")
    emit(f"\n_Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}_")
    emit(f"\n**Queries evaluated:** {len(comparisons)}")

    # Aggregate win counts and per-backend latencies (failed calls excluded).
    wins = {"qdrant": 0, "lightrag": 0, "tie": 0}
    q_latencies: list[float] = []
    l_latencies: list[float] = []
    for comp in comparisons:
        wins[comp.winner] += 1
        if comp.qdrant and not comp.qdrant.error:
            q_latencies.append(comp.qdrant.latency_ms)
        if comp.lightrag and not comp.lightrag.error:
            l_latencies.append(comp.lightrag.latency_ms)

    emit("\n## Aggregate Results\n")
    emit("| Metric | Qdrant Search | LightRAG |")
    emit("|--------|:-------------:|:--------:|")
    emit(f"| **Wins** | {wins['qdrant']} | {wins['lightrag']} |")
    emit(f"| **Ties** | {wins['tie']} | {wins['tie']} |")

    def _avg_ms(samples: list[float]) -> str:
        # "N/A" when a backend had no successful calls at all.
        return f"{sum(samples) / len(samples):.0f}ms" if samples else "N/A"

    emit(f"| **Avg latency** | {_avg_ms(q_latencies)} | {_avg_ms(l_latencies)} |")

    n = len(comparisons)
    avg_qr = sum(c.qdrant_relevance for c in comparisons) / n if n else 0
    avg_lr = sum(c.lightrag_relevance for c in comparisons) / n if n else 0
    emit(f"| **Avg relevance** | {avg_qr:.2f}/5 | {avg_lr:.2f}/5 |")

    avg_qc = sum(c.qdrant_coverage for c in comparisons) / n if n else 0
    avg_lc = sum(c.lightrag_coverage for c in comparisons) / n if n else 0
    emit(f"| **Avg coverage** | {avg_qc:.1f} pages | {avg_lc:.1f} refs |")

    # Per-query detail table.
    emit("\n## Per-Query Comparison\n")
    emit("| # | Query | Type | Qdrant Rel | LR Rel | Qdrant Cov | LR Cov | LR Quality | Winner |")
    emit("|---|-------|------|:----------:|:------:|:----------:|:------:|:----------:|:------:|")

    badges = {"qdrant": "🔵", "lightrag": "🟢", "tie": "⚪"}
    for idx, comp in enumerate(comparisons, 1):
        shown = comp.query[:45] + "…" if len(comp.query) > 45 else comp.query
        emit(
            f"| {idx} | {shown} | {comp.query_type} | {comp.qdrant_relevance:.1f} | "
            f"{comp.lightrag_relevance:.1f} | {comp.qdrant_coverage} | {comp.lightrag_coverage} | "
            f"{comp.lightrag_answer_quality:.1f} | {badges[comp.winner]} {comp.winner} |"
        )

    # Detailed look at up to five decisive (non-tie) queries.
    emit("\n## Notable Comparisons\n")
    decisive = [c for c in comparisons if c.winner != "tie"][:5]
    for comp in decisive:
        emit(f"### Query: \"{comp.query}\"\n")
        emit(f"**Winner: {comp.winner}**\n")

        if comp.qdrant and comp.qdrant.results:
            emit("**Qdrant results:**")
            for hit in comp.qdrant.results[:3]:
                emit(f"- {hit.title} (by {hit.creator}, score: {hit.score:.2f})")
            emit("")

        if comp.lightrag and comp.lightrag.response_text:
            # Show first 300 chars of the LightRAG answer.
            full_text = comp.lightrag.response_text
            preview = full_text[:300] + ("…" if len(full_text) > 300 else "")
            emit(f"**LightRAG response preview:**")
            emit(f"> {preview}\n")
            if comp.lightrag.references:
                sources = [ref["file_path"] for ref in comp.lightrag.references[:5]]
                emit(f"References: {', '.join(sources)}\n")

    # Data coverage note.
    emit("\n## Data Coverage Note\n")
    emit(
        "LightRAG has 18 of 93 technique pages indexed. "
        "Results may improve significantly after full reindexing. "
        "Qdrant has all 93 pages embedded."
    )

    out_path = output_dir / "comparison_report.md"
    out_path.write_text("\n".join(report), encoding="utf-8")
    return out_path
|
|
|
|
|
|
def generate_json_report(comparisons: list[QueryComparison], output_dir: Path) -> Path:
    """Write full structured comparison data to JSON.

    Writes ``comparison_report.json`` into *output_dir* and returns its path.

    Uses dataclasses.asdict (already imported at module top) instead of a
    hand-rolled recursive serializer: it recurses through nested dataclasses,
    lists, and dicts, which is exactly what QueryComparison needs. Any
    remaining non-JSON value (e.g. datetime) is stringified via default=str.
    """
    data = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "query_count": len(comparisons),
        "comparisons": [asdict(c) for c in comparisons],
    }

    report_path = output_dir / "comparison_report.json"
    report_path.write_text(json.dumps(data, indent=2, default=str), encoding="utf-8")
    return report_path
|
|
|
|
|
|
# ── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
def main() -> None:
    """Parse CLI args, run the query set against both backends, score each
    query, and write markdown + JSON comparison reports."""
    parser = argparse.ArgumentParser(description="A/B compare Qdrant search vs LightRAG")
    parser.add_argument(
        "--api-url",
        default=os.environ.get("API_URL", "http://127.0.0.1:8000"),
        help="Chrysopedia API base URL (default: http://127.0.0.1:8000)",
    )
    parser.add_argument(
        "--lightrag-url",
        default=os.environ.get("LIGHTRAG_URL", "http://chrysopedia-lightrag:9621"),
        help="LightRAG API base URL (default: http://chrysopedia-lightrag:9621)",
    )
    parser.add_argument(
        "--output-dir",
        default=os.environ.get("OUTPUT_DIR", "/app/scripts/output"),
        help="Output directory for reports",
    )
    parser.add_argument("--limit", type=int, default=None, help="Process only first N queries")
    parser.add_argument("--dry-run", action="store_true", help="Show query set without executing")
    parser.add_argument("--verbose", "-v", action="store_true", help="Debug logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    # NOTE: --limit 0 is falsy and therefore runs the FULL set, same as no limit.
    queries = ALL_QUERIES[:args.limit] if args.limit else ALL_QUERIES

    # Dry run: list the queries with their type tag and exit without any I/O.
    if args.dry_run:
        print(f"Query set ({len(queries)} queries):")
        for i, q in enumerate(queries, 1):
            qtype = "user" if q in USER_QUERIES else "curated"
            print(f" {i:2d}. [{qtype:>7s}] {q}")
        return

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    comparisons: list[QueryComparison] = []

    for i, query in enumerate(queries, 1):
        # Tag by membership in the user-derived list; everything else is curated.
        qtype = "user" if query in USER_QUERIES else "curated"
        logger.info("[%d/%d] Query: %r (%s)", i, len(queries), query, qtype)

        # Query both backends (sequentially; LightRAG dominates wall time).
        qdrant_resp = query_qdrant_search(args.api_url, query)
        lightrag_resp = query_lightrag(args.lightrag_url, query)

        # Backend errors are logged but do not abort the run — the scoring
        # functions skip errored responses, leaving zero scores.
        if qdrant_resp.error:
            logger.warning(" Qdrant error: %s", qdrant_resp.error)
        else:
            logger.info(" Qdrant: %d results in %.0fms", qdrant_resp.total, qdrant_resp.latency_ms)

        if lightrag_resp.error:
            logger.warning(" LightRAG error: %s", lightrag_resp.error)
        else:
            ref_count = len(lightrag_resp.references)
            word_count = len(lightrag_resp.response_text.split())
            logger.info(" LightRAG: %d words, %d refs in %.0fms", word_count, ref_count, lightrag_resp.latency_ms)

        comp = QueryComparison(query=query, query_type=qtype, qdrant=qdrant_resp, lightrag=lightrag_resp)

        # Score (mutates comp in place), then pick the per-query winner.
        score_qdrant_results(comp)
        score_lightrag_results(comp)
        determine_winner(comp)

        logger.info(
            " Scores → Qdrant: rel=%.1f cov=%d div=%d | LightRAG: rel=%.1f cov=%d qual=%.1f | Winner: %s",
            comp.qdrant_relevance, comp.qdrant_coverage, comp.qdrant_diversity,
            comp.lightrag_relevance, comp.lightrag_coverage, comp.lightrag_answer_quality,
            comp.winner,
        )

        comparisons.append(comp)

    # Generate reports
    logger.info("Generating reports...")
    md_path = generate_markdown_report(comparisons, output_dir)
    json_path = generate_json_report(comparisons, output_dir)

    # Summary (win tally recomputed here for the console output)
    wins = {"qdrant": 0, "lightrag": 0, "tie": 0}
    for c in comparisons:
        wins[c.winner] += 1

    print(f"\n{'=' * 60}")
    print(f"Comparison complete: {len(comparisons)} queries")
    print(f" Qdrant wins: {wins['qdrant']}")
    print(f" LightRAG wins: {wins['lightrag']}")
    print(f" Ties: {wins['tie']}")
    print(f"\nReports:")
    print(f" {md_path}")
    print(f" {json_path}")
    print(f"{'=' * 60}")


if __name__ == "__main__":
    main()
|