#!/usr/bin/env python3 """A/B comparison of Chrysopedia's Qdrant search vs LightRAG retrieval. Runs a set of queries against both backends and produces a scored comparison report. Designed to run inside the chrysopedia-api container (has network access to both services) or via tunneled URLs. Usage: # Dry run — show query set without executing python3 /app/scripts/compare_search.py --dry-run # Run first 5 queries python3 /app/scripts/compare_search.py --limit 5 # Full comparison python3 /app/scripts/compare_search.py # Custom URLs python3 /app/scripts/compare_search.py --api-url http://localhost:8000 --lightrag-url http://localhost:9621 """ from __future__ import annotations import argparse import json import logging import os import sys import time from dataclasses import asdict, dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import httpx logger = logging.getLogger("compare_search") # ── Query set ──────────────────────────────────────────────────────────────── # Real user queries (from search_log) USER_QUERIES = [ "squelch", "keota snare", "reverb", "how does keota snare", "bass", "groove", "drums", "fx", "textures", "daw setup", "synthesis", "how does keota", "over-leveling snare to control compression behavior", ] # Curated domain queries — test different retrieval patterns CURATED_QUERIES = [ # Broad topic queries "bass design techniques", "reverb chains and spatial effects", "how to layer drums", # Cross-entity synthesis (LightRAG strength) "what plugins are commonly used for bass sounds", "compare different approaches to snare layering", "how do different producers approach sound design", # Exact lookup (Qdrant strength) "COPYCATT", "Emperor arrangement", # How-to / procedural "how to create tension in a buildup", "step by step resampling workflow", # Concept queries "frequency spectrum balance", "signal chain for drums", ] ALL_QUERIES = USER_QUERIES + CURATED_QUERIES # ── Data structures 
# ── Data structures ──────────────────────────────────────────────────────────


@dataclass
class SearchResult:
    """One hit returned by the Chrysopedia search API."""

    title: str
    score: float
    snippet: str
    result_type: str = ""
    creator: str = ""
    slug: str = ""


@dataclass
class QdrantSearchResponse:
    """Outcome of one search-API call; ``error`` is non-empty on failure."""

    query: str
    results: list[SearchResult] = field(default_factory=list)
    partial_matches: list[SearchResult] = field(default_factory=list)
    total: int = 0
    latency_ms: float = 0.0
    error: str = ""


@dataclass
class LightRAGResponse:
    """Outcome of one LightRAG ``/query`` call; ``error`` is non-empty on failure."""

    query: str
    response_text: str = ""
    references: list[dict[str, Any]] = field(default_factory=list)
    latency_ms: float = 0.0
    error: str = ""


@dataclass
class QueryComparison:
    """Both backends' responses for one query plus the derived scores."""

    query: str
    query_type: str  # "user" or "curated"
    qdrant: QdrantSearchResponse | None = None
    lightrag: LightRAGResponse | None = None
    # Scores (populated by scoring phase)
    qdrant_relevance: float = 0.0
    qdrant_coverage: int = 0
    qdrant_diversity: int = 0
    lightrag_relevance: float = 0.0
    lightrag_coverage: int = 0
    lightrag_answer_quality: float = 0.0
    winner: str = ""  # "qdrant", "lightrag", "tie"


def _describe_error(exc: Exception) -> str:
    """Return a never-empty description of *exc*.

    Callers treat an empty ``error`` field as success, and ``str(exc)`` is
    empty for exceptions raised without a message, so the class name is
    always included.
    """
    message = str(exc)
    name = type(exc).__name__
    return f"{name}: {message}" if message else name


# ── Qdrant search client ────────────────────────────────────────────────────


def _parse_search_items(items: Any) -> list[SearchResult]:
    """Convert raw search-API item dicts into ``SearchResult`` records.

    Missing or JSON-``null`` fields fall back to empty values, and entries
    that are not objects are skipped rather than aborting the run.
    """
    parsed: list[SearchResult] = []
    for item in items or []:
        if not isinstance(item, dict):
            continue
        parsed.append(
            SearchResult(
                title=item.get("title") or "",
                score=item.get("score") or 0.0,
                snippet=(item.get("summary") or "")[:200],
                result_type=item.get("type") or "",
                creator=item.get("creator_name") or "",
                slug=item.get("slug") or "",
            )
        )
    return parsed


def query_qdrant_search(api_url: str, query: str, limit: int = 20) -> QdrantSearchResponse:
    """Query the Chrysopedia search API (Qdrant + keyword).

    Args:
        api_url: Base URL of the API; a trailing slash is tolerated.
        query: Search text sent as the ``q`` parameter.
        limit: Maximum number of results requested.

    Returns:
        A ``QdrantSearchResponse``. Transport errors, HTTP error statuses and
        malformed response bodies are reported through its ``error`` field
        instead of being raised.
    """
    url = f"{api_url.rstrip('/')}/api/v1/search"
    params = {"q": query, "scope": "all", "limit": limit}
    start = time.monotonic()
    try:
        resp = httpx.get(url, params=params, timeout=15)
        latency = (time.monotonic() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        latency = (time.monotonic() - start) * 1000
        return QdrantSearchResponse(query=query, latency_ms=latency, error=_describe_error(e))

    if not isinstance(data, dict):
        return QdrantSearchResponse(
            query=query,
            latency_ms=latency,
            error=f"unexpected response payload of type {type(data).__name__}",
        )

    return QdrantSearchResponse(
        query=query,
        results=_parse_search_items(data.get("items")),
        partial_matches=_parse_search_items(data.get("partial_matches")),
        total=data.get("total") or 0,
        latency_ms=latency,
    )


# ── LightRAG client ─────────────────────────────────────────────────────────


def query_lightrag(lightrag_url: str, query: str, mode: str = "hybrid") -> LightRAGResponse:
    """Query the LightRAG API.

    Args:
        lightrag_url: Base URL of the LightRAG service; a trailing slash is
            tolerated.
        query: Question text.
        mode: Retrieval mode forwarded in the request payload.

    Returns:
        A ``LightRAGResponse``. Transport errors, HTTP error statuses and
        malformed response bodies are reported through its ``error`` field
        instead of being raised.
    """
    url = f"{lightrag_url.rstrip('/')}/query"
    payload = {"query": query, "mode": mode}
    start = time.monotonic()
    try:
        # LightRAG queries involve LLM inference — can take 2-4 minutes each
        resp = httpx.post(url, json=payload, timeout=300)
        latency = (time.monotonic() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        latency = (time.monotonic() - start) * 1000
        return LightRAGResponse(query=query, latency_ms=latency, error=_describe_error(e))

    if not isinstance(data, dict):
        return LightRAGResponse(
            query=query,
            latency_ms=latency,
            error=f"unexpected response payload of type {type(data).__name__}",
        )

    references = [
        {"id": ref.get("reference_id", ""), "file_path": ref.get("file_path") or ""}
        for ref in data.get("references") or []
        if isinstance(ref, dict)
    ]
    return LightRAGResponse(
        query=query,
        response_text=data.get("response") or "",
        references=references,
        latency_ms=latency,
    )


# ── Scoring ──────────────────────────────────────────────────────────────────


def _token_overlap(query: str, text: str) -> float:
    """Fraction of query tokens found in text (case-insensitive).

    Tokens of one or two characters are ignored as noise, unless that would
    discard the whole query (e.g. ``"fx"``), in which case every token is
    used. Matching is by substring, not by whole word.
    """
    if not text:
        return 0.0
    tokens = {t.lower() for t in query.split()}
    significant = {t for t in tokens if len(t) > 2}
    query_tokens = significant or tokens
    if not query_tokens:
        return 0.0
    text_lower = text.lower()
    matched = sum(1 for t in query_tokens if t in text_lower)
    return matched / len(query_tokens)


def score_qdrant_results(comp: QueryComparison) -> None:
    """Score Qdrant results on relevance, coverage, and diversity.

    Updates ``comp`` in place. Scores are left untouched when the backend
    response is missing or failed. Partial matches are scored only when
    there are no full results.
    """
    if not comp.qdrant or comp.qdrant.error:
        return

    results = comp.qdrant.results or comp.qdrant.partial_matches
    if not results:
        comp.qdrant_relevance = 0.0
        comp.qdrant_coverage = 0
        comp.qdrant_diversity = 0
        return

    # Relevance: average token overlap across top-5 results, scaled to 0-5
    overlaps = [
        _token_overlap(comp.query, f"{r.title} {r.snippet} {r.creator}")
        for r in results[:5]
    ]
    comp.qdrant_relevance = round((sum(overlaps) / len(overlaps)) * 5, 2)

    # Coverage: unique technique pages
    comp.qdrant_coverage = len({r.slug for r in results if r.slug})

    # Diversity: unique creators
    comp.qdrant_diversity = len({r.creator for r in results if r.creator})


def score_lightrag_results(comp: QueryComparison) -> None:
    """Score LightRAG results on relevance, coverage, and answer quality.

    Updates ``comp`` in place. Scores are left untouched when the backend
    response is missing or failed. Answer quality is a heuristic composite
    clamped to the 0-5 range.
    """
    if not comp.lightrag or comp.lightrag.error:
        return

    text = comp.lightrag.response_text
    refs = comp.lightrag.references

    if not text:
        comp.lightrag_relevance = 0.0
        comp.lightrag_coverage = 0
        comp.lightrag_answer_quality = 0.0
        return

    # Relevance: token overlap between query and response, scaled to 0-5
    comp.lightrag_relevance = round(_token_overlap(comp.query, text) * 5, 2)

    # Coverage: unique technique pages referenced
    unique_sources = {r["file_path"] for r in refs if r.get("file_path")}
    comp.lightrag_coverage = len(unique_sources)

    # Answer quality (0-5 composite):
    quality = 0.0

    # Length: longer synthesized answers are generally better (up to a point)
    word_count = len(text.split())
    if word_count > 20:
        quality += 1.0
    if word_count > 100:
        quality += 0.5
    if word_count > 200:
        quality += 0.5

    # References: more cross-page references = better synthesis
    if len(unique_sources) >= 2:
        quality += 1.0
    if len(unique_sources) >= 4:
        quality += 0.5

    # Structure: has headings, bullet points, or numbered lists
    if "**" in text or "##" in text:
        quality += 0.5
    if "- " in text or "* " in text:
        quality += 0.5

    # Doesn't say "no information available" or similar
    negative_phrases = ["no information", "not mentioned", "no data", "cannot find"]
    text_lower = text.lower()
    if any(phrase in text_lower for phrase in negative_phrases):
        quality -= 1.0
    else:
        quality += 0.5

    # Clamp on both sides: the penalty alone would otherwise yield -1.0.
    comp.lightrag_answer_quality = round(max(0.0, min(quality, 5.0)), 2)


def determine_winner(comp: QueryComparison) -> None:
    """Determine which backend wins for this query.

    Sets ``comp.winner`` to ``"qdrant"``, ``"lightrag"`` or ``"tie"``; a
    composite-score difference below 0.5 counts as a tie. A backend that
    failed keeps its default zero scores and is compared as such.
    """
    # Composite score: relevance weight 0.4, coverage 0.3, quality/diversity 0.3
    qdrant_score = (
        comp.qdrant_relevance * 0.4
        + min(comp.qdrant_coverage, 5) * 0.3
        + min(comp.qdrant_diversity, 3) * 0.3
    )
    lightrag_score = (
        comp.lightrag_relevance * 0.4
        + min(comp.lightrag_coverage, 5) * 0.3
        + comp.lightrag_answer_quality * 0.3
    )

    if abs(qdrant_score - lightrag_score) < 0.5:
        comp.winner = "tie"
    elif qdrant_score > lightrag_score:
        comp.winner = "qdrant"
    else:
        comp.winner = "lightrag"


# ── Report generation ────────────────────────────────────────────────────────


def _count_wins(comparisons: list[QueryComparison]) -> dict[str, int]:
    """Tally winners; raises ``KeyError`` for a comparison that was never scored."""
    wins = {"qdrant": 0, "lightrag": 0, "tie": 0}
    for c in comparisons:
        wins[c.winner] += 1
    return wins


def _mean(values: list[float]) -> float:
    """Arithmetic mean of *values*, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0


def _table_cell(text: str) -> str:
    """Make *text* safe for a single markdown table cell."""
    return text.replace("\r", " ").replace("\n", " ").replace("|", "\\|")


def generate_markdown_report(comparisons: list[QueryComparison], output_dir: Path) -> Path:
    """Generate a human-readable markdown comparison report.

    Args:
        comparisons: Scored comparisons (``winner`` must be set on each).
        output_dir: Existing directory that receives ``comparison_report.md``.

    Returns:
        Path of the written report.
    """
    lines: list[str] = []

    lines.append("# Search A/B Comparison: Qdrant vs LightRAG")
    lines.append(f"\n_Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}_")
    lines.append(f"\n**Queries evaluated:** {len(comparisons)}")

    # Aggregate stats
    wins = _count_wins(comparisons)
    # Latency averages only cover calls that succeeded.
    qdrant_latencies = [c.qdrant.latency_ms for c in comparisons if c.qdrant and not c.qdrant.error]
    lightrag_latencies = [
        c.lightrag.latency_ms for c in comparisons if c.lightrag and not c.lightrag.error
    ]

    lines.append("\n## Aggregate Results\n")
    lines.append("| Metric | Qdrant Search | LightRAG |")
    lines.append("|--------|:-------------:|:--------:|")
    lines.append(f"| **Wins** | {wins['qdrant']} | {wins['lightrag']} |")
    lines.append(f"| **Ties** | {wins['tie']} | {wins['tie']} |")

    avg_q_str = f"{_mean(qdrant_latencies):.0f}ms" if qdrant_latencies else "N/A"
    avg_l_str = f"{_mean(lightrag_latencies):.0f}ms" if lightrag_latencies else "N/A"
    lines.append(f"| **Avg latency** | {avg_q_str} | {avg_l_str} |")

    avg_qr = _mean([c.qdrant_relevance for c in comparisons])
    avg_lr = _mean([c.lightrag_relevance for c in comparisons])
    lines.append(f"| **Avg relevance** | {avg_qr:.2f}/5 | {avg_lr:.2f}/5 |")

    avg_qc = _mean([c.qdrant_coverage for c in comparisons])
    avg_lc = _mean([c.lightrag_coverage for c in comparisons])
    lines.append(f"| **Avg coverage** | {avg_qc:.1f} pages | {avg_lc:.1f} refs |")

    # Per-query detail
    lines.append("\n## Per-Query Comparison\n")
    lines.append("| # | Query | Type | Qdrant Rel | LR Rel | Qdrant Cov | LR Cov | LR Quality | Winner |")
    lines.append("|---|-------|------|:----------:|:------:|:----------:|:------:|:----------:|:------:|")

    winner_emojis = {"qdrant": "🔵", "lightrag": "🟢", "tie": "⚪"}
    for i, c in enumerate(comparisons, 1):
        truncated = c.query[:45] + "…" if len(c.query) > 45 else c.query
        # A raw "|" or newline in the query would break the table row.
        q_display = _table_cell(truncated)
        winner_emoji = winner_emojis[c.winner]
        lines.append(
            f"| {i} | {q_display} | {c.query_type} | {c.qdrant_relevance:.1f} | "
            f"{c.lightrag_relevance:.1f} | {c.qdrant_coverage} | {c.lightrag_coverage} | "
            f"{c.lightrag_answer_quality:.1f} | {winner_emoji} {c.winner} |"
        )

    # Detailed results for interesting queries
    lines.append("\n## Notable Comparisons\n")

    # Pick queries where there's a clear winner with interesting differences
    notable = [c for c in comparisons if c.winner != "tie"][:5]
    for c in notable:
        lines.append(f"### Query: \"{c.query}\"\n")
        lines.append(f"**Winner: {c.winner}**\n")

        if c.qdrant and c.qdrant.results:
            lines.append("**Qdrant results:**")
            for r in c.qdrant.results[:3]:
                lines.append(f"- {r.title} (by {r.creator}, score: {r.score:.2f})")
            lines.append("")

        if c.lightrag and c.lightrag.response_text:
            # Show first 300 chars of LightRAG response
            preview = c.lightrag.response_text[:300]
            if len(c.lightrag.response_text) > 300:
                preview += "…"
            # Quote every line so a multi-line answer stays in one blockquote.
            quoted = "\n".join(f"> {line}" for line in preview.splitlines())
            lines.append("**LightRAG response preview:**")
            lines.append(f"{quoted}\n")
            if c.lightrag.references:
                ref_slugs = [
                    r["file_path"] for r in c.lightrag.references[:5] if r.get("file_path")
                ]
                lines.append(f"References: {', '.join(ref_slugs)}\n")

    # Data coverage note
    lines.append("\n## Data Coverage Note\n")
    lines.append(
        "LightRAG has 18 of 93 technique pages indexed. "
        "Results may improve significantly after full reindexing. "
        "Qdrant has all 93 pages embedded."
    )

    report_path = output_dir / "comparison_report.md"
    report_path.write_text("\n".join(lines), encoding="utf-8")
    return report_path


def generate_json_report(comparisons: list[QueryComparison], output_dir: Path) -> Path:
    """Write full structured comparison data to JSON.

    Args:
        comparisons: Comparisons to serialize; nested dataclasses are
            expanded recursively into plain dicts.
        output_dir: Existing directory that receives ``comparison_report.json``.

    Returns:
        Path of the written report.
    """
    data = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "query_count": len(comparisons),
        "comparisons": [asdict(c) for c in comparisons],
    }

    report_path = output_dir / "comparison_report.json"
    report_path.write_text(json.dumps(data, indent=2, default=str), encoding="utf-8")
    return report_path


# ── Main ─────────────────────────────────────────────────────────────────────


def main():
    """Parse CLI arguments, run every query against both backends, write reports.

    ``--limit N`` keeps the first N queries (``0`` selects none); a negative
    value is rejected with a usage error.
    """
    parser = argparse.ArgumentParser(description="A/B compare Qdrant search vs LightRAG")
    parser.add_argument(
        "--api-url",
        default=os.environ.get("API_URL", "http://127.0.0.1:8000"),
        help="Chrysopedia API base URL (default: http://127.0.0.1:8000)",
    )
    parser.add_argument(
        "--lightrag-url",
        default=os.environ.get("LIGHTRAG_URL", "http://chrysopedia-lightrag:9621"),
        help="LightRAG API base URL (default: http://chrysopedia-lightrag:9621)",
    )
    parser.add_argument(
        "--output-dir",
        default=os.environ.get("OUTPUT_DIR", "/app/scripts/output"),
        help="Output directory for reports",
    )
    parser.add_argument("--limit", type=int, default=None, help="Process only first N queries")
    parser.add_argument("--dry-run", action="store_true", help="Show query set without executing")
    parser.add_argument("--verbose", "-v", action="store_true", help="Debug logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    if args.limit is not None and args.limit < 0:
        # A negative slice bound would silently drop queries from the end.
        parser.error("--limit must be a non-negative integer")
    queries = ALL_QUERIES if args.limit is None else ALL_QUERIES[: args.limit]
    user_queries = set(USER_QUERIES)

    if args.dry_run:
        print(f"Query set ({len(queries)} queries):")
        for i, q in enumerate(queries, 1):
            qtype = "user" if q in user_queries else "curated"
            print(f"  {i:2d}. [{qtype:>7s}] {q}")
        return

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    comparisons: list[QueryComparison] = []

    for i, query in enumerate(queries, 1):
        qtype = "user" if query in user_queries else "curated"
        logger.info("[%d/%d] Query: %r (%s)", i, len(queries), query, qtype)

        # Query both backends
        qdrant_resp = query_qdrant_search(args.api_url, query)
        lightrag_resp = query_lightrag(args.lightrag_url, query)

        if qdrant_resp.error:
            logger.warning("  Qdrant error: %s", qdrant_resp.error)
        else:
            logger.info("  Qdrant: %d results in %.0fms", qdrant_resp.total, qdrant_resp.latency_ms)

        if lightrag_resp.error:
            logger.warning("  LightRAG error: %s", lightrag_resp.error)
        else:
            ref_count = len(lightrag_resp.references)
            word_count = len(lightrag_resp.response_text.split())
            logger.info(
                "  LightRAG: %d words, %d refs in %.0fms",
                word_count,
                ref_count,
                lightrag_resp.latency_ms,
            )

        comp = QueryComparison(
            query=query, query_type=qtype, qdrant=qdrant_resp, lightrag=lightrag_resp
        )

        # Score
        score_qdrant_results(comp)
        score_lightrag_results(comp)
        determine_winner(comp)

        logger.info(
            "  Scores → Qdrant: rel=%.1f cov=%d div=%d | LightRAG: rel=%.1f cov=%d qual=%.1f | Winner: %s",
            comp.qdrant_relevance,
            comp.qdrant_coverage,
            comp.qdrant_diversity,
            comp.lightrag_relevance,
            comp.lightrag_coverage,
            comp.lightrag_answer_quality,
            comp.winner,
        )

        comparisons.append(comp)

    # Generate reports
    logger.info("Generating reports...")
    md_path = generate_markdown_report(comparisons, output_dir)
    json_path = generate_json_report(comparisons, output_dir)

    # Summary
    wins = _count_wins(comparisons)

    print(f"\n{'=' * 60}")
    print(f"Comparison complete: {len(comparisons)} queries")
    print(f"  Qdrant wins:   {wins['qdrant']}")
    print(f"  LightRAG wins: {wins['lightrag']}")
    print(f"  Ties:          {wins['tie']}")
    print("\nReports:")
    print(f"  {md_path}")
    print(f"  {json_path}")
    print(f"{'=' * 60}")


if __name__ == "__main__":
    main()