chrysopedia/backend/pipeline/quality/results/chat_eval_baseline.json
jlightner 4854dad086 feat: Ran manual chat evaluation against live endpoint, documented qual…
- ".gsd/milestones/M025/slices/S09/S09-QUALITY-REPORT.md"
- "backend/pipeline/quality/results/chat_eval_baseline.json"

GSD-Task: S09/T03
2026-04-04 14:50:44 +00:00

91 lines
4.2 KiB
JSON

{
"timestamp": "20260404_043200",
"evaluation_method": "manual_curl",
"llm_status": "unavailable (upstream 502 Bad Gateway at chat.forgetyour.name)",
"api_health": "ok",
"total_queries": 6,
"scored_queries": 0,
"errors_llm": 6,
"note": "LLM completions unavailable — only source retrieval quality assessed. Re-run with automated eval when LLM proxy is restored.",
"source_retrieval_results": [
{
"query": "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?",
"creator": null,
"personality_weight": 0.0,
"category": "technical",
"source_count": 10,
"unique_creators": ["Break", "Caracal Project, The", "Chee", "KOAN Sound"],
"creator_distribution": {"Break": 3, "Caracal Project, The": 2, "Chee": 2, "KOAN Sound": 1},
"relevance_assessment": "highly_relevant",
"notes": "All 10 sources directly about sidechain compression. Good creator diversity. Caveat: creator_distribution accounts for only 8 of the 10 sources (3+2+2+1) — re-verify the per-creator counts on the next run."
},
{
"query": "What are the different approaches to layering synth sounds across creators?",
"creator": null,
"personality_weight": 0.0,
"category": "cross_creator",
"source_count": 10,
"unique_creators": ["Chee", "COPYCATT", "Caracal Project, The", "Current Value", "Emperor"],
"creator_distribution": {"Chee": 5, "COPYCATT": 2, "Caracal Project, The": 1, "Current Value": 1, "Emperor": 1},
"relevance_assessment": "relevant_but_skewed",
"notes": "50% of sources from Chee — cross-creator diversity could be improved."
},
{
"query": "How does this creator approach sound design for bass sounds?",
"creator": "Keota",
"personality_weight": 0.0,
"category": "creator_encyclopedic",
"source_count": 10,
"unique_creators": ["COPYCATT", "Break", "Chee", "Caracal Project, The"],
"creator_distribution": {"COPYCATT": 2, "Break": 2, "Chee": 3, "Caracal Project, The": 3},
"relevance_assessment": "creator_scope_failure",
"notes": "Zero sources from Keota despite creator-scoped query. Cascade fell through to global tier."
},
{
"query": "What mixing techniques does this creator recommend for achieving width in a mix?",
"creator": "Mr. Bill",
"personality_weight": 0.0,
"category": "creator_encyclopedic",
"source_count": 10,
"unique_creators": ["Break", "Frequent", "Caracal Project, The", "COPYCATT", "Chee"],
"creator_distribution": {"Break": 2, "Frequent": 1, "Caracal Project, The": 2, "COPYCATT": 2, "Chee": 3},
"relevance_assessment": "creator_scope_failure",
"notes": "Zero sources from Mr. Bill despite creator-scoped query."
},
{
"query": "How does this creator approach sound design for bass sounds? (personality)",
"creator": "Keota",
"personality_weight": 0.7,
"category": "creator_personality",
"source_count": 10,
"personality_profile_exists": false,
"notes": "Personality weight=0.7 accepted but no profile data exists — falls back to encyclopedic mode silently."
},
{
"query": "What mixing techniques does this creator recommend for width? (personality)",
"creator": "Mr. Bill",
"personality_weight": 0.7,
"category": "creator_personality",
"source_count": 10,
"personality_profile_exists": false,
"notes": "Personality weight=0.7 accepted but no profile data exists — falls back to encyclopedic mode silently."
}
],
"personality_profiles_status": {
"total_creators": 25,
"creators_with_profile": 0,
"assessment": "No personality profiles populated. The 5-tier progressive injection system is architecturally complete (26 unit tests pass) but functionally inert on the live system."
},
"prompt_changes": {
"before_lines": 4,
"after_lines": 18,
"changes": [
"Added structured citation guidance with inline example",
"Added response format section (2-4 paragraphs, bullet lists, bold terms)",
"Added domain awareness (music production subdomain list)",
"Added conflicting source handling instruction",
"Added response length guidance"
],
"test_impact": "Zero test modifications needed — all 26 tests pass unchanged"
}
}