From 4a3bb8208a36848896f4c26c66da89b91b915127 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sat, 4 Apr 2026 14:50:44 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Ran=20manual=20chat=20evaluation=20agai?= =?UTF-8?q?nst=20live=20endpoint,=20documented=20qual=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ".gsd/milestones/M025/slices/S09/S09-QUALITY-REPORT.md" - "backend/pipeline/quality/results/chat_eval_baseline.json" GSD-Task: S09/T03 --- .../quality/results/chat_eval_baseline.json | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 backend/pipeline/quality/results/chat_eval_baseline.json diff --git a/backend/pipeline/quality/results/chat_eval_baseline.json b/backend/pipeline/quality/results/chat_eval_baseline.json new file mode 100644 index 0000000..8fff98c --- /dev/null +++ b/backend/pipeline/quality/results/chat_eval_baseline.json @@ -0,0 +1,91 @@ +{ + "timestamp": "20260404_043200", + "evaluation_method": "manual_curl", + "llm_status": "unavailable (upstream 502 Bad Gateway at chat.forgetyour.name)", + "api_health": "ok", + "total_queries": 6, + "scored_queries": 0, + "errors_llm": 6, + "note": "LLM completions unavailable — only source retrieval quality assessed. Re-run with automated eval when LLM proxy is restored.", + "source_retrieval_results": [ + { + "query": "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?", + "creator": null, + "personality_weight": 0.0, + "category": "technical", + "source_count": 10, + "unique_creators": ["Break", "Caracal Project, The", "Chee", "KOAN Sound"], + "creator_distribution": {"Break": 3, "Caracal Project, The": 2, "Chee": 2, "KOAN Sound": 1}, + "relevance_assessment": "highly_relevant", + "notes": "All 10 sources directly about sidechain compression. Good creator diversity." + }, + { + "query": "What are the different approaches to layering synth sounds across creators?", + "creator": null, + "personality_weight": 0.0, + "category": "cross_creator", + "source_count": 10, + "unique_creators": ["Chee", "COPYCATT", "Caracal Project, The", "Current Value", "Emperor"], + "creator_distribution": {"Chee": 5, "COPYCATT": 2, "Caracal Project, The": 1, "Current Value": 1, "Emperor": 1}, + "relevance_assessment": "relevant_but_skewed", + "notes": "50% of sources from Chee — cross-creator diversity could be improved." + }, + { + "query": "How does this creator approach sound design for bass sounds?", + "creator": "Keota", + "personality_weight": 0.0, + "category": "creator_encyclopedic", + "source_count": 10, + "unique_creators": ["COPYCATT", "Break", "Chee", "Caracal Project, The"], + "creator_distribution": {"COPYCATT": 2, "Break": 2, "Chee": 3, "Caracal Project, The": 3}, + "relevance_assessment": "creator_scope_failure", + "notes": "Zero sources from Keota despite creator-scoped query. Cascade fell through to global tier." + }, + { + "query": "What mixing techniques does this creator recommend for achieving width in a mix?", + "creator": "Mr. Bill", + "personality_weight": 0.0, + "category": "creator_encyclopedic", + "source_count": 10, + "unique_creators": ["Break", "Frequent", "Caracal Project, The", "COPYCATT", "Chee"], + "creator_distribution": {"Break": 2, "Frequent": 1, "Caracal Project, The": 2, "COPYCATT": 2, "Chee": 3}, + "relevance_assessment": "creator_scope_failure", + "notes": "Zero sources from Mr. Bill despite creator-scoped query." + }, + { + "query": "How does this creator approach sound design for bass sounds? (personality)", + "creator": "Keota", + "personality_weight": 0.7, + "category": "creator_personality", + "source_count": 10, + "personality_profile_exists": false, + "notes": "Personality weight=0.7 accepted but no profile data exists — falls back to encyclopedic mode silently." + }, + { + "query": "What mixing techniques does this creator recommend for width? (personality)", + "creator": "Mr. Bill", + "personality_weight": 0.7, + "category": "creator_personality", + "source_count": 10, + "personality_profile_exists": false, + "notes": "Personality weight=0.7 accepted but no profile data exists — falls back to encyclopedic mode silently." + } + ], + "personality_profiles_status": { + "total_creators": 25, + "creators_with_profile": 0, + "assessment": "No personality profiles populated. The 5-tier progressive injection system is architecturally complete (26 unit tests pass) but functionally inert on the live system." + }, + "prompt_changes": { + "before_lines": 4, + "after_lines": 18, + "changes": [ + "Added structured citation guidance with inline example", + "Added response format section (2-4 paragraphs, bullet lists, bold terms)", + "Added domain awareness (music production subdomain list)", + "Added conflicting source handling instruction", + "Added response length guidance" + ], + "test_impact": "Zero test modifications needed — all 26 tests pass unchanged" + } +}