feat: Ran manual chat evaluation against live endpoint, documented qual…
- ".gsd/milestones/M025/slices/S09/S09-QUALITY-REPORT.md" - "backend/pipeline/quality/results/chat_eval_baseline.json" GSD-Task: S09/T03
This commit is contained in:
parent
8f7763d822
commit
4a3bb8208a
1 changed files with 91 additions and 0 deletions
91
backend/pipeline/quality/results/chat_eval_baseline.json
Normal file
91
backend/pipeline/quality/results/chat_eval_baseline.json
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
{
|
||||
"timestamp": "20260404_043200",
|
||||
"evaluation_method": "manual_curl",
|
||||
"llm_status": "unavailable (upstream 502 Bad Gateway at chat.forgetyour.name)",
|
||||
"api_health": "ok",
|
||||
"total_queries": 6,
|
||||
"scored_queries": 0,
|
||||
"errors_llm": 6,
|
||||
"note": "LLM completions unavailable — only source retrieval quality assessed. Re-run with automated eval when LLM proxy is restored.",
|
||||
"source_retrieval_results": [
|
||||
{
|
||||
"query": "How do I set up sidechain compression on a bass synth using a kick drum as the trigger?",
|
||||
"creator": null,
|
||||
"personality_weight": 0.0,
|
||||
"category": "technical",
|
||||
"source_count": 10,
|
||||
"unique_creators": ["Break", "Caracal Project, The", "Chee", "KOAN Sound"],
|
||||
"creator_distribution": {"Break": 3, "Caracal Project, The": 2, "Chee": 2, "KOAN Sound": 1},
|
||||
"relevance_assessment": "highly_relevant",
|
||||
"notes": "All 10 sources directly about sidechain compression. Good creator diversity."
|
||||
},
|
||||
{
|
||||
"query": "What are the different approaches to layering synth sounds across creators?",
|
||||
"creator": null,
|
||||
"personality_weight": 0.0,
|
||||
"category": "cross_creator",
|
||||
"source_count": 10,
|
||||
"unique_creators": ["Chee", "COPYCATT", "Caracal Project, The", "Current Value", "Emperor"],
|
||||
"creator_distribution": {"Chee": 5, "COPYCATT": 2, "Caracal Project, The": 1, "Current Value": 1, "Emperor": 1},
|
||||
"relevance_assessment": "relevant_but_skewed",
|
||||
"notes": "50% of sources from Chee — cross-creator diversity could be improved."
|
||||
},
|
||||
{
|
||||
"query": "How does this creator approach sound design for bass sounds?",
|
||||
"creator": "Keota",
|
||||
"personality_weight": 0.0,
|
||||
"category": "creator_encyclopedic",
|
||||
"source_count": 10,
|
||||
"unique_creators": ["COPYCATT", "Break", "Chee", "Caracal Project, The"],
|
||||
"creator_distribution": {"COPYCATT": 2, "Break": 2, "Chee": 3, "Caracal Project, The": 3},
|
||||
"relevance_assessment": "creator_scope_failure",
|
||||
"notes": "Zero sources from Keota despite creator-scoped query. Cascade fell through to global tier."
|
||||
},
|
||||
{
|
||||
"query": "What mixing techniques does this creator recommend for achieving width in a mix?",
|
||||
"creator": "Mr. Bill",
|
||||
"personality_weight": 0.0,
|
||||
"category": "creator_encyclopedic",
|
||||
"source_count": 10,
|
||||
"unique_creators": ["Break", "Frequent", "Caracal Project, The", "COPYCATT", "Chee"],
|
||||
"creator_distribution": {"Break": 2, "Frequent": 1, "Caracal Project, The": 2, "COPYCATT": 2, "Chee": 3},
|
||||
"relevance_assessment": "creator_scope_failure",
|
||||
"notes": "Zero sources from Mr. Bill despite creator-scoped query."
|
||||
},
|
||||
{
|
||||
"query": "How does this creator approach sound design for bass sounds? (personality)",
|
||||
"creator": "Keota",
|
||||
"personality_weight": 0.7,
|
||||
"category": "creator_personality",
|
||||
"source_count": 10,
|
||||
"personality_profile_exists": false,
|
||||
"notes": "Personality weight=0.7 accepted but no profile data exists — falls back to encyclopedic mode silently."
|
||||
},
|
||||
{
|
||||
"query": "What mixing techniques does this creator recommend for width? (personality)",
|
||||
"creator": "Mr. Bill",
|
||||
"personality_weight": 0.7,
|
||||
"category": "creator_personality",
|
||||
"source_count": 10,
|
||||
"personality_profile_exists": false,
|
||||
"notes": "Personality weight=0.7 accepted but no profile data exists — falls back to encyclopedic mode silently."
|
||||
}
|
||||
],
|
||||
"personality_profiles_status": {
|
||||
"total_creators": 25,
|
||||
"creators_with_profile": 0,
|
||||
"assessment": "No personality profiles populated. The 5-tier progressive injection system is architecturally complete (26 unit tests pass) but functionally inert on the live system."
|
||||
},
|
||||
"prompt_changes": {
|
||||
"before_lines": 4,
|
||||
"after_lines": 18,
|
||||
"changes": [
|
||||
"Added structured citation guidance with inline example",
|
||||
"Added response format section (2-4 paragraphs, bullet lists, bold terms)",
|
||||
"Added domain awareness (music production subdomain list)",
|
||||
"Added conflicting source handling instruction",
|
||||
"Added response length guidance"
|
||||
],
|
||||
"test_impact": "Zero test modifications needed — all 26 tests pass unchanged"
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue