feat: Added STAGE_CONFIGS registry (stages 2-5) with per-stage rubrics,…

- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/variant_generator.py"

GSD-Task: S04/T01
This commit is contained in:
jlightner 2026-04-01 09:20:24 +00:00
parent 03373f263d
commit 1be0deeb76
2 changed files with 376 additions and 77 deletions

View file

@ -1,11 +1,7 @@
"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions.
"""Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics.
Evaluates a synthesized technique page against source moments on:
1. Structural quality section naming, count, paragraph depth
2. Content specificity concrete details vs vague generalities
3. Voice preservation direct quotes, attributed opinions, personality
4. Readability / flow synthesis quality, logical ordering, no redundancy
5. Factual fidelity no hallucinated specifics, grounded in source moments
Supports stages 2-5, each with its own scoring dimensions, rubric, format
markers, fixture key requirements, prompt file name, and output schema.
Run via: python -m pipeline.quality score --file <path>
"""
@ -16,6 +12,7 @@ import logging
import sys
import time
from dataclasses import dataclass, field
from typing import Any
import openai
from pydantic import BaseModel
@ -26,9 +23,177 @@ from pipeline.quality.voice_dial import VoiceDial
logger = logging.getLogger(__name__)
# ── Scoring rubric (hardcoded for iteration speed) ───────────────────────────
# ── Per-stage configuration registry ─────────────────────────────────────────
SCORING_RUBRIC = """\
class StageConfig:
"""Configuration for scoring a specific pipeline stage."""
def __init__(
self,
stage: int,
dimensions: list[str],
rubric: str,
format_markers: list[str],
fixture_keys: list[str],
prompt_file: str,
schema_class: str,
) -> None:
self.stage = stage
self.dimensions = dimensions
self.rubric = rubric
self.format_markers = format_markers
self.fixture_keys = fixture_keys
self.prompt_file = prompt_file
self.schema_class = schema_class
def get_schema(self) -> type[BaseModel]:
"""Import and return the Pydantic schema class for this stage."""
from pipeline import schemas
return getattr(schemas, self.schema_class)
# ── Stage rubrics ────────────────────────────────────────────────────────────
_STAGE_2_RUBRIC = """\
You are an expert evaluator of transcript segmentation quality for educational content.
You will be given:
1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary)
2. The source transcript segments used as input
Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0:
**coverage_completeness** All transcript content accounted for
- 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps
- 0.5-0.7: Minor gaps or overlaps, but most content is covered
- 0.0-0.3: Large gaps significant transcript segments are not assigned to any topic
**topic_specificity** Topic labels are descriptive and useful
- 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing")
- 0.5-0.7: Labels are somewhat specific but could be more descriptive
- 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio")
**boundary_accuracy** Segment boundaries align with actual topic transitions
- 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units
- 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics
- 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content
**summary_quality** Summaries accurately describe segment content
- 0.9-1.0: Summaries capture the key points of each segment concisely and accurately
- 0.5-0.7: Summaries are acceptable but miss some key points or are too vague
- 0.0-0.3: Summaries are inaccurate, too generic, or missing
Return ONLY a JSON object with this exact structure:
{
"coverage_completeness": <float 0.0-1.0>,
"topic_specificity": <float 0.0-1.0>,
"boundary_accuracy": <float 0.0-1.0>,
"summary_quality": <float 0.0-1.0>,
"justifications": {
"coverage_completeness": "<1-2 sentence justification>",
"topic_specificity": "<1-2 sentence justification>",
"boundary_accuracy": "<1-2 sentence justification>",
"summary_quality": "<1-2 sentence justification>"
}
}
"""
_STAGE_3_RUBRIC = """\
You are an expert evaluator of key moment extraction quality for educational content.
You will be given:
1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript)
2. The source topic segments used as input
Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0:
**moment_richness** Extracted moments capture substantial, distinct insights
- 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary
- 0.5-0.7: Moments are valid but some are thin or overlap significantly with others
- 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed
**timestamp_accuracy** Time ranges are plausible and well-bounded
- 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans
- 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow
- 0.0-0.3: Timestamps appear arbitrary or many are zero/identical
**content_type_correctness** Content types match the actual moment content
- 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it
- 0.5-0.7: Most are correct but 1-2 are miscategorized
- 0.0-0.3: Content types seem randomly assigned or all the same
**summary_actionability** Summaries provide actionable, specific information
- 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow
- 0.5-0.7: Summaries describe the topic but lack specific actionable details
- 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information
**plugin_normalization** Plugin/tool names are correctly identified and normalized
- 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools
- 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed
- 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors
Return ONLY a JSON object with this exact structure:
{
"moment_richness": <float 0.0-1.0>,
"timestamp_accuracy": <float 0.0-1.0>,
"content_type_correctness": <float 0.0-1.0>,
"summary_actionability": <float 0.0-1.0>,
"plugin_normalization": <float 0.0-1.0>,
"justifications": {
"moment_richness": "<1-2 sentence justification>",
"timestamp_accuracy": "<1-2 sentence justification>",
"content_type_correctness": "<1-2 sentence justification>",
"summary_actionability": "<1-2 sentence justification>",
"plugin_normalization": "<1-2 sentence justification>"
}
}
"""
_STAGE_4_RUBRIC = """\
You are an expert evaluator of content classification quality for educational content.
You will be given:
1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags)
2. The source extracted moments used as input
Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0:
**category_accuracy** Topic categories are appropriate and meaningful
- 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels
- 0.5-0.7: Most categories are reasonable but some are too broad or slightly off
- 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same
**tag_completeness** All relevant tags are captured
- 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively
- 0.5-0.7: Main tags are present but secondary concepts or tools are missed
- 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments
**tag_specificity** Tags are specific enough to be useful for search/filtering
- 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing")
- 0.5-0.7: Mix of specific and generic tags
- 0.0-0.3: Tags are too generic to meaningfully distinguish moments
**coverage** All moments are classified
- 0.9-1.0: Every moment_index from the input has a corresponding classification entry
- 0.5-0.7: Most moments classified but 1-2 are missing
- 0.0-0.3: Many moments are not classified
Return ONLY a JSON object with this exact structure:
{
"category_accuracy": <float 0.0-1.0>,
"tag_completeness": <float 0.0-1.0>,
"tag_specificity": <float 0.0-1.0>,
"coverage": <float 0.0-1.0>,
"justifications": {
"category_accuracy": "<1-2 sentence justification>",
"tag_completeness": "<1-2 sentence justification>",
"tag_specificity": "<1-2 sentence justification>",
"coverage": "<1-2 sentence justification>"
}
}
"""
_STAGE_5_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education.
You will be given:
@ -79,73 +244,142 @@ Return ONLY a JSON object with this exact structure:
}
"""
DIMENSIONS = [
"structural",
"content_specificity",
"voice_preservation",
"readability",
"factual_fidelity",
]
# Backward-compat alias used by synthesize_and_score and external references
SCORING_RUBRIC = _STAGE_5_RUBRIC
# Build the stage configs registry
STAGE_CONFIGS: dict[int, StageConfig] = {
2: StageConfig(
stage=2,
dimensions=["coverage_completeness", "topic_specificity", "boundary_accuracy", "summary_quality"],
rubric=_STAGE_2_RUBRIC,
format_markers=["segments", "start_index", "end_index", "topic_label"],
fixture_keys=["transcript_segments"],
prompt_file="stage2_segmentation.txt",
schema_class="SegmentationResult",
),
3: StageConfig(
stage=3,
dimensions=["moment_richness", "timestamp_accuracy", "content_type_correctness", "summary_actionability", "plugin_normalization"],
rubric=_STAGE_3_RUBRIC,
format_markers=["moments", "content_type", "raw_transcript", "plugins"],
fixture_keys=["topic_segments"],
prompt_file="stage3_extraction.txt",
schema_class="ExtractionResult",
),
4: StageConfig(
stage=4,
dimensions=["category_accuracy", "tag_completeness", "tag_specificity", "coverage"],
rubric=_STAGE_4_RUBRIC,
format_markers=["classifications", "moment_index", "topic_category", "topic_tags"],
fixture_keys=["extracted_moments"],
prompt_file="stage4_classification.txt",
schema_class="ClassificationResult",
),
5: StageConfig(
stage=5,
dimensions=["structural", "content_specificity", "voice_preservation", "readability", "factual_fidelity"],
rubric=SCORING_RUBRIC,
format_markers=["SynthesisResult", '"pages"', "body_sections", "title", "summary"],
fixture_keys=["key_moments", "creator_name"],
prompt_file="stage5_synthesis.txt",
schema_class="SynthesisResult",
),
}
# Backward-compatible alias: stage 5 dimensions list
DIMENSIONS = STAGE_CONFIGS[5].dimensions
# ── Result type ──────────────────────────────────────────────────────────────
@dataclass
class ScoreResult:
"""Outcome of scoring a technique page across 5 quality dimensions."""
"""Outcome of scoring a stage output across quality dimensions.
structural: float = 0.0
content_specificity: float = 0.0
voice_preservation: float = 0.0
readability: float = 0.0
factual_fidelity: float = 0.0
Uses a generic ``scores`` dict keyed by dimension name. Stage 5's
original named fields (structural, content_specificity, ) are
preserved as properties for backward compatibility.
"""
scores: dict[str, float] = field(default_factory=dict)
composite: float = 0.0
justifications: dict[str, str] = field(default_factory=dict)
elapsed_seconds: float = 0.0
error: str | None = None
# ── Backward-compat properties for stage 5 named dimensions ──────
@property
def structural(self) -> float:
return self.scores.get("structural", 0.0)
@property
def content_specificity(self) -> float:
return self.scores.get("content_specificity", 0.0)
@property
def voice_preservation(self) -> float:
return self.scores.get("voice_preservation", 0.0)
@property
def readability(self) -> float:
return self.scores.get("readability", 0.0)
@property
def factual_fidelity(self) -> float:
return self.scores.get("factual_fidelity", 0.0)
# ── Runner ───────────────────────────────────────────────────────────────────
class ScoreRunner:
"""Scores a Stage 5 technique page using LLM-as-judge evaluation."""
"""Scores pipeline stage outputs using LLM-as-judge evaluation."""
def __init__(self, client: LLMClient) -> None:
self.client = client
def score_page(
# ── Generic stage scorer ─────────────────────────────────────────────
def score_stage_output(
self,
page_json: dict,
moments: list[dict],
stage: int,
output_json: dict | list,
input_json: dict | list,
) -> ScoreResult:
"""Evaluate a technique page against source moments.
"""Score an arbitrary stage's output against its input.
Parameters
----------
page_json:
Synthesized page dict (title, summary, body_sections).
moments:
Source key moments with transcript_excerpt, summary, etc.
stage:
Pipeline stage number (2-5).
output_json:
The stage output to evaluate (parsed JSON).
input_json:
The stage input / source material.
Returns
-------
ScoreResult with per-dimension scores and justifications.
ScoreResult with per-dimension scores for the requested stage.
"""
# Build the user prompt with the page and source moments
if stage not in STAGE_CONFIGS:
return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}")
cfg = STAGE_CONFIGS[stage]
user_prompt = (
"## Synthesized Technique Page\n\n"
f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n"
"## Source Key Moments\n\n"
f"```json\n{json.dumps(moments, indent=2)}\n```\n\n"
"Score this page across all 5 dimensions."
"## Stage Output\n\n"
f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n"
"## Stage Input\n\n"
f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n"
f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions."
)
t0 = time.monotonic()
try:
resp = self.client.complete(
system_prompt=SCORING_RUBRIC,
system_prompt=cfg.rubric,
user_prompt=user_prompt,
response_model=BaseModel, # triggers JSON mode
response_model=BaseModel,
modality="chat",
)
elapsed = round(time.monotonic() - t0, 2)
@ -155,13 +389,9 @@ class ScoreRunner:
fallback = self.client.settings.llm_fallback_url
return ScoreResult(
elapsed_seconds=elapsed,
error=(
f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
f"Error: {exc}"
),
error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}",
)
# Parse the LLM judge response
raw_text = str(resp).strip()
try:
parsed = json.loads(raw_text)
@ -172,10 +402,27 @@ class ScoreRunner:
error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
)
return self._parse_scores(parsed, elapsed, cfg.dimensions)
# ── Stage 5 convenience (backward compat) ────────────────────────────
def score_page(
self,
page_json: dict,
moments: list[dict],
) -> ScoreResult:
"""Evaluate a stage 5 technique page against source moments."""
return self.score_stage_output(
stage=5,
output_json=page_json,
input_json=moments,
)
return self._parse_scores(parsed, elapsed)
def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult:
def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult:
"""Extract and validate scores from parsed JSON response."""
dims = dimensions or DIMENSIONS
scores: dict[str, float] = {}
justifications: dict[str, str] = {}
@ -183,7 +430,7 @@ class ScoreRunner:
if not isinstance(raw_justifications, dict):
raw_justifications = {}
for dim in DIMENSIONS:
for dim in dims:
raw = parsed.get(dim)
if raw is None:
logger.warning("Missing dimension '%s' in judge response", dim)
@ -202,14 +449,10 @@ class ScoreRunner:
justifications[dim] = str(raw_justifications.get(dim, ""))
composite = sum(scores.values()) / len(DIMENSIONS)
composite = sum(scores.values()) / len(dims) if dims else 0.0
return ScoreResult(
structural=scores["structural"],
content_specificity=scores["content_specificity"],
voice_preservation=scores["voice_preservation"],
readability=scores["readability"],
factual_fidelity=scores["factual_fidelity"],
scores=scores,
composite=round(composite, 3),
justifications=justifications,
elapsed_seconds=elapsed,
@ -318,10 +561,13 @@ class ScoreRunner:
result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
return result
def print_report(self, result: ScoreResult) -> None:
def print_report(self, result: ScoreResult, stage: int = 5) -> None:
"""Print a formatted scoring report to stdout."""
dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys())
stage_label = f"STAGE {stage}" if stage in STAGE_CONFIGS else "QUALITY"
print("\n" + "=" * 60)
print(" STAGE 5 QUALITY SCORE REPORT")
print(f" {stage_label} QUALITY SCORE REPORT")
print("=" * 60)
if result.error:
@ -329,8 +575,8 @@ class ScoreRunner:
print("=" * 60 + "\n")
return
for dim in DIMENSIONS:
score = getattr(result, dim)
for dim in dims:
score = result.scores.get(dim, 0.0)
bar = self._score_bar(score)
justification = result.justifications.get(dim, "")
print(f"\n {dim.replace('_', ' ').title()}")

View file

@ -4,13 +4,17 @@ Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
analyzing per-dimension scores and producing targeted prompt mutations
that improve the weakest scoring dimensions while preserving the JSON
output format required by downstream parsing.
Supports any pipeline stage (2-5) callers pass the stage's dimensions
and format markers so the meta-prompt and validation adapt automatically.
"""
from __future__ import annotations
import logging
from typing import Sequence
from pipeline.llm_client import LLMClient
from pipeline.quality.scorer import DIMENSIONS, ScoreResult
from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult
logger = logging.getLogger(__name__)
@ -18,29 +22,24 @@ logger = logging.getLogger(__name__)
# ── Meta-prompt for variant generation ────────────────────────────────────────
VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content synthesis.
You are an expert prompt engineer specializing in LLM-powered content processing pipelines.
Your task: given a synthesis prompt and its quality evaluation scores, produce an
Your task: given a pipeline stage prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others.
## Scoring Dimensions (each 0.01.0)
- **structural** Section naming, count (3-6), paragraph depth (2-5 per section)
- **content_specificity** Concrete details: frequencies, time values, ratios, plugin names, dB values
- **voice_preservation** Direct quotes preserved, opinions attributed to creator by name, personality retained
- **readability** Cohesive article flow, related info merged, no redundancy or contradiction
- **factual_fidelity** Every claim traceable to source material, no hallucinated specifics
{dimension_descriptions}
## Rules
1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
The prompt contains instructions about outputting a JSON object with a specific schema
(SynthesisResult with "pages" containing title, summary, body_sections, etc.).
The prompt contains instructions about outputting a JSON object with a specific schema.
Do NOT modify, remove, or rephrase any part of the JSON format instructions.
Your changes should target the prose synthesis guidelines only.
Your changes should target the processing/analysis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes rewording a sentence or adding one adjective is not enough.
@ -50,9 +49,38 @@ Return ONLY the full modified prompt text. No explanation, no markdown fences, n
Just the complete prompt that could be used directly as a system prompt.
"""
# Dimension descriptions per stage, used to fill the meta-prompt template.
_DIMENSION_DESCRIPTIONS: dict[int, str] = {
2: (
"- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n"
"- **topic_specificity** — Topic labels are descriptive and useful, not generic\n"
"- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n"
"- **summary_quality** — Summaries accurately describe segment content"
),
3: (
"- **moment_richness** — Extracted moments capture substantial, distinct insights\n"
"- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n"
"- **content_type_correctness** — Content types match the actual moment content\n"
"- **summary_actionability** — Summaries provide actionable, specific information\n"
"- **plugin_normalization** — Plugin/tool names are correctly identified and normalized"
),
4: (
"- **category_accuracy** — Topic categories are appropriate and meaningful\n"
"- **tag_completeness** — All relevant tags are captured\n"
"- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n"
"- **coverage** — All moments are classified"
),
5: (
"- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n"
"- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n"
"- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n"
"- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n"
"- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics"
),
}
# Format markers that must survive variant generation — if any of these
# are present in the base prompt, the variant must also contain them.
# Legacy default format markers for stage 5
_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
@ -71,6 +99,9 @@ class PromptVariantGenerator:
base_prompt: str,
scores: ScoreResult,
n: int = 2,
*,
format_markers: Sequence[str] | None = None,
stage: int = 5,
) -> list[str]:
"""Generate up to *n* valid prompt variants.
@ -83,27 +114,48 @@ class PromptVariantGenerator:
Parameters
----------
base_prompt:
The current best synthesis prompt text.
The current best prompt text for the target stage.
scores:
ScoreResult from the most recent evaluation of *base_prompt*.
n:
Number of variants to attempt generating.
format_markers:
Override format markers for validation. When *None*, uses the
markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5
defaults for backward compat).
stage:
Pipeline stage number (2-5), used to select dimension
descriptions for the meta-prompt and default format markers.
Returns
-------
list[str]
Valid variant prompt strings (may be fewer than *n*).
"""
user_prompt = self._build_user_prompt(base_prompt, scores)
# Resolve format markers and dimensions for the target stage
if format_markers is not None:
markers = list(format_markers)
elif stage in STAGE_CONFIGS:
markers = STAGE_CONFIGS[stage].format_markers
else:
markers = _FORMAT_MARKERS
dimensions = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS
# Build the system prompt with stage-appropriate dimension descriptions
dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5])
system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc)
user_prompt = self._build_user_prompt(base_prompt, scores, dimensions)
# Identify which format markers are actually present in the base
required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt]
required_markers = [m for m in markers if m in base_prompt]
variants: list[str] = []
for i in range(n):
logger.info("Generating variant %d/%d...", i + 1, n)
logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage)
try:
raw = self.client.complete(
system_prompt=VARIANT_META_PROMPT,
system_prompt=system_prompt,
user_prompt=user_prompt,
response_model=None, # free-form text, not JSON
modality="chat",
@ -127,11 +179,12 @@ class PromptVariantGenerator:
# ── Internal helpers ──────────────────────────────────────────────────
def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str:
def _build_user_prompt(self, base_prompt: str, scores: ScoreResult, dimensions: list[str] | None = None) -> str:
"""Build the user message describing the current prompt and its scores."""
dims = dimensions or DIMENSIONS
# Build per-dimension score lines, sorted worst-first
dim_lines: list[str] = []
dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS]
dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims]
dim_scores.sort(key=lambda x: x[1])
for dim, val in dim_scores: