From 1be0deeb76eb80f740bc7fae84babc7941dbb95f Mon Sep 17 00:00:00 2001
From: jlightner <jlightner@users.noreply.github.com>
Date: Wed, 1 Apr 2026 09:20:24 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20Added=20STAGE=5FCONFIGS=20registry=20(s?=
 =?UTF-8?q?tages=202-5)=20with=20per-stage=20rubrics,=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/variant_generator.py"

GSD-Task: S04/T01
---
 backend/pipeline/quality/scorer.py            | 360 +++++++++++++++---
 backend/pipeline/quality/variant_generator.py |  93 ++++-
 2 files changed, 376 insertions(+), 77 deletions(-)
diff --git a/backend/pipeline/quality/scorer.py b/backend/pipeline/quality/scorer.py
index 66b4a72..6270e64 100644
--- a/backend/pipeline/quality/scorer.py
+++ b/backend/pipeline/quality/scorer.py
@@ -1,11 +1,7 @@
-"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions.
+"""Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics.
 
-Evaluates a synthesized technique page against source moments on:
-1. Structural quality — section naming, count, paragraph depth
-2. Content specificity — concrete details vs vague generalities
-3. Voice preservation — direct quotes, attributed opinions, personality
-4. Readability / flow — synthesis quality, logical ordering, no redundancy
-5. Factual fidelity — no hallucinated specifics, grounded in source moments
+Supports stages 2-5, each with its own scoring dimensions, rubric, format
+markers, fixture key requirements, prompt file name, and output schema.
 
 Run via: python -m pipeline.quality score --file <path>
 """
@@ -16,6 +12,7 @@ import logging
 import sys
 import time
 from dataclasses import dataclass, field
+from typing import Any
 
 import openai
 from pydantic import BaseModel
@@ -26,9 +23,177 @@ from pipeline.quality.voice_dial import VoiceDial
 logger = logging.getLogger(__name__)
 
 
-# ── Scoring rubric (hardcoded for iteration speed) ───────────────────────────
+# ── Per-stage configuration registry ─────────────────────────────────────────
 
-SCORING_RUBRIC = """\
+@dataclass(eq=False)  # eq=False keeps the identity-based ==/hash of a plain class
+class StageConfig:
+    """Configuration for scoring a specific pipeline stage."""
+
+    # Pipeline stage number this configuration belongs to.
+    stage: int
+    # Score keys the judge response is expected to contain.
+    dimensions: list[str]
+    # System prompt given to the LLM judge; left out of repr because it is long.
+    rubric: str = field(repr=False)
+    # Substrings a prompt variant must keep when the base prompt contains them.
+    format_markers: list[str]
+    # Fixture keys associated with this stage (presumably required inputs).
+    fixture_keys: list[str]
+    # File name of the stage's prompt.
+    prompt_file: str
+    # Name of the output model looked up on ``pipeline.schemas``.
+    schema_class: str
+
+    def get_schema(self) -> type[BaseModel]:
+        """Import and return the Pydantic schema class for this stage."""
+        # Deferred import: resolved only when a schema is actually requested.
+        from pipeline import schemas
+
+        return getattr(schemas, self.schema_class)
+
+
+# ── Stage rubrics ────────────────────────────────────────────────────────────
+
+_STAGE_2_RUBRIC = """\
+You are an expert evaluator of transcript segmentation quality for educational content.
+
+You will be given:
+1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary)
+2. The source transcript segments used as input
+
+Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0:
+
+**coverage_completeness** — All transcript content accounted for
+- 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps
+- 0.5-0.7: Minor gaps or overlaps, but most content is covered
+- 0.0-0.3: Large gaps — significant transcript segments are not assigned to any topic
+
+**topic_specificity** — Topic labels are descriptive and useful
+- 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing")
+- 0.5-0.7: Labels are somewhat specific but could be more descriptive
+- 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio")
+
+**boundary_accuracy** — Segment boundaries align with actual topic transitions
+- 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units
+- 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics
+- 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content
+
+**summary_quality** — Summaries accurately describe segment content
+- 0.9-1.0: Summaries capture the key points of each segment concisely and accurately
+- 0.5-0.7: Summaries are acceptable but miss some key points or are too vague
+- 0.0-0.3: Summaries are inaccurate, too generic, or missing
+
+Return ONLY a JSON object with this exact structure:
+{
+  "coverage_completeness": <float 0.0-1.0>,
+  "topic_specificity": <float 0.0-1.0>,
+  "boundary_accuracy": <float 0.0-1.0>,
+  "summary_quality": <float 0.0-1.0>,
+  "justifications": {
+    "coverage_completeness": "<1-2 sentence justification>",
+    "topic_specificity": "<1-2 sentence justification>",
+    "boundary_accuracy": "<1-2 sentence justification>",
+    "summary_quality": "<1-2 sentence justification>"
+  }
+}
+"""
+
+_STAGE_3_RUBRIC = """\
+You are an expert evaluator of key moment extraction quality for educational content.
+
+You will be given:
+1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript)
+2. The source topic segments used as input
+
+Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0:
+
+**moment_richness** — Extracted moments capture substantial, distinct insights
+- 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary
+- 0.5-0.7: Moments are valid but some are thin or overlap significantly with others
+- 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed
+
+**timestamp_accuracy** — Time ranges are plausible and well-bounded
+- 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans
+- 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow
+- 0.0-0.3: Timestamps appear arbitrary or many are zero/identical
+
+**content_type_correctness** — Content types match the actual moment content
+- 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it
+- 0.5-0.7: Most are correct but 1-2 are miscategorized
+- 0.0-0.3: Content types seem randomly assigned or all the same
+
+**summary_actionability** — Summaries provide actionable, specific information
+- 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow
+- 0.5-0.7: Summaries describe the topic but lack specific actionable details
+- 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information
+
+**plugin_normalization** — Plugin/tool names are correctly identified and normalized
+- 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools
+- 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed
+- 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors
+
+Return ONLY a JSON object with this exact structure:
+{
+  "moment_richness": <float 0.0-1.0>,
+  "timestamp_accuracy": <float 0.0-1.0>,
+  "content_type_correctness": <float 0.0-1.0>,
+  "summary_actionability": <float 0.0-1.0>,
+  "plugin_normalization": <float 0.0-1.0>,
+  "justifications": {
+    "moment_richness": "<1-2 sentence justification>",
+    "timestamp_accuracy": "<1-2 sentence justification>",
+    "content_type_correctness": "<1-2 sentence justification>",
+    "summary_actionability": "<1-2 sentence justification>",
+    "plugin_normalization": "<1-2 sentence justification>"
+  }
+}
+"""
+
+_STAGE_4_RUBRIC = """\
+You are an expert evaluator of content classification quality for educational content.
+
+You will be given:
+1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags)
+2. The source extracted moments used as input
+
+Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0:
+
+**category_accuracy** — Topic categories are appropriate and meaningful
+- 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels
+- 0.5-0.7: Most categories are reasonable but some are too broad or slightly off
+- 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same
+
+**tag_completeness** — All relevant tags are captured
+- 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively
+- 0.5-0.7: Main tags are present but secondary concepts or tools are missed
+- 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments
+
+**tag_specificity** — Tags are specific enough to be useful for search/filtering
+- 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing")
+- 0.5-0.7: Mix of specific and generic tags
+- 0.0-0.3: Tags are too generic to meaningfully distinguish moments
+
+**coverage** — All moments are classified
+- 0.9-1.0: Every moment_index from the input has a corresponding classification entry
+- 0.5-0.7: Most moments classified but 1-2 are missing
+- 0.0-0.3: Many moments are not classified
+
+Return ONLY a JSON object with this exact structure:
+{
+  "category_accuracy": <float 0.0-1.0>,
+  "tag_completeness": <float 0.0-1.0>,
+  "tag_specificity": <float 0.0-1.0>,
+  "coverage": <float 0.0-1.0>,
+  "justifications": {
+    "category_accuracy": "<1-2 sentence justification>",
+    "tag_completeness": "<1-2 sentence justification>",
+    "tag_specificity": "<1-2 sentence justification>",
+    "coverage": "<1-2 sentence justification>"
+  }
+}
+"""
+
+_STAGE_5_RUBRIC = """\
 You are an expert evaluator of synthesized technique articles for music production education.
 
 You will be given:
@@ -79,73 +244,142 @@ Return ONLY a JSON object with this exact structure:
 }
 """
 
-DIMENSIONS = [
-    "structural",
-    "content_specificity",
-    "voice_preservation",
-    "readability",
-    "factual_fidelity",
-]
+# Backward-compat alias used by synthesize_and_score and external references
+SCORING_RUBRIC = _STAGE_5_RUBRIC
+
+# Registry of per-stage scoring configs, keyed by pipeline stage number (2-5)
+STAGE_CONFIGS: dict[int, StageConfig] = {
+    2: StageConfig(
+        stage=2,
+        dimensions=["coverage_completeness", "topic_specificity", "boundary_accuracy", "summary_quality"],
+        rubric=_STAGE_2_RUBRIC,
+        format_markers=["segments", "start_index", "end_index", "topic_label"],
+        fixture_keys=["transcript_segments"],
+        prompt_file="stage2_segmentation.txt",
+        schema_class="SegmentationResult",
+    ),
+    3: StageConfig(
+        stage=3,
+        dimensions=["moment_richness", "timestamp_accuracy", "content_type_correctness", "summary_actionability", "plugin_normalization"],
+        rubric=_STAGE_3_RUBRIC,
+        format_markers=["moments", "content_type", "raw_transcript", "plugins"],
+        fixture_keys=["topic_segments"],
+        prompt_file="stage3_extraction.txt",
+        schema_class="ExtractionResult",
+    ),
+    4: StageConfig(
+        stage=4,
+        dimensions=["category_accuracy", "tag_completeness", "tag_specificity", "coverage"],
+        rubric=_STAGE_4_RUBRIC,
+        format_markers=["classifications", "moment_index", "topic_category", "topic_tags"],
+        fixture_keys=["extracted_moments"],
+        prompt_file="stage4_classification.txt",
+        schema_class="ClassificationResult",
+    ),
+    5: StageConfig(
+        stage=5,
+        dimensions=["structural", "content_specificity", "voice_preservation", "readability", "factual_fidelity"],
+        rubric=SCORING_RUBRIC,
+        format_markers=["SynthesisResult", '"pages"', "body_sections", "title", "summary"],
+        fixture_keys=["key_moments", "creator_name"],
+        prompt_file="stage5_synthesis.txt",
+        schema_class="SynthesisResult",
+    ),
+}
+
+# Backward-compatible alias: the same list object as stage 5's dimensions (not a copy)
+DIMENSIONS = STAGE_CONFIGS[5].dimensions
 
 
 # ── Result type ──────────────────────────────────────────────────────────────
 
 @dataclass
 class ScoreResult:
-    """Outcome of scoring a technique page across 5 quality dimensions."""
+    """Outcome of scoring a stage output across quality dimensions.
 
-    structural: float = 0.0
-    content_specificity: float = 0.0
-    voice_preservation: float = 0.0
-    readability: float = 0.0
-    factual_fidelity: float = 0.0
+    Uses a generic ``scores`` dict keyed by dimension name.  Stage 5's
+    original named fields (structural, content_specificity, …) remain
+    readable as read-only properties; they are no longer init keywords.
+    """
+
+    scores: dict[str, float] = field(default_factory=dict)
     composite: float = 0.0
     justifications: dict[str, str] = field(default_factory=dict)
     elapsed_seconds: float = 0.0
     error: str | None = None
 
+    # ── Read-only views of ``scores`` for stage 5 names (0.0 when absent) ──
+    @property
+    def structural(self) -> float:
+        return self.scores.get("structural", 0.0)
+
+    @property
+    def content_specificity(self) -> float:
+        return self.scores.get("content_specificity", 0.0)
+
+    @property
+    def voice_preservation(self) -> float:
+        return self.scores.get("voice_preservation", 0.0)
+
+    @property
+    def readability(self) -> float:
+        return self.scores.get("readability", 0.0)
+
+    @property
+    def factual_fidelity(self) -> float:
+        return self.scores.get("factual_fidelity", 0.0)
+
 
 # ── Runner ───────────────────────────────────────────────────────────────────
 
 class ScoreRunner:
-    """Scores a Stage 5 technique page using LLM-as-judge evaluation."""
+    """Scores pipeline stage outputs using LLM-as-judge evaluation."""
 
     def __init__(self, client: LLMClient) -> None:
         self.client = client
 
-    def score_page(
+    # ── Generic stage scorer ─────────────────────────────────────────────
+
+    def score_stage_output(
         self,
-        page_json: dict,
-        moments: list[dict],
+        stage: int,
+        output_json: dict | list,
+        input_json: dict | list,
     ) -> ScoreResult:
-        """Evaluate a technique page against source moments.
+        """Score an arbitrary stage's output against its input.
 
         Parameters
         ----------
-        page_json:
-            Synthesized page dict (title, summary, body_sections).
-        moments:
-            Source key moments with transcript_excerpt, summary, etc.
+        stage:
+            Pipeline stage number (2-5).
+        output_json:
+            The stage output to evaluate (parsed JSON).
+        input_json:
+            The stage input / source material.
 
         Returns
         -------
-        ScoreResult with per-dimension scores and justifications.
+        ScoreResult with per-dimension scores for the requested stage.
         """
-        # Build the user prompt with the page and source moments
+        if stage not in STAGE_CONFIGS:
+            return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}")
+
+        cfg = STAGE_CONFIGS[stage]
+
         user_prompt = (
-            "## Synthesized Technique Page\n\n"
-            f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n"
-            "## Source Key Moments\n\n"
-            f"```json\n{json.dumps(moments, indent=2)}\n```\n\n"
-            "Score this page across all 5 dimensions."
+            "## Stage Output\n\n"
+            f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n"
+            "## Stage Input\n\n"
+            f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n"
+            f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions."
         )
 
         t0 = time.monotonic()
         try:
             resp = self.client.complete(
-                system_prompt=SCORING_RUBRIC,
+                system_prompt=cfg.rubric,
                 user_prompt=user_prompt,
-                response_model=BaseModel,  # triggers JSON mode
+                response_model=BaseModel,
                 modality="chat",
             )
             elapsed = round(time.monotonic() - t0, 2)
@@ -155,13 +389,9 @@ class ScoreRunner:
             fallback = self.client.settings.llm_fallback_url
             return ScoreResult(
                 elapsed_seconds=elapsed,
-                error=(
-                    f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
-                    f"Error: {exc}"
-                ),
+                error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}",
             )
 
-        # Parse the LLM judge response
         raw_text = str(resp).strip()
         try:
             parsed = json.loads(raw_text)
@@ -172,10 +402,39 @@ class ScoreRunner:
                 error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
             )
 
-        return self._parse_scores(parsed, elapsed)
+        return self._parse_scores(parsed, elapsed, cfg.dimensions)
+
+    # ── Stage 5 convenience (backward compat) ────────────────────────────
+
+    def score_page(
+        self,
+        page_json: dict,
+        moments: list[dict],
+    ) -> ScoreResult:
+        """Evaluate a stage 5 technique page against source moments.
+
+        Thin backward-compatible wrapper around ``score_stage_output``.
+
+        Parameters
+        ----------
+        page_json:
+            Synthesized page dict (title, summary, body_sections).
+        moments:
+            Source key moments with transcript_excerpt, summary, etc.
+
+        Returns
+        -------
+        ScoreResult with per-dimension scores and justifications.
+        """
+        return self.score_stage_output(
+            stage=5,
+            output_json=page_json,
+            input_json=moments,
+        )
 
-    def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult:
+    def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult:
         """Extract and validate scores from parsed JSON response."""
+        dims = dimensions or DIMENSIONS
         scores: dict[str, float] = {}
         justifications: dict[str, str] = {}
 
@@ -183,7 +430,7 @@ class ScoreRunner:
         if not isinstance(raw_justifications, dict):
             raw_justifications = {}
 
-        for dim in DIMENSIONS:
+        for dim in dims:
             raw = parsed.get(dim)
             if raw is None:
                 logger.warning("Missing dimension '%s' in judge response", dim)
@@ -202,14 +449,10 @@ class ScoreRunner:
 
             justifications[dim] = str(raw_justifications.get(dim, ""))
 
-        composite = sum(scores.values()) / len(DIMENSIONS)
+        composite = sum(scores.values()) / len(dims) if dims else 0.0
 
         return ScoreResult(
-            structural=scores["structural"],
-            content_specificity=scores["content_specificity"],
-            voice_preservation=scores["voice_preservation"],
-            readability=scores["readability"],
-            factual_fidelity=scores["factual_fidelity"],
+            scores=scores,
             composite=round(composite, 3),
             justifications=justifications,
             elapsed_seconds=elapsed,
@@ -318,10 +561,13 @@ class ScoreRunner:
         result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
         return result
 
-    def print_report(self, result: ScoreResult) -> None:
+    def print_report(self, result: ScoreResult, stage: int = 5) -> None:
         """Print a formatted scoring report to stdout."""
+        dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys())
+        stage_label = f"STAGE {stage} " if stage in STAGE_CONFIGS else ""  # empty for unknown stages
+
         print("\n" + "=" * 60)
-        print("  STAGE 5 QUALITY SCORE REPORT")
+        print(f"  {stage_label}QUALITY SCORE REPORT")
         print("=" * 60)
 
         if result.error:
@@ -329,8 +575,8 @@ class ScoreRunner:
             print("=" * 60 + "\n")
             return
 
-        for dim in DIMENSIONS:
-            score = getattr(result, dim)
+        for dim in dims:
+            score = result.scores.get(dim, 0.0)
             bar = self._score_bar(score)
             justification = result.justifications.get(dim, "")
             print(f"\n  {dim.replace('_', ' ').title()}")
diff --git a/backend/pipeline/quality/variant_generator.py b/backend/pipeline/quality/variant_generator.py
index 3a20adf..da063ae 100644
--- a/backend/pipeline/quality/variant_generator.py
+++ b/backend/pipeline/quality/variant_generator.py
@@ -4,13 +4,17 @@ Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
 analyzing per-dimension scores and producing targeted prompt mutations
 that improve the weakest scoring dimensions while preserving the JSON
 output format required by downstream parsing.
+
+Supports any pipeline stage (2-5) — callers pass the stage's dimensions
+and format markers so the meta-prompt and validation adapt automatically.
 """
 from __future__ import annotations
 
 import logging
+from typing import Sequence
 
 from pipeline.llm_client import LLMClient
-from pipeline.quality.scorer import DIMENSIONS, ScoreResult
+from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult
 
 logger = logging.getLogger(__name__)
 
@@ -18,29 +22,24 @@ logger = logging.getLogger(__name__)
 # ── Meta-prompt for variant generation ────────────────────────────────────────
 
 VARIANT_META_PROMPT = """\
-You are an expert prompt engineer specializing in LLM-powered content synthesis.
+You are an expert prompt engineer specializing in LLM-powered content processing pipelines.
 
-Your task: given a synthesis prompt and its quality evaluation scores, produce an
+Your task: given a pipeline stage prompt and its quality evaluation scores, produce an
 improved variant of the prompt that targets the weakest-scoring dimensions while
 maintaining or improving the others.
 
 ## Scoring Dimensions (each 0.0–1.0)
 
-- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)
-- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values
-- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained
-- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction
-- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics
+{dimension_descriptions}
 
 ## Rules
 
 1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
 2. Add specific, actionable instructions — not vague encouragements.
 3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
-   The prompt contains instructions about outputting a JSON object with a specific schema
-   (SynthesisResult with "pages" containing title, summary, body_sections, etc.).
+   The prompt contains instructions about outputting a JSON object with a specific schema.
    Do NOT modify, remove, or rephrase any part of the JSON format instructions.
-   Your changes should target the prose synthesis guidelines only.
+   Your changes should target the processing/analysis guidelines only.
 4. Keep the overall prompt length within 2x of the original. Don't bloat it.
 5. Make substantive changes — rewording a sentence or adding one adjective is not enough.
 
@@ -50,9 +49,38 @@ Return ONLY the full modified prompt text. No explanation, no markdown fences, n
 Just the complete prompt that could be used directly as a system prompt.
 """
 
+# Dimension descriptions per stage, used to fill the meta-prompt template.
+_DIMENSION_DESCRIPTIONS: dict[int, str] = {
+    2: (
+        "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n"
+        "- **topic_specificity** — Topic labels are descriptive and useful, not generic\n"
+        "- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n"
+        "- **summary_quality** — Summaries accurately describe segment content"
+    ),
+    3: (
+        "- **moment_richness** — Extracted moments capture substantial, distinct insights\n"
+        "- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n"
+        "- **content_type_correctness** — Content types match the actual moment content\n"
+        "- **summary_actionability** — Summaries provide actionable, specific information\n"
+        "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized"
+    ),
+    4: (
+        "- **category_accuracy** — Topic categories are appropriate and meaningful\n"
+        "- **tag_completeness** — All relevant tags are captured\n"
+        "- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n"
+        "- **coverage** — All moments are classified"
+    ),
+    5: (
+        "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n"
+        "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n"
+        "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n"
+        "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n"
+        "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics"
+    ),
+}
 
-# Format markers that must survive variant generation — if any of these
-# are present in the base prompt, the variant must also contain them.
+
+# Legacy default format markers for stage 5
 _FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
 
 
@@ -71,6 +99,9 @@ class PromptVariantGenerator:
         base_prompt: str,
         scores: ScoreResult,
         n: int = 2,
+        *,
+        format_markers: Sequence[str] | None = None,
+        stage: int = 5,
     ) -> list[str]:
         """Generate up to *n* valid prompt variants.
 
@@ -83,27 +114,48 @@ class PromptVariantGenerator:
         Parameters
         ----------
         base_prompt:
-            The current best synthesis prompt text.
+            The current best prompt text for the target stage.
         scores:
             ScoreResult from the most recent evaluation of *base_prompt*.
         n:
             Number of variants to attempt generating.
+        format_markers:
+            Override format markers for validation.  When *None*, uses the
+            markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5
+            defaults for backward compat).
+        stage:
+            Pipeline stage number (2-5), used to select dimension
+            descriptions for the meta-prompt and default format markers.
 
         Returns
         -------
         list[str]
             Valid variant prompt strings (may be fewer than *n*).
         """
-        user_prompt = self._build_user_prompt(base_prompt, scores)
+        # Resolve format markers and dimensions for the target stage
+        if format_markers is not None:
+            markers = list(format_markers)
+        elif stage in STAGE_CONFIGS:
+            markers = STAGE_CONFIGS[stage].format_markers
+        else:
+            markers = _FORMAT_MARKERS
+
+        dimensions = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS
+
+        # Build the system prompt with stage-appropriate dimension descriptions
+        dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5])
+        system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc)
+
+        user_prompt = self._build_user_prompt(base_prompt, scores, dimensions)
         # Identify which format markers are actually present in the base
-        required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt]
+        required_markers = [m for m in markers if m in base_prompt]
 
         variants: list[str] = []
         for i in range(n):
-            logger.info("Generating variant %d/%d...", i + 1, n)
+            logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage)
             try:
                 raw = self.client.complete(
-                    system_prompt=VARIANT_META_PROMPT,
+                    system_prompt=system_prompt,
                     user_prompt=user_prompt,
                     response_model=None,  # free-form text, not JSON
                     modality="chat",
@@ -127,11 +179,12 @@ class PromptVariantGenerator:
 
     # ── Internal helpers ──────────────────────────────────────────────────
 
-    def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str:
+    def _build_user_prompt(self, base_prompt: str, scores: ScoreResult, dimensions: list[str] | None = None) -> str:
         """Build the user message describing the current prompt and its scores."""
+        dims = dimensions or DIMENSIONS
         # Build per-dimension score lines, sorted worst-first
         dim_lines: list[str] = []
-        dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS]
+        dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims]
         dim_scores.sort(key=lambda x: x[1])
 
         for dim, val in dim_scores: