From 1be0deeb76eb80f740bc7fae84babc7941dbb95f Mon Sep 17 00:00:00 2001 From: jlightner Date: Wed, 1 Apr 2026 09:20:24 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Added=20STAGE=5FCONFIGS=20registry=20(s?= =?UTF-8?q?tages=202-5)=20with=20per-stage=20rubrics,=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/pipeline/quality/scorer.py" - "backend/pipeline/quality/variant_generator.py" GSD-Task: S04/T01 --- backend/pipeline/quality/scorer.py | 360 +++++++++++++++--- backend/pipeline/quality/variant_generator.py | 93 ++++- 2 files changed, 376 insertions(+), 77 deletions(-) diff --git a/backend/pipeline/quality/scorer.py b/backend/pipeline/quality/scorer.py index 66b4a72..6270e64 100644 --- a/backend/pipeline/quality/scorer.py +++ b/backend/pipeline/quality/scorer.py @@ -1,11 +1,7 @@ -"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions. +"""Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics. -Evaluates a synthesized technique page against source moments on: -1. Structural quality — section naming, count, paragraph depth -2. Content specificity — concrete details vs vague generalities -3. Voice preservation — direct quotes, attributed opinions, personality -4. Readability / flow — synthesis quality, logical ordering, no redundancy -5. Factual fidelity — no hallucinated specifics, grounded in source moments +Supports stages 2-5, each with its own scoring dimensions, rubric, format +markers, fixture key requirements, prompt file name, and output schema. Run via: python -m pipeline.quality score --file """ @@ -16,6 +12,7 @@ import logging import sys import time from dataclasses import dataclass, field +from typing import Any import openai from pydantic import BaseModel @@ -26,9 +23,177 @@ from pipeline.quality.voice_dial import VoiceDial logger = logging.getLogger(__name__) -# ── Scoring rubric (hardcoded for iteration speed) ─────────────────────────── +# ── Per-stage configuration registry ───────────────────────────────────────── -SCORING_RUBRIC = """\ +class StageConfig: + """Configuration for scoring a specific pipeline stage.""" + + def __init__( + self, + stage: int, + dimensions: list[str], + rubric: str, + format_markers: list[str], + fixture_keys: list[str], + prompt_file: str, + schema_class: str, + ) -> None: + self.stage = stage + self.dimensions = dimensions + self.rubric = rubric + self.format_markers = format_markers + self.fixture_keys = fixture_keys + self.prompt_file = prompt_file + self.schema_class = schema_class + + def get_schema(self) -> type[BaseModel]: + """Import and return the Pydantic schema class for this stage.""" + from pipeline import schemas + return getattr(schemas, self.schema_class) + + +# ── Stage rubrics ──────────────────────────────────────────────────────────── + +_STAGE_2_RUBRIC = """\ +You are an expert evaluator of transcript segmentation quality for educational content. + +You will be given: +1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary) +2. The source transcript segments used as input + +Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0: + +**coverage_completeness** — All transcript content accounted for +- 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps +- 0.5-0.7: Minor gaps or overlaps, but most content is covered +- 0.0-0.3: Large gaps — significant transcript segments are not assigned to any topic + +**topic_specificity** — Topic labels are descriptive and useful +- 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing") +- 0.5-0.7: Labels are somewhat specific but could be more descriptive +- 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio") + +**boundary_accuracy** — Segment boundaries align with actual topic transitions +- 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units +- 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics +- 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content + +**summary_quality** — Summaries accurately describe segment content +- 0.9-1.0: Summaries capture the key points of each segment concisely and accurately +- 0.5-0.7: Summaries are acceptable but miss some key points or are too vague +- 0.0-0.3: Summaries are inaccurate, too generic, or missing + +Return ONLY a JSON object with this exact structure: +{ + "coverage_completeness": , + "topic_specificity": , + "boundary_accuracy": , + "summary_quality": , + "justifications": { + "coverage_completeness": "<1-2 sentence justification>", + "topic_specificity": "<1-2 sentence justification>", + "boundary_accuracy": "<1-2 sentence justification>", + "summary_quality": "<1-2 sentence justification>" + } +} +""" + +_STAGE_3_RUBRIC = """\ +You are an expert evaluator of key moment extraction quality for educational content. + +You will be given: +1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript) +2. The source topic segments used as input + +Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0: + +**moment_richness** — Extracted moments capture substantial, distinct insights +- 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary +- 0.5-0.7: Moments are valid but some are thin or overlap significantly with others +- 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed + +**timestamp_accuracy** — Time ranges are plausible and well-bounded +- 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans +- 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow +- 0.0-0.3: Timestamps appear arbitrary or many are zero/identical + +**content_type_correctness** — Content types match the actual moment content +- 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it +- 0.5-0.7: Most are correct but 1-2 are miscategorized +- 0.0-0.3: Content types seem randomly assigned or all the same + +**summary_actionability** — Summaries provide actionable, specific information +- 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow +- 0.5-0.7: Summaries describe the topic but lack specific actionable details +- 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information + +**plugin_normalization** — Plugin/tool names are correctly identified and normalized +- 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools +- 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed +- 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors + +Return ONLY a JSON object with this exact structure: +{ + "moment_richness": , + "timestamp_accuracy": , + "content_type_correctness": , + "summary_actionability": , + "plugin_normalization": , + "justifications": { + "moment_richness": "<1-2 sentence justification>", + "timestamp_accuracy": "<1-2 sentence justification>", + "content_type_correctness": "<1-2 sentence justification>", + "summary_actionability": "<1-2 sentence justification>", + "plugin_normalization": "<1-2 sentence justification>" + } +} +""" + +_STAGE_4_RUBRIC = """\ +You are an expert evaluator of content classification quality for educational content. + +You will be given: +1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags) +2. The source extracted moments used as input + +Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0: + +**category_accuracy** — Topic categories are appropriate and meaningful +- 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels +- 0.5-0.7: Most categories are reasonable but some are too broad or slightly off +- 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same + +**tag_completeness** — All relevant tags are captured +- 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively +- 0.5-0.7: Main tags are present but secondary concepts or tools are missed +- 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments + +**tag_specificity** — Tags are specific enough to be useful for search/filtering +- 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing") +- 0.5-0.7: Mix of specific and generic tags +- 0.0-0.3: Tags are too generic to meaningfully distinguish moments + +**coverage** — All moments are classified +- 0.9-1.0: Every moment_index from the input has a corresponding classification entry +- 0.5-0.7: Most moments classified but 1-2 are missing +- 0.0-0.3: Many moments are not classified + +Return ONLY a JSON object with this exact structure: +{ + "category_accuracy": , + "tag_completeness": , + "tag_specificity": , + "coverage": , + "justifications": { + "category_accuracy": "<1-2 sentence justification>", + "tag_completeness": "<1-2 sentence justification>", + "tag_specificity": "<1-2 sentence justification>", + "coverage": "<1-2 sentence justification>" + } +} +""" + +_STAGE_5_RUBRIC = """\ You are an expert evaluator of synthesized technique articles for music production education. You will be given: @@ -79,73 +244,142 @@ Return ONLY a JSON object with this exact structure: } """ -DIMENSIONS = [ - "structural", - "content_specificity", - "voice_preservation", - "readability", - "factual_fidelity", -] +# Backward-compat alias used by synthesize_and_score and external references +SCORING_RUBRIC = _STAGE_5_RUBRIC + +# Build the stage configs registry +STAGE_CONFIGS: dict[int, StageConfig] = { + 2: StageConfig( + stage=2, + dimensions=["coverage_completeness", "topic_specificity", "boundary_accuracy", "summary_quality"], + rubric=_STAGE_2_RUBRIC, + format_markers=["segments", "start_index", "end_index", "topic_label"], + fixture_keys=["transcript_segments"], + prompt_file="stage2_segmentation.txt", + schema_class="SegmentationResult", + ), + 3: StageConfig( + stage=3, + dimensions=["moment_richness", "timestamp_accuracy", "content_type_correctness", "summary_actionability", "plugin_normalization"], + rubric=_STAGE_3_RUBRIC, + format_markers=["moments", "content_type", "raw_transcript", "plugins"], + fixture_keys=["topic_segments"], + prompt_file="stage3_extraction.txt", + schema_class="ExtractionResult", + ), + 4: StageConfig( + stage=4, + dimensions=["category_accuracy", "tag_completeness", "tag_specificity", "coverage"], + rubric=_STAGE_4_RUBRIC, + format_markers=["classifications", "moment_index", "topic_category", "topic_tags"], + fixture_keys=["extracted_moments"], + prompt_file="stage4_classification.txt", + schema_class="ClassificationResult", + ), + 5: StageConfig( + stage=5, + dimensions=["structural", "content_specificity", "voice_preservation", "readability", "factual_fidelity"], + rubric=SCORING_RUBRIC, + format_markers=["SynthesisResult", '"pages"', "body_sections", "title", "summary"], + fixture_keys=["key_moments", "creator_name"], + prompt_file="stage5_synthesis.txt", + schema_class="SynthesisResult", + ), +} + +# Backward-compatible alias: stage 5 dimensions list +DIMENSIONS = STAGE_CONFIGS[5].dimensions # ── Result type ────────────────────────────────────────────────────────────── @dataclass class ScoreResult: - """Outcome of scoring a technique page across 5 quality dimensions.""" + """Outcome of scoring a stage output across quality dimensions. - structural: float = 0.0 - content_specificity: float = 0.0 - voice_preservation: float = 0.0 - readability: float = 0.0 - factual_fidelity: float = 0.0 + Uses a generic ``scores`` dict keyed by dimension name. Stage 5's + original named fields (structural, content_specificity, …) are + preserved as properties for backward compatibility. + """ + + scores: dict[str, float] = field(default_factory=dict) composite: float = 0.0 justifications: dict[str, str] = field(default_factory=dict) elapsed_seconds: float = 0.0 error: str | None = None + # ── Backward-compat properties for stage 5 named dimensions ────── + @property + def structural(self) -> float: + return self.scores.get("structural", 0.0) + + @property + def content_specificity(self) -> float: + return self.scores.get("content_specificity", 0.0) + + @property + def voice_preservation(self) -> float: + return self.scores.get("voice_preservation", 0.0) + + @property + def readability(self) -> float: + return self.scores.get("readability", 0.0) + + @property + def factual_fidelity(self) -> float: + return self.scores.get("factual_fidelity", 0.0) + # ── Runner ─────────────────────────────────────────────────────────────────── class ScoreRunner: - """Scores a Stage 5 technique page using LLM-as-judge evaluation.""" + """Scores pipeline stage outputs using LLM-as-judge evaluation.""" def __init__(self, client: LLMClient) -> None: self.client = client - def score_page( + # ── Generic stage scorer ───────────────────────────────────────────── + + def score_stage_output( self, - page_json: dict, - moments: list[dict], + stage: int, + output_json: dict | list, + input_json: dict | list, ) -> ScoreResult: - """Evaluate a technique page against source moments. + """Score an arbitrary stage's output against its input. Parameters ---------- - page_json: - Synthesized page dict (title, summary, body_sections). - moments: - Source key moments with transcript_excerpt, summary, etc. + stage: + Pipeline stage number (2-5). + output_json: + The stage output to evaluate (parsed JSON). + input_json: + The stage input / source material. Returns ------- - ScoreResult with per-dimension scores and justifications. + ScoreResult with per-dimension scores for the requested stage. """ - # Build the user prompt with the page and source moments + if stage not in STAGE_CONFIGS: + return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}") + + cfg = STAGE_CONFIGS[stage] + user_prompt = ( - "## Synthesized Technique Page\n\n" - f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n" - "## Source Key Moments\n\n" - f"```json\n{json.dumps(moments, indent=2)}\n```\n\n" - "Score this page across all 5 dimensions." + "## Stage Output\n\n" + f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n" + "## Stage Input\n\n" + f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n" + f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions." ) t0 = time.monotonic() try: resp = self.client.complete( - system_prompt=SCORING_RUBRIC, + system_prompt=cfg.rubric, user_prompt=user_prompt, - response_model=BaseModel, # triggers JSON mode + response_model=BaseModel, modality="chat", ) elapsed = round(time.monotonic() - t0, 2) @@ -155,13 +389,9 @@ class ScoreRunner: fallback = self.client.settings.llm_fallback_url return ScoreResult( elapsed_seconds=elapsed, - error=( - f"Cannot reach LLM endpoint at {url} (fallback {fallback}). " - f"Error: {exc}" - ), + error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}", ) - # Parse the LLM judge response raw_text = str(resp).strip() try: parsed = json.loads(raw_text) @@ -172,10 +402,27 @@ class ScoreRunner: error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}", ) + return self._parse_scores(parsed, elapsed, cfg.dimensions) + + # ── Stage 5 convenience (backward compat) ──────────────────────────── + + def score_page( + self, + page_json: dict, + moments: list[dict], + ) -> ScoreResult: + """Evaluate a stage 5 technique page against source moments.""" + return self.score_stage_output( + stage=5, + output_json=page_json, + input_json=moments, + ) + return self._parse_scores(parsed, elapsed) - def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult: + def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult: """Extract and validate scores from parsed JSON response.""" + dims = dimensions or DIMENSIONS scores: dict[str, float] = {} justifications: dict[str, str] = {} @@ -183,7 +430,7 @@ class ScoreRunner: if not isinstance(raw_justifications, dict): raw_justifications = {} - for dim in DIMENSIONS: + for dim in dims: raw = parsed.get(dim) if raw is None: logger.warning("Missing dimension '%s' in judge response", dim) @@ -202,14 +449,10 @@ class ScoreRunner: justifications[dim] = str(raw_justifications.get(dim, "")) - composite = sum(scores.values()) / len(DIMENSIONS) + composite = sum(scores.values()) / len(dims) if dims else 0.0 return ScoreResult( - structural=scores["structural"], - content_specificity=scores["content_specificity"], - voice_preservation=scores["voice_preservation"], - readability=scores["readability"], - factual_fidelity=scores["factual_fidelity"], + scores=scores, composite=round(composite, 3), justifications=justifications, elapsed_seconds=elapsed, @@ -318,10 +561,13 @@ class ScoreRunner: result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2) return result - def print_report(self, result: ScoreResult) -> None: + def print_report(self, result: ScoreResult, stage: int = 5) -> None: """Print a formatted scoring report to stdout.""" + dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys()) + stage_label = f"STAGE {stage}" if stage in STAGE_CONFIGS else "QUALITY" + print("\n" + "=" * 60) - print(" STAGE 5 QUALITY SCORE REPORT") + print(f" {stage_label} QUALITY SCORE REPORT") print("=" * 60) if result.error: @@ -329,8 +575,8 @@ class ScoreRunner: print("=" * 60 + "\n") return - for dim in DIMENSIONS: - score = getattr(result, dim) + for dim in dims: + score = result.scores.get(dim, 0.0) bar = self._score_bar(score) justification = result.justifications.get(dim, "") print(f"\n {dim.replace('_', ' ').title()}") diff --git a/backend/pipeline/quality/variant_generator.py b/backend/pipeline/quality/variant_generator.py index 3a20adf..da063ae 100644 --- a/backend/pipeline/quality/variant_generator.py +++ b/backend/pipeline/quality/variant_generator.py @@ -4,13 +4,17 @@ Uses a meta-prompt to instruct the LLM to act as a prompt engineer, analyzing per-dimension scores and producing targeted prompt mutations that improve the weakest scoring dimensions while preserving the JSON output format required by downstream parsing. + +Supports any pipeline stage (2-5) — callers pass the stage's dimensions +and format markers so the meta-prompt and validation adapt automatically. """ from __future__ import annotations import logging +from typing import Sequence from pipeline.llm_client import LLMClient -from pipeline.quality.scorer import DIMENSIONS, ScoreResult +from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult logger = logging.getLogger(__name__) @@ -18,29 +22,24 @@ logger = logging.getLogger(__name__) # ── Meta-prompt for variant generation ──────────────────────────────────────── VARIANT_META_PROMPT = """\ -You are an expert prompt engineer specializing in LLM-powered content synthesis. +You are an expert prompt engineer specializing in LLM-powered content processing pipelines. -Your task: given a synthesis prompt and its quality evaluation scores, produce an +Your task: given a pipeline stage prompt and its quality evaluation scores, produce an improved variant of the prompt that targets the weakest-scoring dimensions while maintaining or improving the others. ## Scoring Dimensions (each 0.0–1.0) -- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section) -- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values -- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained -- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction -- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics +{dimension_descriptions} ## Rules 1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything. 2. Add specific, actionable instructions — not vague encouragements. 3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.** - The prompt contains instructions about outputting a JSON object with a specific schema - (SynthesisResult with "pages" containing title, summary, body_sections, etc.). + The prompt contains instructions about outputting a JSON object with a specific schema. Do NOT modify, remove, or rephrase any part of the JSON format instructions. - Your changes should target the prose synthesis guidelines only. + Your changes should target the processing/analysis guidelines only. 4. Keep the overall prompt length within 2x of the original. Don't bloat it. 5. Make substantive changes — rewording a sentence or adding one adjective is not enough. @@ -50,9 +49,38 @@ Return ONLY the full modified prompt text. No explanation, no markdown fences, n Just the complete prompt that could be used directly as a system prompt. """ +# Dimension descriptions per stage, used to fill the meta-prompt template. +_DIMENSION_DESCRIPTIONS: dict[int, str] = { + 2: ( + "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n" + "- **topic_specificity** — Topic labels are descriptive and useful, not generic\n" + "- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n" + "- **summary_quality** — Summaries accurately describe segment content" + ), + 3: ( + "- **moment_richness** — Extracted moments capture substantial, distinct insights\n" + "- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n" + "- **content_type_correctness** — Content types match the actual moment content\n" + "- **summary_actionability** — Summaries provide actionable, specific information\n" + "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized" + ), + 4: ( + "- **category_accuracy** — Topic categories are appropriate and meaningful\n" + "- **tag_completeness** — All relevant tags are captured\n" + "- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n" + "- **coverage** — All moments are classified" + ), + 5: ( + "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n" + "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n" + "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n" + "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n" + "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics" + ), +} -# Format markers that must survive variant generation — if any of these -# are present in the base prompt, the variant must also contain them. + +# Legacy default format markers for stage 5 _FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"] @@ -71,6 +99,9 @@ class PromptVariantGenerator: base_prompt: str, scores: ScoreResult, n: int = 2, + *, + format_markers: Sequence[str] | None = None, + stage: int = 5, ) -> list[str]: """Generate up to *n* valid prompt variants. @@ -83,27 +114,48 @@ class PromptVariantGenerator: Parameters ---------- base_prompt: - The current best synthesis prompt text. + The current best prompt text for the target stage. scores: ScoreResult from the most recent evaluation of *base_prompt*. n: Number of variants to attempt generating. + format_markers: + Override format markers for validation. When *None*, uses the + markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5 + defaults for backward compat). + stage: + Pipeline stage number (2-5), used to select dimension + descriptions for the meta-prompt and default format markers. Returns ------- list[str] Valid variant prompt strings (may be fewer than *n*). """ - user_prompt = self._build_user_prompt(base_prompt, scores) + # Resolve format markers and dimensions for the target stage + if format_markers is not None: + markers = list(format_markers) + elif stage in STAGE_CONFIGS: + markers = STAGE_CONFIGS[stage].format_markers + else: + markers = _FORMAT_MARKERS + + dimensions = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS + + # Build the system prompt with stage-appropriate dimension descriptions + dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5]) + system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc) + + user_prompt = self._build_user_prompt(base_prompt, scores, dimensions) # Identify which format markers are actually present in the base - required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt] + required_markers = [m for m in markers if m in base_prompt] variants: list[str] = [] for i in range(n): - logger.info("Generating variant %d/%d...", i + 1, n) + logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage) try: raw = self.client.complete( - system_prompt=VARIANT_META_PROMPT, + system_prompt=system_prompt, user_prompt=user_prompt, response_model=None, # free-form text, not JSON modality="chat", @@ -127,11 +179,12 @@ class PromptVariantGenerator: # ── Internal helpers ────────────────────────────────────────────────── - def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str: + def _build_user_prompt(self, base_prompt: str, scores: ScoreResult, dimensions: list[str] | None = None) -> str: """Build the user message describing the current prompt and its scores.""" + dims = dimensions or DIMENSIONS # Build per-dimension score lines, sorted worst-first dim_lines: list[str] = [] - dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS] + dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims] dim_scores.sort(key=lambda x: x[1]) for dim, val in dim_scores: