feat: Added STAGE_CONFIGS registry (stages 2-5) with per-stage rubrics,…

- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/variant_generator.py"

GSD-Task: S04/T01
This commit is contained in:
jlightner 2026-04-01 09:20:24 +00:00
parent 03373f263d
commit 1be0deeb76
2 changed files with 376 additions and 77 deletions

View file

@ -1,11 +1,7 @@
"""Stage 5 quality scorer — LLM-as-judge evaluation across 5 dimensions. """Multi-stage quality scorer — LLM-as-judge evaluation with per-stage rubrics.
Evaluates a synthesized technique page against source moments on: Supports stages 2-5, each with its own scoring dimensions, rubric, format
1. Structural quality section naming, count, paragraph depth markers, fixture key requirements, prompt file name, and output schema.
2. Content specificity concrete details vs vague generalities
3. Voice preservation direct quotes, attributed opinions, personality
4. Readability / flow synthesis quality, logical ordering, no redundancy
5. Factual fidelity no hallucinated specifics, grounded in source moments
Run via: python -m pipeline.quality score --file <path> Run via: python -m pipeline.quality score --file <path>
""" """
@ -16,6 +12,7 @@ import logging
import sys import sys
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any
import openai import openai
from pydantic import BaseModel from pydantic import BaseModel
@ -26,9 +23,177 @@ from pipeline.quality.voice_dial import VoiceDial
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ── Scoring rubric (hardcoded for iteration speed) ─────────────────────────── # ── Per-stage configuration registry ─────────────────────────────────────────
@dataclass(eq=False)
class StageConfig:
    """Configuration for scoring a specific pipeline stage.

    Declared as a dataclass so the constructor is generated from the field
    list and registry entries get a readable ``repr``.  ``eq=False`` keeps
    identity-based equality and hashing, as with a plain class.

    Attributes
    ----------
    stage:
        Pipeline stage number this configuration applies to.
    dimensions:
        Names of the scoring dimensions expected in the judge response.
    rubric:
        Rubric text passed to the LLM judge as the system prompt.
    format_markers:
        Marker substrings associated with the stage's output format.
    fixture_keys:
        Fixture keys required for this stage.
    prompt_file:
        File name of the stage's prompt.
    schema_class:
        Name of the output schema class, resolved by :meth:`get_schema`.
    """

    stage: int
    dimensions: list[str]
    rubric: str
    format_markers: list[str]
    fixture_keys: list[str]
    prompt_file: str
    schema_class: str

    def get_schema(self) -> type[BaseModel]:
        """Import and return the Pydantic schema class for this stage.

        Raises
        ------
        AttributeError
            If ``pipeline.schemas`` has no attribute named ``schema_class``.
        """
        # Imported lazily, only when a schema is actually requested.
        from pipeline import schemas

        return getattr(schemas, self.schema_class)
# ── Stage rubrics ────────────────────────────────────────────────────────────
_STAGE_2_RUBRIC = """\
You are an expert evaluator of transcript segmentation quality for educational content.
You will be given:
1. A segmentation result (JSON with segments, each having start_index, end_index, topic_label, summary)
2. The source transcript segments used as input
Evaluate the segmentation across these 4 dimensions, scoring each 0.0 to 1.0:
**coverage_completeness** — All transcript content accounted for
- 0.9-1.0: Every transcript segment is covered by exactly one topic segment, no gaps or overlaps
- 0.5-0.7: Minor gaps or overlaps, but most content is covered
- 0.0-0.3: Large gaps — significant transcript segments are not assigned to any topic
**topic_specificity** — Topic labels are descriptive and useful
- 0.9-1.0: Labels are specific and descriptive (e.g., "Sidechain compression on kick-bass" not "Audio processing")
- 0.5-0.7: Labels are somewhat specific but could be more descriptive
- 0.0-0.3: Labels are generic or meaningless ("Topic 1", "Discussion", "Audio")
**boundary_accuracy** — Segment boundaries align with actual topic transitions
- 0.9-1.0: Boundaries fall at natural topic transitions, segments are coherent units
- 0.5-0.7: Most boundaries are reasonable but some segments mix distinct topics
- 0.0-0.3: Boundaries seem arbitrary, segments contain unrelated content
**summary_quality** — Summaries accurately describe segment content
- 0.9-1.0: Summaries capture the key points of each segment concisely and accurately
- 0.5-0.7: Summaries are acceptable but miss some key points or are too vague
- 0.0-0.3: Summaries are inaccurate, too generic, or missing
Return ONLY a JSON object with this exact structure:
{
"coverage_completeness": <float 0.0-1.0>,
"topic_specificity": <float 0.0-1.0>,
"boundary_accuracy": <float 0.0-1.0>,
"summary_quality": <float 0.0-1.0>,
"justifications": {
"coverage_completeness": "<1-2 sentence justification>",
"topic_specificity": "<1-2 sentence justification>",
"boundary_accuracy": "<1-2 sentence justification>",
"summary_quality": "<1-2 sentence justification>"
}
}
"""
_STAGE_3_RUBRIC = """\
You are an expert evaluator of key moment extraction quality for educational content.
You will be given:
1. An extraction result (JSON with moments, each having title, summary, start_time, end_time, content_type, plugins, raw_transcript)
2. The source topic segments used as input
Evaluate the extraction across these 5 dimensions, scoring each 0.0 to 1.0:
**moment_richness** — Extracted moments capture substantial, distinct insights
- 0.9-1.0: Each moment represents a meaningful, distinct technique or concept with detailed summary
- 0.5-0.7: Moments are valid but some are thin or overlap significantly with others
- 0.0-0.3: Moments are trivial, redundant, or miss the main techniques discussed
**timestamp_accuracy** — Time ranges are plausible and well-bounded
- 0.9-1.0: Start/end times form reasonable ranges, no zero-length or absurdly long spans
- 0.5-0.7: Most timestamps are reasonable but some spans seem too wide or narrow
- 0.0-0.3: Timestamps appear arbitrary or many are zero/identical
**content_type_correctness** — Content types match the actual moment content
- 0.9-1.0: Each moment's content_type (technique/settings/reasoning/workflow) accurately describes it
- 0.5-0.7: Most are correct but 1-2 are miscategorized
- 0.0-0.3: Content types seem randomly assigned or all the same
**summary_actionability** — Summaries provide actionable, specific information
- 0.9-1.0: Summaries contain concrete details (values, settings, steps) that a practitioner could follow
- 0.5-0.7: Summaries describe the topic but lack specific actionable details
- 0.0-0.3: Summaries are vague ("discusses compression") with no actionable information
**plugin_normalization** — Plugin/tool names are correctly identified and normalized
- 0.9-1.0: Plugin names match standard names, no duplicates, captures all mentioned tools
- 0.5-0.7: Most plugins captured but some are misspelled, duplicated, or missed
- 0.0-0.3: Plugin list is mostly empty, contains non-plugins, or has many errors
Return ONLY a JSON object with this exact structure:
{
"moment_richness": <float 0.0-1.0>,
"timestamp_accuracy": <float 0.0-1.0>,
"content_type_correctness": <float 0.0-1.0>,
"summary_actionability": <float 0.0-1.0>,
"plugin_normalization": <float 0.0-1.0>,
"justifications": {
"moment_richness": "<1-2 sentence justification>",
"timestamp_accuracy": "<1-2 sentence justification>",
"content_type_correctness": "<1-2 sentence justification>",
"summary_actionability": "<1-2 sentence justification>",
"plugin_normalization": "<1-2 sentence justification>"
}
}
"""
_STAGE_4_RUBRIC = """\
You are an expert evaluator of content classification quality for educational content.
You will be given:
1. A classification result (JSON with classifications, each having moment_index, topic_category, topic_tags)
2. The source extracted moments used as input
Evaluate the classification across these 4 dimensions, scoring each 0.0 to 1.0:
**category_accuracy** — Topic categories are appropriate and meaningful
- 0.9-1.0: Categories accurately reflect the primary topic of each moment, using domain-appropriate labels
- 0.5-0.7: Most categories are reasonable but some are too broad or slightly off
- 0.0-0.3: Categories are generic ("Music"), incorrect, or all the same
**tag_completeness** — All relevant tags are captured
- 0.9-1.0: Tags capture the key concepts, tools, and techniques in each moment comprehensively
- 0.5-0.7: Main tags are present but secondary concepts or tools are missed
- 0.0-0.3: Tags are sparse, missing major concepts mentioned in the moments
**tag_specificity** — Tags are specific enough to be useful for search/filtering
- 0.9-1.0: Tags are specific ("sidechain compression", "Pro-Q 3") not generic ("audio", "mixing")
- 0.5-0.7: Mix of specific and generic tags
- 0.0-0.3: Tags are too generic to meaningfully distinguish moments
**coverage** — All moments are classified
- 0.9-1.0: Every moment_index from the input has a corresponding classification entry
- 0.5-0.7: Most moments classified but 1-2 are missing
- 0.0-0.3: Many moments are not classified
Return ONLY a JSON object with this exact structure:
{
"category_accuracy": <float 0.0-1.0>,
"tag_completeness": <float 0.0-1.0>,
"tag_specificity": <float 0.0-1.0>,
"coverage": <float 0.0-1.0>,
"justifications": {
"category_accuracy": "<1-2 sentence justification>",
"tag_completeness": "<1-2 sentence justification>",
"tag_specificity": "<1-2 sentence justification>",
"coverage": "<1-2 sentence justification>"
}
}
"""
_STAGE_5_RUBRIC = """\
You are an expert evaluator of synthesized technique articles for music production education. You are an expert evaluator of synthesized technique articles for music production education.
You will be given: You will be given:
@ -79,73 +244,142 @@ Return ONLY a JSON object with this exact structure:
} }
""" """
# Backward-compat alias used by synthesize_and_score and external references
SCORING_RUBRIC = _STAGE_5_RUBRIC

# Stage registry. Each entry is keyed by its own ``stage`` number, so the
# dict key and the config's ``stage`` attribute cannot drift apart.
STAGE_CONFIGS: dict[int, StageConfig] = {
    cfg.stage: cfg
    for cfg in (
        StageConfig(
            stage=2,
            dimensions=[
                "coverage_completeness",
                "topic_specificity",
                "boundary_accuracy",
                "summary_quality",
            ],
            rubric=_STAGE_2_RUBRIC,
            format_markers=[
                "segments",
                "start_index",
                "end_index",
                "topic_label",
            ],
            fixture_keys=["transcript_segments"],
            prompt_file="stage2_segmentation.txt",
            schema_class="SegmentationResult",
        ),
        StageConfig(
            stage=3,
            dimensions=[
                "moment_richness",
                "timestamp_accuracy",
                "content_type_correctness",
                "summary_actionability",
                "plugin_normalization",
            ],
            rubric=_STAGE_3_RUBRIC,
            format_markers=[
                "moments",
                "content_type",
                "raw_transcript",
                "plugins",
            ],
            fixture_keys=["topic_segments"],
            prompt_file="stage3_extraction.txt",
            schema_class="ExtractionResult",
        ),
        StageConfig(
            stage=4,
            dimensions=[
                "category_accuracy",
                "tag_completeness",
                "tag_specificity",
                "coverage",
            ],
            rubric=_STAGE_4_RUBRIC,
            format_markers=[
                "classifications",
                "moment_index",
                "topic_category",
                "topic_tags",
            ],
            fixture_keys=["extracted_moments"],
            prompt_file="stage4_classification.txt",
            schema_class="ClassificationResult",
        ),
        StageConfig(
            stage=5,
            dimensions=[
                "structural",
                "content_specificity",
                "voice_preservation",
                "readability",
                "factual_fidelity",
            ],
            rubric=SCORING_RUBRIC,
            format_markers=[
                "SynthesisResult",
                '"pages"',
                "body_sections",
                "title",
                "summary",
            ],
            fixture_keys=["key_moments", "creator_name"],
            prompt_file="stage5_synthesis.txt",
            schema_class="SynthesisResult",
        ),
    )
}

# Backward-compatible alias: stage 5 dimensions list. This is the same list
# object as the registry entry, not a copy.
DIMENSIONS = STAGE_CONFIGS[5].dimensions
# ── Result type ────────────────────────────────────────────────────────────── # ── Result type ──────────────────────────────────────────────────────────────
@dataclass @dataclass
class ScoreResult: class ScoreResult:
"""Outcome of scoring a technique page across 5 quality dimensions.""" """Outcome of scoring a stage output across quality dimensions.
structural: float = 0.0 Uses a generic ``scores`` dict keyed by dimension name. Stage 5's
content_specificity: float = 0.0 original named fields (structural, content_specificity, ) are
voice_preservation: float = 0.0 preserved as properties for backward compatibility.
readability: float = 0.0 """
factual_fidelity: float = 0.0
scores: dict[str, float] = field(default_factory=dict)
composite: float = 0.0 composite: float = 0.0
justifications: dict[str, str] = field(default_factory=dict) justifications: dict[str, str] = field(default_factory=dict)
elapsed_seconds: float = 0.0 elapsed_seconds: float = 0.0
error: str | None = None error: str | None = None
# ── Backward-compat properties for stage 5 named dimensions ──────
@property
def structural(self) -> float:
return self.scores.get("structural", 0.0)
@property
def content_specificity(self) -> float:
return self.scores.get("content_specificity", 0.0)
@property
def voice_preservation(self) -> float:
return self.scores.get("voice_preservation", 0.0)
@property
def readability(self) -> float:
return self.scores.get("readability", 0.0)
@property
def factual_fidelity(self) -> float:
return self.scores.get("factual_fidelity", 0.0)
# ── Runner ─────────────────────────────────────────────────────────────────── # ── Runner ───────────────────────────────────────────────────────────────────
class ScoreRunner: class ScoreRunner:
"""Scores a Stage 5 technique page using LLM-as-judge evaluation.""" """Scores pipeline stage outputs using LLM-as-judge evaluation."""
def __init__(self, client: LLMClient) -> None: def __init__(self, client: LLMClient) -> None:
self.client = client self.client = client
def score_page( # ── Generic stage scorer ─────────────────────────────────────────────
def score_stage_output(
self, self,
page_json: dict, stage: int,
moments: list[dict], output_json: dict | list,
input_json: dict | list,
) -> ScoreResult: ) -> ScoreResult:
"""Evaluate a technique page against source moments. """Score an arbitrary stage's output against its input.
Parameters Parameters
---------- ----------
page_json: stage:
Synthesized page dict (title, summary, body_sections). Pipeline stage number (2-5).
moments: output_json:
Source key moments with transcript_excerpt, summary, etc. The stage output to evaluate (parsed JSON).
input_json:
The stage input / source material.
Returns Returns
------- -------
ScoreResult with per-dimension scores and justifications. ScoreResult with per-dimension scores for the requested stage.
""" """
# Build the user prompt with the page and source moments if stage not in STAGE_CONFIGS:
return ScoreResult(error=f"No config for stage {stage}. Valid: {sorted(STAGE_CONFIGS)}")
cfg = STAGE_CONFIGS[stage]
user_prompt = ( user_prompt = (
"## Synthesized Technique Page\n\n" "## Stage Output\n\n"
f"```json\n{json.dumps(page_json, indent=2)}\n```\n\n" f"```json\n{json.dumps(output_json, indent=2)}\n```\n\n"
"## Source Key Moments\n\n" "## Stage Input\n\n"
f"```json\n{json.dumps(moments, indent=2)}\n```\n\n" f"```json\n{json.dumps(input_json, indent=2)}\n```\n\n"
"Score this page across all 5 dimensions." f"Score this stage {stage} output across all {len(cfg.dimensions)} dimensions."
) )
t0 = time.monotonic() t0 = time.monotonic()
try: try:
resp = self.client.complete( resp = self.client.complete(
system_prompt=SCORING_RUBRIC, system_prompt=cfg.rubric,
user_prompt=user_prompt, user_prompt=user_prompt,
response_model=BaseModel, # triggers JSON mode response_model=BaseModel,
modality="chat", modality="chat",
) )
elapsed = round(time.monotonic() - t0, 2) elapsed = round(time.monotonic() - t0, 2)
@ -155,13 +389,9 @@ class ScoreRunner:
fallback = self.client.settings.llm_fallback_url fallback = self.client.settings.llm_fallback_url
return ScoreResult( return ScoreResult(
elapsed_seconds=elapsed, elapsed_seconds=elapsed,
error=( error=f"Cannot reach LLM endpoint at {url} (fallback {fallback}). Error: {exc}",
f"Cannot reach LLM endpoint at {url} (fallback {fallback}). "
f"Error: {exc}"
),
) )
# Parse the LLM judge response
raw_text = str(resp).strip() raw_text = str(resp).strip()
try: try:
parsed = json.loads(raw_text) parsed = json.loads(raw_text)
@ -172,10 +402,27 @@ class ScoreRunner:
error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}", error=f"Malformed judge response (not valid JSON). Raw excerpt: {raw_text[:200]}",
) )
return self._parse_scores(parsed, elapsed, cfg.dimensions)
# ── Stage 5 convenience (backward compat) ────────────────────────────
def score_page(
    self,
    page_json: dict,
    moments: list[dict],
) -> ScoreResult:
    """Evaluate a stage 5 technique page against source moments.

    Backward-compatible convenience wrapper: delegates to
    ``score_stage_output`` with ``stage=5``, passing the page as the stage
    output and the moments as the stage input.

    Parameters
    ----------
    page_json:
        The stage 5 page to evaluate, forwarded as ``output_json``.
    moments:
        The source moments, forwarded as ``input_json``.

    Returns
    -------
    Whatever ``score_stage_output`` returns for stage 5.
    """
    return self.score_stage_output(
        stage=5,
        output_json=page_json,
        input_json=moments,
    )
def _parse_scores(self, parsed: dict, elapsed: float) -> ScoreResult: def _parse_scores(self, parsed: dict, elapsed: float, dimensions: list[str] | None = None) -> ScoreResult:
"""Extract and validate scores from parsed JSON response.""" """Extract and validate scores from parsed JSON response."""
dims = dimensions or DIMENSIONS
scores: dict[str, float] = {} scores: dict[str, float] = {}
justifications: dict[str, str] = {} justifications: dict[str, str] = {}
@ -183,7 +430,7 @@ class ScoreRunner:
if not isinstance(raw_justifications, dict): if not isinstance(raw_justifications, dict):
raw_justifications = {} raw_justifications = {}
for dim in DIMENSIONS: for dim in dims:
raw = parsed.get(dim) raw = parsed.get(dim)
if raw is None: if raw is None:
logger.warning("Missing dimension '%s' in judge response", dim) logger.warning("Missing dimension '%s' in judge response", dim)
@ -202,14 +449,10 @@ class ScoreRunner:
justifications[dim] = str(raw_justifications.get(dim, "")) justifications[dim] = str(raw_justifications.get(dim, ""))
composite = sum(scores.values()) / len(DIMENSIONS) composite = sum(scores.values()) / len(dims) if dims else 0.0
return ScoreResult( return ScoreResult(
structural=scores["structural"], scores=scores,
content_specificity=scores["content_specificity"],
voice_preservation=scores["voice_preservation"],
readability=scores["readability"],
factual_fidelity=scores["factual_fidelity"],
composite=round(composite, 3), composite=round(composite, 3),
justifications=justifications, justifications=justifications,
elapsed_seconds=elapsed, elapsed_seconds=elapsed,
@ -318,10 +561,13 @@ class ScoreRunner:
result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2) result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
return result return result
def print_report(self, result: ScoreResult) -> None: def print_report(self, result: ScoreResult, stage: int = 5) -> None:
"""Print a formatted scoring report to stdout.""" """Print a formatted scoring report to stdout."""
dims = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else list(result.scores.keys())
stage_label = f"STAGE {stage}" if stage in STAGE_CONFIGS else "QUALITY"
print("\n" + "=" * 60) print("\n" + "=" * 60)
print(" STAGE 5 QUALITY SCORE REPORT") print(f" {stage_label} QUALITY SCORE REPORT")
print("=" * 60) print("=" * 60)
if result.error: if result.error:
@ -329,8 +575,8 @@ class ScoreRunner:
print("=" * 60 + "\n") print("=" * 60 + "\n")
return return
for dim in DIMENSIONS: for dim in dims:
score = getattr(result, dim) score = result.scores.get(dim, 0.0)
bar = self._score_bar(score) bar = self._score_bar(score)
justification = result.justifications.get(dim, "") justification = result.justifications.get(dim, "")
print(f"\n {dim.replace('_', ' ').title()}") print(f"\n {dim.replace('_', ' ').title()}")

View file

@ -4,13 +4,17 @@ Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
analyzing per-dimension scores and producing targeted prompt mutations analyzing per-dimension scores and producing targeted prompt mutations
that improve the weakest scoring dimensions while preserving the JSON that improve the weakest scoring dimensions while preserving the JSON
output format required by downstream parsing. output format required by downstream parsing.
Supports any pipeline stage (2-5) — callers pass the stage's dimensions
and format markers so the meta-prompt and validation adapt automatically.
""" """
from __future__ import annotations from __future__ import annotations
import logging import logging
from typing import Sequence
from pipeline.llm_client import LLMClient from pipeline.llm_client import LLMClient
from pipeline.quality.scorer import DIMENSIONS, ScoreResult from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -18,29 +22,24 @@ logger = logging.getLogger(__name__)
# ── Meta-prompt for variant generation ──────────────────────────────────────── # ── Meta-prompt for variant generation ────────────────────────────────────────
VARIANT_META_PROMPT = """\ VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content synthesis. You are an expert prompt engineer specializing in LLM-powered content processing pipelines.
Your task: given a synthesis prompt and its quality evaluation scores, produce an Your task: given a pipeline stage prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others. maintaining or improving the others.
## Scoring Dimensions (each 0.0–1.0) ## Scoring Dimensions (each 0.0–1.0)
- **structural** Section naming, count (3-6), paragraph depth (2-5 per section) {dimension_descriptions}
- **content_specificity** Concrete details: frequencies, time values, ratios, plugin names, dB values
- **voice_preservation** Direct quotes preserved, opinions attributed to creator by name, personality retained
- **readability** Cohesive article flow, related info merged, no redundancy or contradiction
- **factual_fidelity** Every claim traceable to source material, no hallucinated specifics
## Rules ## Rules
1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything. 1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions — not vague encouragements. 2. Add specific, actionable instructions — not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.** 3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
The prompt contains instructions about outputting a JSON object with a specific schema The prompt contains instructions about outputting a JSON object with a specific schema.
(SynthesisResult with "pages" containing title, summary, body_sections, etc.).
Do NOT modify, remove, or rephrase any part of the JSON format instructions. Do NOT modify, remove, or rephrase any part of the JSON format instructions.
Your changes should target the prose synthesis guidelines only. Your changes should target the processing/analysis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it. 4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes — rewording a sentence or adding one adjective is not enough. 5. Make substantive changes — rewording a sentence or adding one adjective is not enough.
@ -50,9 +49,38 @@ Return ONLY the full modified prompt text. No explanation, no markdown fences, n
Just the complete prompt that could be used directly as a system prompt. Just the complete prompt that could be used directly as a system prompt.
""" """
# Per-stage bullet lists describing each scoring dimension, keyed by stage
# number. The selected entry fills the ``{dimension_descriptions}`` slot of
# the meta-prompt template. Bullets are newline-joined with no trailing
# newline.
_DIMENSION_DESCRIPTIONS: dict[int, str] = {
    2: "\n".join((
        "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps",
        "- **topic_specificity** — Topic labels are descriptive and useful, not generic",
        "- **boundary_accuracy** — Segment boundaries align with actual topic transitions",
        "- **summary_quality** — Summaries accurately describe segment content",
    )),
    3: "\n".join((
        "- **moment_richness** — Extracted moments capture substantial, distinct insights",
        "- **timestamp_accuracy** — Time ranges are plausible and well-bounded",
        "- **content_type_correctness** — Content types match the actual moment content",
        "- **summary_actionability** — Summaries provide actionable, specific information",
        "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized",
    )),
    4: "\n".join((
        "- **category_accuracy** — Topic categories are appropriate and meaningful",
        "- **tag_completeness** — All relevant tags are captured",
        "- **tag_specificity** — Tags are specific enough to be useful for search/filtering",
        "- **coverage** — All moments are classified",
    )),
    5: "\n".join((
        "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)",
        "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values",
        "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained",
        "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction",
        "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics",
    )),
}
# Format markers that must survive variant generation — if any of these
# are present in the base prompt, the variant must also contain them. # Legacy default format markers for stage 5
_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"] _FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
@ -71,6 +99,9 @@ class PromptVariantGenerator:
base_prompt: str, base_prompt: str,
scores: ScoreResult, scores: ScoreResult,
n: int = 2, n: int = 2,
*,
format_markers: Sequence[str] | None = None,
stage: int = 5,
) -> list[str]: ) -> list[str]:
"""Generate up to *n* valid prompt variants. """Generate up to *n* valid prompt variants.
@ -83,27 +114,48 @@ class PromptVariantGenerator:
Parameters Parameters
---------- ----------
base_prompt: base_prompt:
The current best synthesis prompt text. The current best prompt text for the target stage.
scores: scores:
ScoreResult from the most recent evaluation of *base_prompt*. ScoreResult from the most recent evaluation of *base_prompt*.
n: n:
Number of variants to attempt generating. Number of variants to attempt generating.
format_markers:
Override format markers for validation. When *None*, uses the
markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5
defaults for backward compat).
stage:
Pipeline stage number (2-5), used to select dimension
descriptions for the meta-prompt and default format markers.
Returns Returns
------- -------
list[str] list[str]
Valid variant prompt strings (may be fewer than *n*). Valid variant prompt strings (may be fewer than *n*).
""" """
user_prompt = self._build_user_prompt(base_prompt, scores) # Resolve format markers and dimensions for the target stage
if format_markers is not None:
markers = list(format_markers)
elif stage in STAGE_CONFIGS:
markers = STAGE_CONFIGS[stage].format_markers
else:
markers = _FORMAT_MARKERS
dimensions = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS
# Build the system prompt with stage-appropriate dimension descriptions
dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5])
system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc)
user_prompt = self._build_user_prompt(base_prompt, scores, dimensions)
# Identify which format markers are actually present in the base # Identify which format markers are actually present in the base
required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt] required_markers = [m for m in markers if m in base_prompt]
variants: list[str] = [] variants: list[str] = []
for i in range(n): for i in range(n):
logger.info("Generating variant %d/%d...", i + 1, n) logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage)
try: try:
raw = self.client.complete( raw = self.client.complete(
system_prompt=VARIANT_META_PROMPT, system_prompt=system_prompt,
user_prompt=user_prompt, user_prompt=user_prompt,
response_model=None, # free-form text, not JSON response_model=None, # free-form text, not JSON
modality="chat", modality="chat",
@ -127,11 +179,12 @@ class PromptVariantGenerator:
# ── Internal helpers ────────────────────────────────────────────────── # ── Internal helpers ──────────────────────────────────────────────────
def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str: def _build_user_prompt(self, base_prompt: str, scores: ScoreResult, dimensions: list[str] | None = None) -> str:
"""Build the user message describing the current prompt and its scores.""" """Build the user message describing the current prompt and its scores."""
dims = dimensions or DIMENSIONS
# Build per-dimension score lines, sorted worst-first # Build per-dimension score lines, sorted worst-first
dim_lines: list[str] = [] dim_lines: list[str] = []
dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS] dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims]
dim_scores.sort(key=lambda x: x[1]) dim_scores.sort(key=lambda x: x[1])
for dim, val in dim_scores: for dim, val in dim_scores: