chrysopedia/backend/pipeline/quality/variant_generator.py
jlightner e740798f7c feat: Added STAGE_CONFIGS registry (stages 2-5) with per-stage rubrics,…
- "backend/pipeline/quality/scorer.py"
- "backend/pipeline/quality/variant_generator.py"

GSD-Task: S04/T01
2026-04-01 09:20:24 +00:00

247 lines
10 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""LLM-powered prompt variant generator for automated optimization.
Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
analyzing per-dimension scores and producing targeted prompt mutations
that improve the weakest scoring dimensions while preserving the JSON
output format required by downstream parsing.
Supports any pipeline stage (2-5) — callers pass the stage's dimensions
and format markers so the meta-prompt and validation adapt automatically.
"""
from __future__ import annotations
import logging
from typing import Sequence
from pipeline.llm_client import LLMClient
from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult
logger = logging.getLogger(__name__)
# ── Meta-prompt for variant generation ────────────────────────────────────────
# System prompt template for the "act as a prompt engineer" call.
# {dimension_descriptions} is filled per-stage from _DIMENSION_DESCRIPTIONS
# before each LLM call (see PromptVariantGenerator.generate).
VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content processing pipelines.
Your task: given a pipeline stage prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others.

## Scoring Dimensions (each 0.0–1.0)

{dimension_descriptions}

## Rules

1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions — not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
   The prompt contains instructions about outputting a JSON object with a specific schema.
   Do NOT modify, remove, or rephrase any part of the JSON format instructions.
   Your changes should target the processing/analysis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes — rewording a sentence or adding one adjective is not enough.

## Output

Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble.
Just the complete prompt that could be used directly as a system prompt.
"""
# Dimension descriptions per stage, used to fill the meta-prompt template.
# Keys are pipeline stage numbers (2-5); each value is a pre-formatted markdown
# bullet list substituted into VARIANT_META_PROMPT's {dimension_descriptions}
# slot. Unknown stages fall back to the stage-5 entry in
# PromptVariantGenerator.generate. The bullet names mirror the per-stage
# scoring dimensions used by the scorer (see STAGE_CONFIGS in scorer.py).
_DIMENSION_DESCRIPTIONS: dict[int, str] = {
    2: (
        "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n"
        "- **topic_specificity** — Topic labels are descriptive and useful, not generic\n"
        "- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n"
        "- **summary_quality** — Summaries accurately describe segment content"
    ),
    3: (
        "- **moment_richness** — Extracted moments capture substantial, distinct insights\n"
        "- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n"
        "- **content_type_correctness** — Content types match the actual moment content\n"
        "- **summary_actionability** — Summaries provide actionable, specific information\n"
        "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized"
    ),
    4: (
        "- **category_accuracy** — Topic categories are appropriate and meaningful\n"
        "- **tag_completeness** — All relevant tags are captured\n"
        "- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n"
        "- **coverage** — All moments are classified"
    ),
    5: (
        "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n"
        "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n"
        "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n"
        "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n"
        "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics"
    ),
}
# Legacy default format markers for stage 5.
# Fallback used by PromptVariantGenerator.generate when the requested stage is
# not registered in STAGE_CONFIGS (backward compat for pre-registry callers).
_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
class PromptVariantGenerator:
    """Generates prompt variants by asking an LLM to act as a prompt engineer.

    Given a base prompt and its evaluation scores, produces N mutated
    variants targeting the weakest dimensions.
    """

    def __init__(self, client: LLMClient) -> None:
        # Client used for the meta-prompt completions; one call per variant.
        self.client = client

    def generate(
        self,
        base_prompt: str,
        scores: ScoreResult,
        n: int = 2,
        *,
        format_markers: Sequence[str] | None = None,
        stage: int = 5,
    ) -> list[str]:
        """Generate up to *n* valid prompt variants.

        Each variant is produced by a separate LLM call with the meta-prompt.
        Variants are validated: they must differ from the base meaningfully
        (≥50 characters of length difference or ≥3 changed lines) and must
        contain the JSON format instruction markers found in the base.
        Invalid variants are logged and skipped.

        Parameters
        ----------
        base_prompt:
            The current best prompt text for the target stage.
        scores:
            ScoreResult from the most recent evaluation of *base_prompt*.
        n:
            Number of variants to attempt generating.
        format_markers:
            Override format markers for validation. When *None*, uses the
            markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5
            defaults for backward compat).
        stage:
            Pipeline stage number (2-5), used to select dimension
            descriptions for the meta-prompt and default format markers.

        Returns
        -------
        list[str]
            Valid variant prompt strings (may be fewer than *n*).
        """
        # Resolve format markers and dimensions for the target stage.
        # Every branch copies into a fresh list so neither the caller's
        # sequence nor the shared registry/module defaults can be mutated.
        if format_markers is not None:
            markers = list(format_markers)
        elif stage in STAGE_CONFIGS:
            markers = list(STAGE_CONFIGS[stage].format_markers)
        else:
            markers = list(_FORMAT_MARKERS)
        dimensions = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS

        # Build the system prompt with stage-appropriate dimension descriptions
        # (unknown stages fall back to the stage-5 descriptions).
        dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5])
        system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc)
        user_prompt = self._build_user_prompt(base_prompt, scores, dimensions)

        # Only require markers actually present in the base — a marker the base
        # never contained cannot be "preserved" by a variant.
        required_markers = [m for m in markers if m in base_prompt]

        variants: list[str] = []
        for i in range(n):
            logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage)
            try:
                raw = self.client.complete(
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                    response_model=None,  # free-form text, not JSON
                    modality="chat",
                )
                variant = str(raw).strip()
            except Exception:
                # One failed LLM call should not abort the whole batch.
                logger.exception("LLM error generating variant %d/%d", i + 1, n)
                continue
            # Validate the variant before accepting it.
            if not self._validate(variant, base_prompt, required_markers, i + 1):
                continue
            variants.append(variant)
            logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant))
        logger.info(
            "Generated %d valid variant(s) out of %d attempts", len(variants), n
        )
        return variants

    # ── Internal helpers ──────────────────────────────────────────────────

    def _build_user_prompt(
        self,
        base_prompt: str,
        scores: ScoreResult,
        dimensions: list[str] | None = None,
    ) -> str:
        """Build the user message describing the current prompt and its scores.

        Lists per-dimension scores sorted worst-first (so the most important
        problems appear at the top), then names the two weakest dimensions as
        the explicit priority for the rewrite.
        """
        dims = dimensions or DIMENSIONS
        # Build per-dimension score lines, sorted worst-first.
        dim_lines: list[str] = []
        dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims]
        dim_scores.sort(key=lambda x: x[1])
        for dim, val in dim_scores:
            justification = scores.justifications.get(dim, "")
            label = dim.replace("_", " ").title()
            line = f" {label}: {val:.2f}"
            if justification:
                # Separate score from justification; without the " — " the two
                # ran together (e.g. "0.45Too many gaps").
                line += f" — {justification}"
            dim_lines.append(line)
        weakest = dim_scores[0][0].replace("_", " ").title()
        second_weakest = dim_scores[1][0].replace("_", " ").title() if len(dim_scores) > 1 else weakest
        return (
            f"## Current Prompt\n\n{base_prompt}\n\n"
            f"## Evaluation Scores (sorted weakest → strongest)\n\n"
            + "\n".join(dim_lines)
            + f"\n\n Composite: {scores.composite:.3f}\n\n"
            f"## Priority\n\n"
            f"The weakest dimensions are **{weakest}** and **{second_weakest}**. "
            f"Focus your prompt modifications on improving these.\n\n"
            f"Return the full modified prompt now."
        )

    def _validate(
        self,
        variant: str,
        base_prompt: str,
        required_markers: list[str],
        index: int,
    ) -> bool:
        """Check a variant meets minimum quality gates.

        Gates: non-empty; meaningfully different from the base (≥50 chars of
        length difference OR ≥3 changed lines); all *required_markers* present.
        Returns True on pass; logs a warning and returns False otherwise.
        """
        if not variant:
            logger.warning("Variant %d is empty — skipping", index)
            return False
        # Must differ meaningfully from base.
        diff = abs(len(variant) - len(base_prompt))
        # Length difference alone misses same-length rewrites, so also count
        # content changes via the set-symmetric-difference of lines.
        base_lines = set(base_prompt.splitlines())
        variant_lines = set(variant.splitlines())
        changed_lines = len(base_lines.symmetric_difference(variant_lines))
        if diff < 50 and changed_lines < 3:
            logger.warning(
                "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping",
                index, diff, changed_lines,
            )
            return False
        # Must preserve the JSON format instruction markers from the base.
        missing = [m for m in required_markers if m not in variant]
        if missing:
            logger.warning(
                "Variant %d missing format markers %s — skipping",
                index, missing,
            )
            return False
        return True