chrysopedia/backend/pipeline/quality/variant_generator.py
jlightner c6cbb09dd3 feat: Created PromptVariantGenerator (LLM-powered prompt mutation) and…
- "backend/pipeline/quality/variant_generator.py"
- "backend/pipeline/quality/optimizer.py"

GSD-Task: S03/T01
2026-04-01 09:08:01 +00:00

194 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""LLM-powered prompt variant generator for automated optimization.
Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
analyzing per-dimension scores and producing targeted prompt mutations
that improve the weakest scoring dimensions while preserving the JSON
output format required by downstream parsing.
"""
from __future__ import annotations
import logging
from pipeline.llm_client import LLMClient
from pipeline.quality.scorer import DIMENSIONS, ScoreResult
logger = logging.getLogger(__name__)
# ── Meta-prompt for variant generation ────────────────────────────────────────
VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content synthesis.
Your task: given a synthesis prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others.
## Scoring Dimensions (each 0.01.0)
- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)
- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values
- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained
- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction
- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics
## Rules
1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions — not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
The prompt contains instructions about outputting a JSON object with a specific schema
(SynthesisResult with "pages" containing title, summary, body_sections, etc.).
Do NOT modify, remove, or rephrase any part of the JSON format instructions.
Your changes should target the prose synthesis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes — rewording a sentence or adding one adjective is not enough.
## Output
Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble.
Just the complete prompt that could be used directly as a system prompt.
"""
# Format markers that must survive variant generation — if any of these
# are present in the base prompt, the variant must also contain them.
_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
class PromptVariantGenerator:
    """Generates prompt variants by asking an LLM to act as a prompt engineer.

    Given a base prompt and its per-dimension evaluation scores, produces up
    to N mutated variants targeting the weakest-scoring dimensions.  Each
    variant is validated — it must differ meaningfully from the base and must
    preserve the JSON format markers — before being returned.
    """

    def __init__(self, client: LLMClient) -> None:
        # LLM client used for the meta-prompt completions (one call per variant).
        self.client = client

    def generate(
        self,
        base_prompt: str,
        scores: ScoreResult,
        n: int = 2,
    ) -> list[str]:
        """Generate up to *n* valid prompt variants.

        Each variant is produced by a separate LLM call with the meta-prompt.
        Variants are validated: they must differ from the base by ≥50 characters
        (or ≥3 changed lines) and must contain the JSON format instruction
        markers found in the base.  Invalid or duplicate variants are logged
        and skipped; an LLM error on one attempt does not abort the batch.

        Parameters
        ----------
        base_prompt:
            The current best synthesis prompt text.
        scores:
            ScoreResult from the most recent evaluation of *base_prompt*.
        n:
            Number of variants to attempt generating.

        Returns
        -------
        list[str]
            Valid variant prompt strings (may be fewer than *n*).
        """
        user_prompt = self._build_user_prompt(base_prompt, scores)
        # Only require the markers actually present in the base prompt — a
        # base without (say) "summary" should not force it onto variants.
        required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt]
        variants: list[str] = []
        for i in range(n):
            logger.info("Generating variant %d/%d...", i + 1, n)
            try:
                raw = self.client.complete(
                    system_prompt=VARIANT_META_PROMPT,
                    user_prompt=user_prompt,
                    response_model=None,  # free-form text, not JSON
                    modality="chat",
                )
                variant = str(raw).strip()
            except Exception:
                # Best-effort: one failed call should not abort the whole batch.
                logger.exception("LLM error generating variant %d/%d", i + 1, n)
                continue
            if not self._validate(variant, base_prompt, required_markers, i + 1):
                continue
            # Robustness: identical LLM outputs add no search diversity, so
            # skip exact duplicates of an already-accepted variant.
            if variant in variants:
                logger.warning(
                    "Variant %d/%d duplicates an earlier variant — skipping", i + 1, n
                )
                continue
            variants.append(variant)
            logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant))
        logger.info(
            "Generated %d valid variant(s) out of %d attempts", len(variants), n
        )
        return variants

    # ── Internal helpers ──────────────────────────────────────────────────

    def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str:
        """Build the user message describing the current prompt and its scores."""
        # Per-dimension score lines, sorted worst-first so the weakest
        # dimensions appear at the top of the list the LLM reads.
        dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS]
        dim_scores.sort(key=lambda x: x[1])
        dim_lines: list[str] = []
        for dim, val in dim_scores:
            justification = scores.justifications.get(dim, "")
            label = dim.replace("_", " ").title()
            line = f" {label}: {val:.2f}"
            if justification:
                # BUG FIX: the justification was previously concatenated
                # directly onto the score ("0.45Low section count"); add a
                # separator so score and rationale are distinguishable.
                line += f" — {justification}"
            dim_lines.append(line)
        weakest = dim_scores[0][0].replace("_", " ").title()
        second_weakest = (
            dim_scores[1][0].replace("_", " ").title()
            if len(dim_scores) > 1
            else weakest
        )
        return (
            f"## Current Prompt\n\n{base_prompt}\n\n"
            f"## Evaluation Scores (sorted weakest → strongest)\n\n"
            + "\n".join(dim_lines)
            + f"\n\n Composite: {scores.composite:.3f}\n\n"
            f"## Priority\n\n"
            f"The weakest dimensions are **{weakest}** and **{second_weakest}**. "
            f"Focus your prompt modifications on improving these.\n\n"
            f"Return the full modified prompt now."
        )

    def _validate(
        self,
        variant: str,
        base_prompt: str,
        required_markers: list[str],
        index: int,
    ) -> bool:
        """Check a variant meets minimum quality gates.

        Gates: non-empty; meaningfully different from *base_prompt* (length
        delta ≥50 characters OR ≥3 changed lines); contains every marker in
        *required_markers*.  *index* is used only for log messages.

        Returns True when the variant is acceptable.
        """
        if not variant:
            logger.warning("Variant %d is empty — skipping", index)
            return False
        # Length delta alone is a weak signal (a reworded prompt can keep the
        # same length), so also count changed lines via set symmetric
        # difference; a variant is rejected only if BOTH signals are low.
        diff = abs(len(variant) - len(base_prompt))
        base_lines = set(base_prompt.splitlines())
        variant_lines = set(variant.splitlines())
        changed_lines = len(base_lines.symmetric_difference(variant_lines))
        if diff < 50 and changed_lines < 3:
            logger.warning(
                "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping",
                index, diff, changed_lines,
            )
            return False
        # The variant must preserve the JSON-format instructions that
        # downstream parsing depends on (see _FORMAT_MARKERS).
        missing = [m for m in required_markers if m not in variant]
        if missing:
            logger.warning(
                "Variant %d missing format markers %s — skipping",
                index, missing,
            )
            return False
        return True