"""LLM-powered prompt variant generator for automated optimization. Uses a meta-prompt to instruct the LLM to act as a prompt engineer, analyzing per-dimension scores and producing targeted prompt mutations that improve the weakest scoring dimensions while preserving the JSON output format required by downstream parsing. """ from __future__ import annotations import logging from pipeline.llm_client import LLMClient from pipeline.quality.scorer import DIMENSIONS, ScoreResult logger = logging.getLogger(__name__) # ── Meta-prompt for variant generation ──────────────────────────────────────── VARIANT_META_PROMPT = """\ You are an expert prompt engineer specializing in LLM-powered content synthesis. Your task: given a synthesis prompt and its quality evaluation scores, produce an improved variant of the prompt that targets the weakest-scoring dimensions while maintaining or improving the others. ## Scoring Dimensions (each 0.0–1.0) - **structural** — Section naming, count (3-6), paragraph depth (2-5 per section) - **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values - **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained - **readability** — Cohesive article flow, related info merged, no redundancy or contradiction - **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics ## Rules 1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything. 2. Add specific, actionable instructions — not vague encouragements. 3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.** The prompt contains instructions about outputting a JSON object with a specific schema (SynthesisResult with "pages" containing title, summary, body_sections, etc.). Do NOT modify, remove, or rephrase any part of the JSON format instructions. Your changes should target the prose synthesis guidelines only. 4. Keep the overall prompt length within 2x of the original. Don't bloat it. 5. Make substantive changes — rewording a sentence or adding one adjective is not enough. ## Output Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble. Just the complete prompt that could be used directly as a system prompt. """ # Format markers that must survive variant generation — if any of these # are present in the base prompt, the variant must also contain them. _FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"] class PromptVariantGenerator: """Generates prompt variants by asking an LLM to act as a prompt engineer. Given a base prompt and its evaluation scores, produces N mutated variants targeting the weakest dimensions. """ def __init__(self, client: LLMClient) -> None: self.client = client def generate( self, base_prompt: str, scores: ScoreResult, n: int = 2, ) -> list[str]: """Generate up to *n* valid prompt variants. Each variant is produced by a separate LLM call with the meta-prompt. Variants are validated: they must differ from the base by ≥50 characters and must contain the JSON format instruction markers found in the base. Invalid variants are logged and skipped. Parameters ---------- base_prompt: The current best synthesis prompt text. scores: ScoreResult from the most recent evaluation of *base_prompt*. n: Number of variants to attempt generating. Returns ------- list[str] Valid variant prompt strings (may be fewer than *n*). """ user_prompt = self._build_user_prompt(base_prompt, scores) # Identify which format markers are actually present in the base required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt] variants: list[str] = [] for i in range(n): logger.info("Generating variant %d/%d...", i + 1, n) try: raw = self.client.complete( system_prompt=VARIANT_META_PROMPT, user_prompt=user_prompt, response_model=None, # free-form text, not JSON modality="chat", ) variant = str(raw).strip() except Exception: logger.exception("LLM error generating variant %d/%d", i + 1, n) continue # Validate the variant if not self._validate(variant, base_prompt, required_markers, i + 1): continue variants.append(variant) logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant)) logger.info( "Generated %d valid variant(s) out of %d attempts", len(variants), n ) return variants # ── Internal helpers ────────────────────────────────────────────────── def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str: """Build the user message describing the current prompt and its scores.""" # Build per-dimension score lines, sorted worst-first dim_lines: list[str] = [] dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS] dim_scores.sort(key=lambda x: x[1]) for dim, val in dim_scores: justification = scores.justifications.get(dim, "") label = dim.replace("_", " ").title() line = f" {label}: {val:.2f}" if justification: line += f" — {justification}" dim_lines.append(line) weakest = dim_scores[0][0].replace("_", " ").title() second_weakest = dim_scores[1][0].replace("_", " ").title() if len(dim_scores) > 1 else weakest return ( f"## Current Prompt\n\n{base_prompt}\n\n" f"## Evaluation Scores (sorted weakest → strongest)\n\n" + "\n".join(dim_lines) + f"\n\n Composite: {scores.composite:.3f}\n\n" f"## Priority\n\n" f"The weakest dimensions are **{weakest}** and **{second_weakest}**. " f"Focus your prompt modifications on improving these.\n\n" f"Return the full modified prompt now." ) def _validate( self, variant: str, base_prompt: str, required_markers: list[str], index: int, ) -> bool: """Check a variant meets minimum quality gates.""" if not variant: logger.warning("Variant %d is empty — skipping", index) return False # Must differ meaningfully from base diff = abs(len(variant) - len(base_prompt)) # Also check actual content difference via set-symmetric-difference of lines base_lines = set(base_prompt.splitlines()) variant_lines = set(variant.splitlines()) changed_lines = len(base_lines.symmetric_difference(variant_lines)) if diff < 50 and changed_lines < 3: logger.warning( "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping", index, diff, changed_lines, ) return False # Must preserve format markers missing = [m for m in required_markers if m not in variant] if missing: logger.warning( "Variant %d missing format markers %s — skipping", index, missing, ) return False return True