"""LLM-powered prompt variant generator for automated optimization. Uses a meta-prompt to instruct the LLM to act as a prompt engineer, analyzing per-dimension scores and producing targeted prompt mutations that improve the weakest scoring dimensions while preserving the JSON output format required by downstream parsing. Supports any pipeline stage (2-5) — callers pass the stage's dimensions and format markers so the meta-prompt and validation adapt automatically. """ from __future__ import annotations import logging from typing import Sequence from pipeline.llm_client import LLMClient from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult logger = logging.getLogger(__name__) # ── Meta-prompt for variant generation ──────────────────────────────────────── VARIANT_META_PROMPT = """\ You are an expert prompt engineer specializing in LLM-powered content processing pipelines. Your task: given a pipeline stage prompt and its quality evaluation scores, produce an improved variant of the prompt that targets the weakest-scoring dimensions while maintaining or improving the others. ## Scoring Dimensions (each 0.0–1.0) {dimension_descriptions} ## Rules 1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything. 2. Add specific, actionable instructions — not vague encouragements. 3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.** The prompt contains instructions about outputting a JSON object with a specific schema. Do NOT modify, remove, or rephrase any part of the JSON format instructions. Your changes should target the processing/analysis guidelines only. 4. Keep the overall prompt length within 2x of the original. Don't bloat it. 5. Make substantive changes — rewording a sentence or adding one adjective is not enough. ## Output Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble. Just the complete prompt that could be used directly as a system prompt. """ # Dimension descriptions per stage, used to fill the meta-prompt template. _DIMENSION_DESCRIPTIONS: dict[int, str] = { 2: ( "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n" "- **topic_specificity** — Topic labels are descriptive and useful, not generic\n" "- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n" "- **summary_quality** — Summaries accurately describe segment content" ), 3: ( "- **moment_richness** — Extracted moments capture substantial, distinct insights\n" "- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n" "- **content_type_correctness** — Content types match the actual moment content\n" "- **summary_actionability** — Summaries provide actionable, specific information\n" "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized" ), 4: ( "- **category_accuracy** — Topic categories are appropriate and meaningful\n" "- **tag_completeness** — All relevant tags are captured\n" "- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n" "- **coverage** — All moments are classified" ), 5: ( "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n" "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n" "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n" "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n" "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics" ), } # Legacy default format markers for stage 5 _FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"] class PromptVariantGenerator: """Generates prompt variants by asking an LLM to act as a prompt engineer. Given a base prompt and its evaluation scores, produces N mutated variants targeting the weakest dimensions. """ def __init__(self, client: LLMClient) -> None: self.client = client def generate( self, base_prompt: str, scores: ScoreResult, n: int = 2, *, format_markers: Sequence[str] | None = None, stage: int = 5, ) -> list[str]: """Generate up to *n* valid prompt variants. Each variant is produced by a separate LLM call with the meta-prompt. Variants are validated: they must differ from the base by ≥50 characters and must contain the JSON format instruction markers found in the base. Invalid variants are logged and skipped. Parameters ---------- base_prompt: The current best prompt text for the target stage. scores: ScoreResult from the most recent evaluation of *base_prompt*. n: Number of variants to attempt generating. format_markers: Override format markers for validation. When *None*, uses the markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5 defaults for backward compat). stage: Pipeline stage number (2-5), used to select dimension descriptions for the meta-prompt and default format markers. Returns ------- list[str] Valid variant prompt strings (may be fewer than *n*). """ # Resolve format markers and dimensions for the target stage if format_markers is not None: markers = list(format_markers) elif stage in STAGE_CONFIGS: markers = STAGE_CONFIGS[stage].format_markers else: markers = _FORMAT_MARKERS dimensions = STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS # Build the system prompt with stage-appropriate dimension descriptions dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5]) system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc) user_prompt = self._build_user_prompt(base_prompt, scores, dimensions) # Identify which format markers are actually present in the base required_markers = [m for m in markers if m in base_prompt] variants: list[str] = [] for i in range(n): logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage) try: raw = self.client.complete( system_prompt=system_prompt, user_prompt=user_prompt, response_model=None, # free-form text, not JSON modality="chat", ) variant = str(raw).strip() except Exception: logger.exception("LLM error generating variant %d/%d", i + 1, n) continue # Validate the variant if not self._validate(variant, base_prompt, required_markers, i + 1): continue variants.append(variant) logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant)) logger.info( "Generated %d valid variant(s) out of %d attempts", len(variants), n ) return variants # ── Internal helpers ────────────────────────────────────────────────── def _build_user_prompt(self, base_prompt: str, scores: ScoreResult, dimensions: list[str] | None = None) -> str: """Build the user message describing the current prompt and its scores.""" dims = dimensions or DIMENSIONS # Build per-dimension score lines, sorted worst-first dim_lines: list[str] = [] dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims] dim_scores.sort(key=lambda x: x[1]) for dim, val in dim_scores: justification = scores.justifications.get(dim, "") label = dim.replace("_", " ").title() line = f" {label}: {val:.2f}" if justification: line += f" — {justification}" dim_lines.append(line) weakest = dim_scores[0][0].replace("_", " ").title() second_weakest = dim_scores[1][0].replace("_", " ").title() if len(dim_scores) > 1 else weakest return ( f"## Current Prompt\n\n{base_prompt}\n\n" f"## Evaluation Scores (sorted weakest → strongest)\n\n" + "\n".join(dim_lines) + f"\n\n Composite: {scores.composite:.3f}\n\n" f"## Priority\n\n" f"The weakest dimensions are **{weakest}** and **{second_weakest}**. " f"Focus your prompt modifications on improving these.\n\n" f"Return the full modified prompt now." ) def _validate( self, variant: str, base_prompt: str, required_markers: list[str], index: int, ) -> bool: """Check a variant meets minimum quality gates.""" if not variant: logger.warning("Variant %d is empty — skipping", index) return False # Must differ meaningfully from base diff = abs(len(variant) - len(base_prompt)) # Also check actual content difference via set-symmetric-difference of lines base_lines = set(base_prompt.splitlines()) variant_lines = set(variant.splitlines()) changed_lines = len(base_lines.symmetric_difference(variant_lines)) if diff < 50 and changed_lines < 3: logger.warning( "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping", index, diff, changed_lines, ) return False # Must preserve format markers missing = [m for m in required_markers if m not in variant] if missing: logger.warning( "Variant %d missing format markers %s — skipping", index, missing, ) return False return True