feat: Created PromptVariantGenerator (LLM-powered prompt mutation) and…

- "backend/pipeline/quality/variant_generator.py" - "backend/pipeline/quality/optimizer.py" GSD-Task: S03/T01
2026-04-01 09:08:01 +00:00 · 2026-04-01 09:08:01 +00:00 · 0d82b2b409
commit 0d82b2b409
parent 0086573af5
4 changed files with 570 additions and 0 deletions
--- a/backend/pipeline/quality/init.py
+++ b/backend/pipeline/quality/init.py
@ -0,0 +1,11 @@
+"""FYN-LLM quality assurance toolkit."""
+
+import os
+import sys
+
+# Ensure backend/ is on sys.path so sibling modules (config, pipeline.llm_client)
+# resolve when running from the project root via symlink.
+_backend_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")
+_backend_abs = os.path.normpath(os.path.abspath(_backend_dir))
+if _backend_abs not in sys.path:
+    sys.path.insert(0, _backend_abs)
--- a/backend/pipeline/quality/optimizer.py
+++ b/backend/pipeline/quality/optimizer.py
@ -0,0 +1,364 @@
+"""Automated prompt optimization loop for Stage 5 synthesis.
+
+Orchestrates a generate→score→select cycle:
+1. Score the current best prompt against reference fixtures
+2. Generate N variants targeting weak dimensions
+3. Score each variant
+4. Keep the best scorer as the new baseline
+5. Repeat for K iterations
+
+Usage (via CLI):
+    python -m pipeline.quality optimize --stage 5 --iterations 10
+"""
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from pipeline.llm_client import LLMClient
+from pipeline.quality.scorer import DIMENSIONS, ScoreResult, ScoreRunner
+from pipeline.quality.variant_generator import PromptVariantGenerator
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class OptimizationResult:
+    """Full result of an optimization run."""
+
+    best_prompt: str = ""
+    best_score: ScoreResult = field(default_factory=ScoreResult)
+    history: list[dict] = field(default_factory=list)
+    elapsed_seconds: float = 0.0
+
+
+class OptimizationLoop:
+    """Runs iterative prompt optimization for a pipeline stage.
+
+    Each iteration generates *variants_per_iter* prompt mutations,
+    scores each against reference fixture data, and keeps the
+    highest-composite-scoring variant as the new baseline.
+
+    Parameters
+    ----------
+    client:
+        LLMClient instance for LLM calls (synthesis + scoring + variant gen).
+    stage:
+        Pipeline stage number (currently only 5 is supported).
+    fixture_path:
+        Path to a JSON fixture file containing ``creator_name`` and ``moments``.
+    iterations:
+        Number of generate→score→select cycles.
+    variants_per_iter:
+        Number of variant prompts to generate per iteration.
+    """
+
+    def __init__(
+        self,
+        client: LLMClient,
+        stage: int,
+        fixture_path: str,
+        iterations: int = 5,
+        variants_per_iter: int = 2,
+    ) -> None:
+        self.client = client
+        self.stage = stage
+        self.fixture_path = fixture_path
+        self.iterations = iterations
+        self.variants_per_iter = variants_per_iter
+
+        self.scorer = ScoreRunner(client)
+        self.generator = PromptVariantGenerator(client)
+
+    def run(self) -> OptimizationResult:
+        """Execute the full optimization loop.
+
+        Returns
+        -------
+        OptimizationResult
+            Contains the best prompt, its scores, full iteration history,
+            and wall-clock elapsed time.
+        """
+        from pipeline.stages import _load_prompt
+
+        t0 = time.monotonic()
+
+        # Load base prompt
+        prompt_file = f"stage{self.stage}_synthesis.txt"
+        try:
+            base_prompt = _load_prompt(prompt_file)
+        except FileNotFoundError:
+            logger.error("Prompt file not found: %s", prompt_file)
+            return OptimizationResult(
+                best_prompt="",
+                best_score=ScoreResult(error=f"Prompt file not found: {prompt_file}"),
+                elapsed_seconds=round(time.monotonic() - t0, 2),
+            )
+
+        # Load fixture data
+        try:
+            fixture = self._load_fixture()
+        except (FileNotFoundError, json.JSONDecodeError, KeyError) as exc:
+            logger.error("Failed to load fixture: %s", exc)
+            return OptimizationResult(
+                best_prompt=base_prompt,
+                best_score=ScoreResult(error=f"Fixture load error: {exc}"),
+                elapsed_seconds=round(time.monotonic() - t0, 2),
+            )
+
+        moments = fixture["moments"]
+        creator_name = fixture["creator_name"]
+        history: list[dict] = []
+
+        # Score the baseline
+        print(f"\n{'='*60}")
+        print(f"  PROMPT OPTIMIZATION — Stage {self.stage}")
+        print(f"  Iterations: {self.iterations}, Variants/iter: {self.variants_per_iter}")
+        print(f"{'='*60}\n")
+
+        print("  Scoring baseline prompt...")
+        best_score = self.scorer.synthesize_and_score(
+            moments=moments,
+            creator_name=creator_name,
+            voice_level=0.5,
+        )
+        best_prompt = base_prompt
+
+        history.append({
+            "iteration": 0,
+            "variant_index": 0,
+            "prompt_text": base_prompt[:200] + "..." if len(base_prompt) > 200 else base_prompt,
+            "prompt_length": len(base_prompt),
+            "composite": best_score.composite,
+            "scores": {d: getattr(best_score, d) for d in DIMENSIONS},
+            "error": best_score.error,
+            "label": "baseline",
+        })
+
+        if best_score.error:
+            print(f"  ✗ Baseline scoring failed: {best_score.error}")
+            print("  Aborting optimization — fix the baseline first.\n")
+            return OptimizationResult(
+                best_prompt=best_prompt,
+                best_score=best_score,
+                history=history,
+                elapsed_seconds=round(time.monotonic() - t0, 2),
+            )
+
+        self._print_iteration_summary(0, best_score, is_baseline=True)
+
+        # Iterate
+        for iteration in range(1, self.iterations + 1):
+            print(f"\n  ── Iteration {iteration}/{self.iterations} ──")
+
+            # Generate variants
+            variants = self.generator.generate(
+                base_prompt=best_prompt,
+                scores=best_score,
+                n=self.variants_per_iter,
+            )
+
+            if not variants:
+                print("  ⚠ No valid variants generated — skipping iteration")
+                continue
+
+            # Score each variant
+            iteration_best_score = best_score
+            iteration_best_prompt = best_prompt
+
+            for vi, variant_prompt in enumerate(variants):
+                print(f"  Scoring variant {vi + 1}/{len(variants)}...")
+
+                # Temporarily replace the base prompt with the variant for synthesis
+                score = self._score_variant(
+                    variant_prompt, moments, creator_name,
+                )
+
+                history.append({
+                    "iteration": iteration,
+                    "variant_index": vi + 1,
+                    "prompt_text": variant_prompt[:200] + "..." if len(variant_prompt) > 200 else variant_prompt,
+                    "prompt_length": len(variant_prompt),
+                    "composite": score.composite,
+                    "scores": {d: getattr(score, d) for d in DIMENSIONS},
+                    "error": score.error,
+                    "label": f"iter{iteration}_v{vi+1}",
+                })
+
+                if score.error:
+                    print(f"    ✗ Variant {vi + 1} errored: {score.error}")
+                    continue
+
+                if score.composite > iteration_best_score.composite:
+                    iteration_best_score = score
+                    iteration_best_prompt = variant_prompt
+                    print(f"    ✓ New best: {score.composite:.3f} (was {best_score.composite:.3f})")
+                else:
+                    print(f"    · Score {score.composite:.3f} ≤ current best {iteration_best_score.composite:.3f}")
+
+            # Update global best if this iteration improved
+            if iteration_best_score.composite > best_score.composite:
+                best_score = iteration_best_score
+                best_prompt = iteration_best_prompt
+                print(f"  ★ Iteration {iteration} improved: {best_score.composite:.3f}")
+            else:
+                print(f"  · No improvement in iteration {iteration}")
+
+            self._print_iteration_summary(iteration, best_score)
+
+        # Final report
+        elapsed = round(time.monotonic() - t0, 2)
+        self._print_final_report(best_score, history, elapsed)
+
+        return OptimizationResult(
+            best_prompt=best_prompt,
+            best_score=best_score,
+            history=history,
+            elapsed_seconds=elapsed,
+        )
+
+    # ── Internal helpers ──────────────────────────────────────────────────
+
+    def _load_fixture(self) -> dict:
+        """Load and validate the fixture JSON file."""
+        path = Path(self.fixture_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Fixture not found: {path}")
+        data = json.loads(path.read_text(encoding="utf-8"))
+
+        if "moments" not in data:
+            raise KeyError("Fixture must contain 'moments' key")
+        if "creator_name" not in data:
+            raise KeyError("Fixture must contain 'creator_name' key")
+
+        return data
+
+    def _score_variant(
+        self,
+        variant_prompt: str,
+        moments: list[dict],
+        creator_name: str,
+    ) -> ScoreResult:
+        """Score a variant prompt by running synthesis + scoring.
+
+        Uses the variant as a direct system prompt for synthesis, bypassing
+        VoiceDial (the optimization loop owns the full prompt text).
+        """
+        from pipeline.schemas import SynthesisResult
+        from pipeline.stages import _get_stage_config
+
+        import json as _json
+        import openai as _openai
+
+        model_override, modality = _get_stage_config(self.stage)
+
+        moments_json = _json.dumps(moments, indent=2)
+        user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_json}\n</moments>"
+
+        t0 = time.monotonic()
+        try:
+            raw = self.client.complete(
+                system_prompt=variant_prompt,
+                user_prompt=user_prompt,
+                response_model=SynthesisResult,
+                modality=modality,
+                model_override=model_override,
+            )
+            elapsed_synth = round(time.monotonic() - t0, 2)
+        except (_openai.APIConnectionError, _openai.APITimeoutError) as exc:
+            elapsed_synth = round(time.monotonic() - t0, 2)
+            return ScoreResult(
+                elapsed_seconds=elapsed_synth,
+                error=f"Synthesis LLM error: {exc}",
+            )
+        except Exception as exc:
+            elapsed_synth = round(time.monotonic() - t0, 2)
+            logger.exception("Unexpected error during variant synthesis")
+            return ScoreResult(
+                elapsed_seconds=elapsed_synth,
+                error=f"Unexpected synthesis error: {exc}",
+            )
+
+        # Parse synthesis
+        raw_text = str(raw).strip()
+        try:
+            synthesis = self.client.parse_response(raw_text, SynthesisResult)
+        except Exception as exc:
+            return ScoreResult(
+                elapsed_seconds=elapsed_synth,
+                error=f"Variant synthesis parse error: {exc}",
+            )
+
+        if not synthesis.pages:
+            return ScoreResult(
+                elapsed_seconds=elapsed_synth,
+                error="Variant synthesis returned no pages",
+            )
+
+        # Score the first page
+        page = synthesis.pages[0]
+        page_json = {
+            "title": page.title,
+            "creator_name": creator_name,
+            "summary": page.summary,
+            "body_sections": [
+                {"heading": heading, "content": content}
+                for heading, content in page.body_sections.items()
+            ],
+        }
+
+        result = self.scorer.score_page(page_json, moments)
+        result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2)
+        return result
+
+    def _print_iteration_summary(
+        self,
+        iteration: int,
+        score: ScoreResult,
+        is_baseline: bool = False,
+    ) -> None:
+        """Print a compact one-line summary of the current best scores."""
+        label = "BASELINE" if is_baseline else f"ITER {iteration}"
+        dims = "  ".join(
+            f"{d[:4]}={getattr(score, d):.2f}" for d in DIMENSIONS
+        )
+        print(f"  [{label}] composite={score.composite:.3f}  {dims}")
+
+    def _print_final_report(
+        self,
+        best_score: ScoreResult,
+        history: list[dict],
+        elapsed: float,
+    ) -> None:
+        """Print the final optimization summary."""
+        print(f"\n{'='*60}")
+        print("  OPTIMIZATION COMPLETE")
+        print(f"{'='*60}")
+        print(f"  Total time: {elapsed}s")
+        print(f"  Iterations: {self.iterations}")
+        print(f"  Variants scored: {len(history) - 1}")  # minus baseline
+
+        baseline_composite = history[0]["composite"] if history else 0.0
+        improvement = best_score.composite - baseline_composite
+
+        print(f"\n  Baseline composite: {baseline_composite:.3f}")
+        print(f"  Best composite:     {best_score.composite:.3f}")
+        if improvement > 0:
+            print(f"  Improvement:        +{improvement:.3f}")
+        else:
+            print(f"  Improvement:        {improvement:.3f} (no gain)")
+
+        print(f"\n  Per-dimension best scores:")
+        for d in DIMENSIONS:
+            val = getattr(best_score, d)
+            bar = "█" * int(val * 20) + "░" * (20 - int(val * 20))
+            print(f"    {d.replace('_', ' ').title():25s} {val:.2f}  {bar}")
+
+        errored = sum(1 for h in history if h.get("error"))
+        if errored:
+            print(f"\n  ⚠ {errored} variant(s) errored during scoring")
+
+        print(f"{'='*60}\n")
--- a/backend/pipeline/quality/variant_generator.py
+++ b/backend/pipeline/quality/variant_generator.py
@ -0,0 +1,194 @@
+"""LLM-powered prompt variant generator for automated optimization.
+
+Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
+analyzing per-dimension scores and producing targeted prompt mutations
+that improve the weakest scoring dimensions while preserving the JSON
+output format required by downstream parsing.
+"""
+from __future__ import annotations
+
+import logging
+
+from pipeline.llm_client import LLMClient
+from pipeline.quality.scorer import DIMENSIONS, ScoreResult
+
+logger = logging.getLogger(__name__)
+
+
+# ── Meta-prompt for variant generation ────────────────────────────────────────
+
+VARIANT_META_PROMPT = """\
+You are an expert prompt engineer specializing in LLM-powered content synthesis.
+
+Your task: given a synthesis prompt and its quality evaluation scores, produce an
+improved variant of the prompt that targets the weakest-scoring dimensions while
+maintaining or improving the others.
+
+## Scoring Dimensions (each 0.0–1.0)
+
+- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)
+- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values
+- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained
+- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction
+- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics
+
+## Rules
+
+1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
+2. Add specific, actionable instructions — not vague encouragements.
+3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
+   The prompt contains instructions about outputting a JSON object with a specific schema
+   (SynthesisResult with "pages" containing title, summary, body_sections, etc.).
+   Do NOT modify, remove, or rephrase any part of the JSON format instructions.
+   Your changes should target the prose synthesis guidelines only.
+4. Keep the overall prompt length within 2x of the original. Don't bloat it.
+5. Make substantive changes — rewording a sentence or adding one adjective is not enough.
+
+## Output
+
+Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble.
+Just the complete prompt that could be used directly as a system prompt.
+"""
+
+
+# Format markers that must survive variant generation — if any of these
+# are present in the base prompt, the variant must also contain them.
+_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
+
+
+class PromptVariantGenerator:
+    """Generates prompt variants by asking an LLM to act as a prompt engineer.
+
+    Given a base prompt and its evaluation scores, produces N mutated
+    variants targeting the weakest dimensions.
+    """
+
+    def __init__(self, client: LLMClient) -> None:
+        self.client = client
+
+    def generate(
+        self,
+        base_prompt: str,
+        scores: ScoreResult,
+        n: int = 2,
+    ) -> list[str]:
+        """Generate up to *n* valid prompt variants.
+
+        Each variant is produced by a separate LLM call with the meta-prompt.
+        Variants are validated: they must differ from the base by ≥50 characters
+        and must contain the JSON format instruction markers found in the base.
+
+        Invalid variants are logged and skipped.
+
+        Parameters
+        ----------
+        base_prompt:
+            The current best synthesis prompt text.
+        scores:
+            ScoreResult from the most recent evaluation of *base_prompt*.
+        n:
+            Number of variants to attempt generating.
+
+        Returns
+        -------
+        list[str]
+            Valid variant prompt strings (may be fewer than *n*).
+        """
+        user_prompt = self._build_user_prompt(base_prompt, scores)
+        # Identify which format markers are actually present in the base
+        required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt]
+
+        variants: list[str] = []
+        for i in range(n):
+            logger.info("Generating variant %d/%d...", i + 1, n)
+            try:
+                raw = self.client.complete(
+                    system_prompt=VARIANT_META_PROMPT,
+                    user_prompt=user_prompt,
+                    response_model=None,  # free-form text, not JSON
+                    modality="chat",
+                )
+                variant = str(raw).strip()
+            except Exception:
+                logger.exception("LLM error generating variant %d/%d", i + 1, n)
+                continue
+
+            # Validate the variant
+            if not self._validate(variant, base_prompt, required_markers, i + 1):
+                continue
+
+            variants.append(variant)
+            logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant))
+
+        logger.info(
+            "Generated %d valid variant(s) out of %d attempts", len(variants), n
+        )
+        return variants
+
+    # ── Internal helpers ──────────────────────────────────────────────────
+
+    def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str:
+        """Build the user message describing the current prompt and its scores."""
+        # Build per-dimension score lines, sorted worst-first
+        dim_lines: list[str] = []
+        dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS]
+        dim_scores.sort(key=lambda x: x[1])
+
+        for dim, val in dim_scores:
+            justification = scores.justifications.get(dim, "")
+            label = dim.replace("_", " ").title()
+            line = f"  {label}: {val:.2f}"
+            if justification:
+                line += f"  — {justification}"
+            dim_lines.append(line)
+
+        weakest = dim_scores[0][0].replace("_", " ").title()
+        second_weakest = dim_scores[1][0].replace("_", " ").title() if len(dim_scores) > 1 else weakest
+
+        return (
+            f"## Current Prompt\n\n{base_prompt}\n\n"
+            f"## Evaluation Scores (sorted weakest → strongest)\n\n"
+            + "\n".join(dim_lines)
+            + f"\n\n  Composite: {scores.composite:.3f}\n\n"
+            f"## Priority\n\n"
+            f"The weakest dimensions are **{weakest}** and **{second_weakest}**. "
+            f"Focus your prompt modifications on improving these.\n\n"
+            f"Return the full modified prompt now."
+        )
+
+    def _validate(
+        self,
+        variant: str,
+        base_prompt: str,
+        required_markers: list[str],
+        index: int,
+    ) -> bool:
+        """Check a variant meets minimum quality gates."""
+        if not variant:
+            logger.warning("Variant %d is empty — skipping", index)
+            return False
+
+        # Must differ meaningfully from base
+        diff = abs(len(variant) - len(base_prompt))
+        # Also check actual content difference via set-symmetric-difference of lines
+        base_lines = set(base_prompt.splitlines())
+        variant_lines = set(variant.splitlines())
+        changed_lines = len(base_lines.symmetric_difference(variant_lines))
+
+        if diff < 50 and changed_lines < 3:
+            logger.warning(
+                "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping",
+                index, diff, changed_lines,
+            )
+            return False
+
+        # Must preserve format markers
+        missing = [m for m in required_markers if m not in variant]
+        if missing:
+            logger.warning(
+                "Variant %d missing format markers %s — skipping",
+                index, missing,
+            )
+            return False
+
+        return True
--- a/1
+++ b/1
@ -0,0 +1 @@
+backend/pipeline