- "backend/pipeline/quality/variant_generator.py" - "backend/pipeline/quality/optimizer.py" GSD-Task: S03/T01
194 lines
7.7 KiB
Python
194 lines
7.7 KiB
Python
"""LLM-powered prompt variant generator for automated optimization.
|
||
|
||
Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
|
||
analyzing per-dimension scores and producing targeted prompt mutations
|
||
that improve the weakest scoring dimensions while preserving the JSON
|
||
output format required by downstream parsing.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
|
||
from pipeline.llm_client import LLMClient
|
||
from pipeline.quality.scorer import DIMENSIONS, ScoreResult
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# ── Meta-prompt for variant generation ────────────────────────────────────────

# System prompt sent verbatim on every variant-generation call.  The paired
# user message (built by PromptVariantGenerator._build_user_prompt) carries the
# current prompt text and its per-dimension evaluation scores.
VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content synthesis.

Your task: given a synthesis prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others.

## Scoring Dimensions (each 0.0–1.0)

- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)
- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values
- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained
- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction
- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics

## Rules

1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions — not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
   The prompt contains instructions about outputting a JSON object with a specific schema
   (SynthesisResult with "pages" containing title, summary, body_sections, etc.).
   Do NOT modify, remove, or rephrase any part of the JSON format instructions.
   Your changes should target the prose synthesis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes — rewording a sentence or adding one adjective is not enough.

## Output

Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble.
Just the complete prompt that could be used directly as a system prompt.
"""


# Format markers that must survive variant generation — if any of these
# are present in the base prompt, the variant must also contain them.
# Enforced by PromptVariantGenerator._validate on every generated variant.
_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
class PromptVariantGenerator:
    """Generates prompt variants by asking an LLM to act as a prompt engineer.

    Given a base prompt and its evaluation scores, produces N mutated
    variants targeting the weakest dimensions.
    """

    def __init__(self, client: LLMClient) -> None:
        # One LLM call is issued per requested variant; the client is reused.
        self.client = client

    def generate(
        self,
        base_prompt: str,
        scores: ScoreResult,
        n: int = 2,
    ) -> list[str]:
        """Generate up to *n* valid, distinct prompt variants.

        Each variant is produced by a separate LLM call with the meta-prompt.
        Variants are validated by :meth:`_validate`: they must differ
        meaningfully from the base (length delta ≥ 50 characters OR ≥ 3
        changed lines) and must contain the JSON format instruction markers
        found in the base.

        Invalid or duplicate variants are logged and skipped; an LLM error on
        one attempt does not abort the remaining attempts.

        Parameters
        ----------
        base_prompt:
            The current best synthesis prompt text.
        scores:
            ScoreResult from the most recent evaluation of *base_prompt*.
        n:
            Number of variants to attempt generating.

        Returns
        -------
        list[str]
            Valid, de-duplicated variant prompt strings (may be fewer than *n*).
        """
        user_prompt = self._build_user_prompt(base_prompt, scores)
        # Only require markers actually present in the base — a base prompt
        # without JSON instructions imposes no marker constraint.
        required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt]

        variants: list[str] = []
        for i in range(n):
            attempt = i + 1
            logger.info("Generating variant %d/%d...", attempt, n)
            try:
                raw = self.client.complete(
                    system_prompt=VARIANT_META_PROMPT,
                    user_prompt=user_prompt,
                    response_model=None,  # free-form text, not JSON
                    modality="chat",
                )
                variant = str(raw).strip()
            except Exception:
                # Best-effort: one failed call costs one attempt, not the batch.
                logger.exception("LLM error generating variant %d/%d", attempt, n)
                continue

            # Validate the variant against the similarity and marker gates.
            if not self._validate(variant, base_prompt, required_markers, attempt):
                continue

            # Fix: skip exact duplicates of an already-accepted variant —
            # evaluating the same prompt twice wastes downstream eval budget.
            if variant in variants:
                logger.warning(
                    "Variant %d/%d duplicates an earlier variant — skipping",
                    attempt, n,
                )
                continue

            variants.append(variant)
            logger.info("Variant %d/%d accepted (%d chars)", attempt, n, len(variant))

        logger.info(
            "Generated %d valid variant(s) out of %d attempts", len(variants), n
        )
        return variants

    # ── Internal helpers ──────────────────────────────────────────────────

    def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str:
        """Build the user message describing the current prompt and its scores."""
        # Build per-dimension score lines, sorted worst-first so the LLM sees
        # the weakest dimensions at the top of the list.
        dim_lines: list[str] = []
        # getattr default 0.0: a dimension missing from scores sorts as weakest.
        dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS]
        dim_scores.sort(key=lambda x: x[1])

        for dim, val in dim_scores:
            justification = scores.justifications.get(dim, "")
            label = dim.replace("_", " ").title()
            line = f" {label}: {val:.2f}"
            if justification:
                line += f" — {justification}"
            dim_lines.append(line)

        # Weakest two dimensions are called out explicitly in the Priority
        # section; with a single dimension, both slots name the same one.
        weakest = dim_scores[0][0].replace("_", " ").title()
        second_weakest = dim_scores[1][0].replace("_", " ").title() if len(dim_scores) > 1 else weakest

        return (
            f"## Current Prompt\n\n{base_prompt}\n\n"
            f"## Evaluation Scores (sorted weakest → strongest)\n\n"
            + "\n".join(dim_lines)
            + f"\n\n Composite: {scores.composite:.3f}\n\n"
            f"## Priority\n\n"
            f"The weakest dimensions are **{weakest}** and **{second_weakest}**. "
            f"Focus your prompt modifications on improving these.\n\n"
            f"Return the full modified prompt now."
        )

    def _validate(
        self,
        variant: str,
        base_prompt: str,
        required_markers: list[str],
        index: int,
    ) -> bool:
        """Check a variant meets minimum quality gates.

        A variant passes when it is non-empty, differs meaningfully from
        *base_prompt* (length delta ≥ 50 chars OR ≥ 3 changed lines), and
        contains every marker in *required_markers*.  Rejections are logged
        at WARNING with the attempt *index* for traceability.
        """
        if not variant:
            logger.warning("Variant %d is empty — skipping", index)
            return False

        # Must differ meaningfully from base.  Length delta alone is not
        # sufficient (a rewrite can keep the same length), so also measure
        # content change via set-symmetric-difference of lines.
        diff = abs(len(variant) - len(base_prompt))
        base_lines = set(base_prompt.splitlines())
        variant_lines = set(variant.splitlines())
        changed_lines = len(base_lines.symmetric_difference(variant_lines))

        if diff < 50 and changed_lines < 3:
            logger.warning(
                "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping",
                index, diff, changed_lines,
            )
            return False

        # Must preserve the JSON output format markers found in the base.
        missing = [m for m in required_markers if m not in variant]
        if missing:
            logger.warning(
                "Variant %d missing format markers %s — skipping",
                index, missing,
            )
            return False

        return True