backend/pipeline/quality/variant_generator.py (companion to backend/pipeline/quality/scorer.py) — GSD-Task: S04/T01
Python — 247 lines, 10 KiB
"""LLM-powered prompt variant generator for automated optimization.

Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
analyzing per-dimension scores and producing targeted prompt mutations
that improve the weakest scoring dimensions while preserving the JSON
output format required by downstream parsing.

Supports any pipeline stage (2-5) — callers pass the stage's dimensions
and format markers so the meta-prompt and validation adapt automatically.
"""
from __future__ import annotations

import logging
from typing import Sequence

from pipeline.llm_client import LLMClient
from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult

# Module-level logger; all generation/validation outcomes are logged here.
logger = logging.getLogger(__name__)
# ── Meta-prompt for variant generation ────────────────────────────────────────

# System prompt template for the variant-generation LLM call. The single
# {dimension_descriptions} slot is filled from _DIMENSION_DESCRIPTIONS for
# the target stage before use (see PromptVariantGenerator.generate).
VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content processing pipelines.

Your task: given a pipeline stage prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others.

## Scoring Dimensions (each 0.0–1.0)

{dimension_descriptions}

## Rules

1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions — not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
   The prompt contains instructions about outputting a JSON object with a specific schema.
   Do NOT modify, remove, or rephrase any part of the JSON format instructions.
   Your changes should target the processing/analysis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes — rewording a sentence or adding one adjective is not enough.

## Output

Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble.
Just the complete prompt that could be used directly as a system prompt.
"""
# Dimension descriptions per stage, used to fill the meta-prompt template.
# Keys are pipeline stage numbers (2-5); values are markdown bullet lists
# injected into VARIANT_META_PROMPT's {dimension_descriptions} slot. The
# dimension identifiers mirror the scoring dimensions in scorer.STAGE_CONFIGS
# — presumably they must stay in sync with that module; verify when editing.
_DIMENSION_DESCRIPTIONS: dict[int, str] = {
    # Stage 2: topic segmentation
    2: (
        "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n"
        "- **topic_specificity** — Topic labels are descriptive and useful, not generic\n"
        "- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n"
        "- **summary_quality** — Summaries accurately describe segment content"
    ),
    # Stage 3: moment extraction
    3: (
        "- **moment_richness** — Extracted moments capture substantial, distinct insights\n"
        "- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n"
        "- **content_type_correctness** — Content types match the actual moment content\n"
        "- **summary_actionability** — Summaries provide actionable, specific information\n"
        "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized"
    ),
    # Stage 4: classification/tagging
    4: (
        "- **category_accuracy** — Topic categories are appropriate and meaningful\n"
        "- **tag_completeness** — All relevant tags are captured\n"
        "- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n"
        "- **coverage** — All moments are classified"
    ),
    # Stage 5: article synthesis (default stage for backward compat)
    5: (
        "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n"
        "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n"
        "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n"
        "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n"
        "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics"
    ),
}
# Legacy default format markers for stage 5 — used only when the caller
# supplies no markers AND the stage is absent from STAGE_CONFIGS.
_FORMAT_MARKERS: list[str] = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
class PromptVariantGenerator:
    """Generates prompt variants by asking an LLM to act as a prompt engineer.

    Given a base prompt and its evaluation scores, produces up to N mutated
    variants targeting the weakest-scoring dimensions. Variants that fail
    validation, or that exactly duplicate an already-accepted variant, are
    dropped — so fewer than N may be returned.
    """

    def __init__(self, client: LLMClient) -> None:
        # Client used for every variant-generation completion call.
        self.client = client

    def generate(
        self,
        base_prompt: str,
        scores: ScoreResult,
        n: int = 2,
        *,
        format_markers: Sequence[str] | None = None,
        stage: int = 5,
    ) -> list[str]:
        """Generate up to *n* valid prompt variants.

        Each variant is produced by a separate LLM call with the meta-prompt.
        Variants are validated: they must differ from the base by ≥50 characters
        (or by ≥3 changed lines) and must contain the JSON format instruction
        markers found in the base. Exact duplicates of an already-accepted
        variant are also skipped — deterministic or cached LLM backends can
        return identical text on successive attempts, and duplicates add no
        search diversity.

        Invalid variants are logged and skipped.

        Parameters
        ----------
        base_prompt:
            The current best prompt text for the target stage.
        scores:
            ScoreResult from the most recent evaluation of *base_prompt*.
        n:
            Number of variants to attempt generating.
        format_markers:
            Override format markers for validation. When *None*, uses the
            markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5
            defaults for backward compat).
        stage:
            Pipeline stage number (2-5), used to select dimension
            descriptions for the meta-prompt and default format markers.

        Returns
        -------
        list[str]
            Valid variant prompt strings (may be fewer than *n*).
        """
        # Resolve format markers for the target stage. Always copy so the
        # shared STAGE_CONFIGS / module-level lists are never aliased.
        if format_markers is not None:
            markers = list(format_markers)
        elif stage in STAGE_CONFIGS:
            markers = list(STAGE_CONFIGS[stage].format_markers)
        else:
            markers = list(_FORMAT_MARKERS)

        dimensions = (
            STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS
        )

        # Build the system prompt with stage-appropriate dimension descriptions
        dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5])
        system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc)

        user_prompt = self._build_user_prompt(base_prompt, scores, dimensions)
        # Only enforce markers that actually appear in the base prompt — a
        # marker absent from the base cannot be "preserved" by a variant.
        required_markers = [m for m in markers if m in base_prompt]

        variants: list[str] = []
        for i in range(n):
            logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage)
            try:
                raw = self.client.complete(
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                    response_model=None,  # free-form text, not JSON
                    modality="chat",
                )
                variant = str(raw).strip()
            except Exception:
                # Best-effort: one failed call must not abort the batch.
                logger.exception("LLM error generating variant %d/%d", i + 1, n)
                continue

            # Validate the variant
            if not self._validate(variant, base_prompt, required_markers, i + 1):
                continue

            # Fix: drop exact duplicates of an already-accepted variant —
            # they would waste downstream evaluation budget.
            if variant in variants:
                logger.warning(
                    "Variant %d/%d duplicates an earlier variant — skipping", i + 1, n
                )
                continue

            variants.append(variant)
            logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant))

        logger.info(
            "Generated %d valid variant(s) out of %d attempts", len(variants), n
        )
        return variants

    # ── Internal helpers ──────────────────────────────────────────────────

    def _build_user_prompt(
        self,
        base_prompt: str,
        scores: ScoreResult,
        dimensions: Sequence[str] | None = None,
    ) -> str:
        """Build the user message describing the current prompt and its scores.

        Dimensions are listed worst-first so the model sees the weakest ones
        immediately; dimensions missing from ``scores.scores`` default to 0.0
        so they surface at the top instead of being silently dropped.
        """
        dims = list(dimensions) if dimensions else list(DIMENSIONS)
        # Build per-dimension score lines, sorted worst-first
        dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims]
        dim_scores.sort(key=lambda item: item[1])

        dim_lines: list[str] = []
        for dim, val in dim_scores:
            justification = scores.justifications.get(dim, "")
            label = dim.replace("_", " ").title()
            line = f"  {label}: {val:.2f}"
            if justification:
                line += f" — {justification}"
            dim_lines.append(line)

        # Fix: guard against an empty dimension list, which previously raised
        # IndexError on dim_scores[0].
        if dim_scores:
            weakest = dim_scores[0][0].replace("_", " ").title()
            second_weakest = (
                dim_scores[1][0].replace("_", " ").title()
                if len(dim_scores) > 1
                else weakest
            )
        else:
            weakest = second_weakest = "N/A"

        return (
            f"## Current Prompt\n\n{base_prompt}\n\n"
            f"## Evaluation Scores (sorted weakest → strongest)\n\n"
            + "\n".join(dim_lines)
            + f"\n\n  Composite: {scores.composite:.3f}\n\n"
            f"## Priority\n\n"
            f"The weakest dimensions are **{weakest}** and **{second_weakest}**. "
            f"Focus your prompt modifications on improving these.\n\n"
            f"Return the full modified prompt now."
        )

    def _validate(
        self,
        variant: str,
        base_prompt: str,
        required_markers: list[str],
        index: int,
    ) -> bool:
        """Check a variant meets minimum quality gates.

        Rejects variants that are empty, too similar to the base (length
        delta < 50 chars AND fewer than 3 changed lines), or missing any
        required JSON format marker. Rejections are logged with the
        1-based *index* for traceability.
        """
        if not variant:
            logger.warning("Variant %d is empty — skipping", index)
            return False

        # Must differ meaningfully from base: either a large length delta
        # or enough changed lines (set-symmetric-difference of line sets).
        diff = abs(len(variant) - len(base_prompt))
        base_lines = set(base_prompt.splitlines())
        variant_lines = set(variant.splitlines())
        changed_lines = len(base_lines.symmetric_difference(variant_lines))

        if diff < 50 and changed_lines < 3:
            logger.warning(
                "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping",
                index, diff, changed_lines,
            )
            return False

        # Must preserve the JSON format instructions verbatim enough that
        # every required marker string still appears.
        missing = [m for m in required_markers if m not in variant]
        if missing:
            logger.warning(
                "Variant %d missing format markers %s — skipping",
                index, missing,
            )
            return False

        return True