backend/pipeline/quality/variant_generator.py (companion to backend/pipeline/quality/scorer.py) — GSD-Task: S04/T01
Python — 247 lines, 10 KiB
"""LLM-powered prompt variant generator for automated optimization.

Uses a meta-prompt to instruct the LLM to act as a prompt engineer,
analyzing per-dimension scores and producing targeted prompt mutations
that improve the weakest scoring dimensions while preserving the JSON
output format required by downstream parsing.

Supports any pipeline stage (2-5) — callers pass the stage's dimensions
and format markers so the meta-prompt and validation adapt automatically.
"""
from __future__ import annotations

import logging
from typing import Sequence

from pipeline.llm_client import LLMClient
from pipeline.quality.scorer import DIMENSIONS, STAGE_CONFIGS, ScoreResult

# Module-level logger; all generation/validation outcomes are logged here.
logger = logging.getLogger(__name__)
# ── Meta-prompt for variant generation ────────────────────────────────────────

# System prompt template for the variant-generation LLM call. The single
# {dimension_descriptions} slot is filled from _DIMENSION_DESCRIPTIONS for
# the target stage before use (see PromptVariantGenerator.generate).
VARIANT_META_PROMPT = """\
You are an expert prompt engineer specializing in LLM-powered content processing pipelines.

Your task: given a pipeline stage prompt and its quality evaluation scores, produce an
improved variant of the prompt that targets the weakest-scoring dimensions while
maintaining or improving the others.

## Scoring Dimensions (each 0.0–1.0)

{dimension_descriptions}

## Rules

1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything.
2. Add specific, actionable instructions — not vague encouragements.
3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.**
   The prompt contains instructions about outputting a JSON object with a specific schema.
   Do NOT modify, remove, or rephrase any part of the JSON format instructions.
   Your changes should target the processing/analysis guidelines only.
4. Keep the overall prompt length within 2x of the original. Don't bloat it.
5. Make substantive changes — rewording a sentence or adding one adjective is not enough.

## Output

Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble.
Just the complete prompt that could be used directly as a system prompt.
"""
# Dimension descriptions per stage, used to fill the meta-prompt template.
# Keys are pipeline stage numbers (2-5); values are markdown bullet lists
# injected into VARIANT_META_PROMPT's {dimension_descriptions} slot. The
# dimension identifiers mirror the scoring dimensions in scorer.STAGE_CONFIGS
# — presumably they must stay in sync with that module; verify when editing.
_DIMENSION_DESCRIPTIONS: dict[int, str] = {
    # Stage 2: topic segmentation
    2: (
        "- **coverage_completeness** — All transcript content accounted for, no gaps or overlaps\n"
        "- **topic_specificity** — Topic labels are descriptive and useful, not generic\n"
        "- **boundary_accuracy** — Segment boundaries align with actual topic transitions\n"
        "- **summary_quality** — Summaries accurately describe segment content"
    ),
    # Stage 3: moment extraction
    3: (
        "- **moment_richness** — Extracted moments capture substantial, distinct insights\n"
        "- **timestamp_accuracy** — Time ranges are plausible and well-bounded\n"
        "- **content_type_correctness** — Content types match the actual moment content\n"
        "- **summary_actionability** — Summaries provide actionable, specific information\n"
        "- **plugin_normalization** — Plugin/tool names are correctly identified and normalized"
    ),
    # Stage 4: classification/tagging
    4: (
        "- **category_accuracy** — Topic categories are appropriate and meaningful\n"
        "- **tag_completeness** — All relevant tags are captured\n"
        "- **tag_specificity** — Tags are specific enough to be useful for search/filtering\n"
        "- **coverage** — All moments are classified"
    ),
    # Stage 5: article synthesis (default stage for backward compat)
    5: (
        "- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section)\n"
        "- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values\n"
        "- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained\n"
        "- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction\n"
        "- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics"
    ),
}
# Legacy default format markers for stage 5 — used only when the caller
# supplies no markers AND the stage is absent from STAGE_CONFIGS.
_FORMAT_MARKERS: list[str] = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"]
class PromptVariantGenerator:
    """Generates prompt variants by asking an LLM to act as a prompt engineer.

    Given a base prompt and its evaluation scores, produces up to N mutated
    variants targeting the weakest-scoring dimensions. Variants that fail
    validation, or that exactly duplicate an already-accepted variant, are
    dropped — so fewer than N may be returned.
    """

    def __init__(self, client: LLMClient) -> None:
        # Client used for every variant-generation completion call.
        self.client = client

    def generate(
        self,
        base_prompt: str,
        scores: ScoreResult,
        n: int = 2,
        *,
        format_markers: Sequence[str] | None = None,
        stage: int = 5,
    ) -> list[str]:
        """Generate up to *n* valid prompt variants.

        Each variant is produced by a separate LLM call with the meta-prompt.
        Variants are validated: they must differ from the base by ≥50 characters
        (or by ≥3 changed lines) and must contain the JSON format instruction
        markers found in the base. Exact duplicates of an already-accepted
        variant are also skipped — deterministic or cached LLM backends can
        return identical text on successive attempts, and duplicates add no
        search diversity.

        Invalid variants are logged and skipped.

        Parameters
        ----------
        base_prompt:
            The current best prompt text for the target stage.
        scores:
            ScoreResult from the most recent evaluation of *base_prompt*.
        n:
            Number of variants to attempt generating.
        format_markers:
            Override format markers for validation. When *None*, uses the
            markers from ``STAGE_CONFIGS[stage]`` (falling back to stage 5
            defaults for backward compat).
        stage:
            Pipeline stage number (2-5), used to select dimension
            descriptions for the meta-prompt and default format markers.

        Returns
        -------
        list[str]
            Valid variant prompt strings (may be fewer than *n*).
        """
        # Resolve format markers for the target stage. Always copy so the
        # shared STAGE_CONFIGS / module-level lists are never aliased.
        if format_markers is not None:
            markers = list(format_markers)
        elif stage in STAGE_CONFIGS:
            markers = list(STAGE_CONFIGS[stage].format_markers)
        else:
            markers = list(_FORMAT_MARKERS)

        dimensions = (
            STAGE_CONFIGS[stage].dimensions if stage in STAGE_CONFIGS else DIMENSIONS
        )

        # Build the system prompt with stage-appropriate dimension descriptions
        dim_desc = _DIMENSION_DESCRIPTIONS.get(stage, _DIMENSION_DESCRIPTIONS[5])
        system_prompt = VARIANT_META_PROMPT.format(dimension_descriptions=dim_desc)

        user_prompt = self._build_user_prompt(base_prompt, scores, dimensions)
        # Only enforce markers that actually appear in the base prompt — a
        # marker absent from the base cannot be "preserved" by a variant.
        required_markers = [m for m in markers if m in base_prompt]

        variants: list[str] = []
        for i in range(n):
            logger.info("Generating variant %d/%d (stage %d)...", i + 1, n, stage)
            try:
                raw = self.client.complete(
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                    response_model=None,  # free-form text, not JSON
                    modality="chat",
                )
                variant = str(raw).strip()
            except Exception:
                # Best-effort: one failed call must not abort the batch.
                logger.exception("LLM error generating variant %d/%d", i + 1, n)
                continue

            # Validate the variant
            if not self._validate(variant, base_prompt, required_markers, i + 1):
                continue

            # Fix: drop exact duplicates of an already-accepted variant —
            # they would waste downstream evaluation budget.
            if variant in variants:
                logger.warning(
                    "Variant %d/%d duplicates an earlier variant — skipping", i + 1, n
                )
                continue

            variants.append(variant)
            logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant))

        logger.info(
            "Generated %d valid variant(s) out of %d attempts", len(variants), n
        )
        return variants

    # ── Internal helpers ──────────────────────────────────────────────────

    def _build_user_prompt(
        self,
        base_prompt: str,
        scores: ScoreResult,
        dimensions: Sequence[str] | None = None,
    ) -> str:
        """Build the user message describing the current prompt and its scores.

        Dimensions are listed worst-first so the model sees the weakest ones
        immediately; dimensions missing from ``scores.scores`` default to 0.0
        so they surface at the top instead of being silently dropped.
        """
        dims = list(dimensions) if dimensions else list(DIMENSIONS)
        # Build per-dimension score lines, sorted worst-first
        dim_scores = [(d, scores.scores.get(d, 0.0)) for d in dims]
        dim_scores.sort(key=lambda item: item[1])

        dim_lines: list[str] = []
        for dim, val in dim_scores:
            justification = scores.justifications.get(dim, "")
            label = dim.replace("_", " ").title()
            line = f"  {label}: {val:.2f}"
            if justification:
                line += f" — {justification}"
            dim_lines.append(line)

        # Fix: guard against an empty dimension list, which previously raised
        # IndexError on dim_scores[0].
        if dim_scores:
            weakest = dim_scores[0][0].replace("_", " ").title()
            second_weakest = (
                dim_scores[1][0].replace("_", " ").title()
                if len(dim_scores) > 1
                else weakest
            )
        else:
            weakest = second_weakest = "N/A"

        return (
            f"## Current Prompt\n\n{base_prompt}\n\n"
            f"## Evaluation Scores (sorted weakest → strongest)\n\n"
            + "\n".join(dim_lines)
            + f"\n\n  Composite: {scores.composite:.3f}\n\n"
            f"## Priority\n\n"
            f"The weakest dimensions are **{weakest}** and **{second_weakest}**. "
            f"Focus your prompt modifications on improving these.\n\n"
            f"Return the full modified prompt now."
        )

    def _validate(
        self,
        variant: str,
        base_prompt: str,
        required_markers: list[str],
        index: int,
    ) -> bool:
        """Check a variant meets minimum quality gates.

        Rejects variants that are empty, too similar to the base (length
        delta < 50 chars AND fewer than 3 changed lines), or missing any
        required JSON format marker. Rejections are logged with the
        1-based *index* for traceability.
        """
        if not variant:
            logger.warning("Variant %d is empty — skipping", index)
            return False

        # Must differ meaningfully from base: either a large length delta
        # or enough changed lines (set-symmetric-difference of line sets).
        diff = abs(len(variant) - len(base_prompt))
        base_lines = set(base_prompt.splitlines())
        variant_lines = set(variant.splitlines())
        changed_lines = len(base_lines.symmetric_difference(variant_lines))

        if diff < 50 and changed_lines < 3:
            logger.warning(
                "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping",
                index, diff, changed_lines,
            )
            return False

        # Must preserve the JSON format instructions verbatim enough that
        # every required marker string still appears.
        missing = [m for m in required_markers if m not in variant]
        if missing:
            logger.warning(
                "Variant %d missing format markers %s — skipping",
                index, missing,
            )
            return False

        return True