From c6cbb09dd3beb7defbb27e8877549131eb5f91c5 Mon Sep 17 00:00:00 2001 From: jlightner Date: Wed, 1 Apr 2026 09:08:01 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Created=20PromptVariantGenerator=20(LLM?= =?UTF-8?q?-powered=20prompt=20mutation)=20and=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/pipeline/quality/variant_generator.py" - "backend/pipeline/quality/optimizer.py" GSD-Task: S03/T01 --- .gsd/KNOWLEDGE.md | 6 + .gsd/milestones/M013/M013-ROADMAP.md | 2 +- .../milestones/M013/slices/S02/S02-SUMMARY.md | 101 +++++ .gsd/milestones/M013/slices/S02/S02-UAT.md | 110 ++++++ .../M013/slices/S02/tasks/T02-VERIFY.json | 24 ++ .gsd/milestones/M013/slices/S03/S03-PLAN.md | 83 +++- .../M013/slices/S03/S03-RESEARCH.md | 128 ++++++ .../M013/slices/S03/tasks/T01-PLAN.md | 60 +++ .../M013/slices/S03/tasks/T01-SUMMARY.md | 78 ++++ .../M013/slices/S03/tasks/T02-PLAN.md | 64 +++ backend/pipeline/quality/__init__.py | 11 + backend/pipeline/quality/optimizer.py | 364 ++++++++++++++++++ backend/pipeline/quality/variant_generator.py | 194 ++++++++++ pipeline | 1 + 14 files changed, 1224 insertions(+), 2 deletions(-) create mode 100644 .gsd/milestones/M013/slices/S02/S02-SUMMARY.md create mode 100644 .gsd/milestones/M013/slices/S02/S02-UAT.md create mode 100644 .gsd/milestones/M013/slices/S02/tasks/T02-VERIFY.json create mode 100644 .gsd/milestones/M013/slices/S03/S03-RESEARCH.md create mode 100644 .gsd/milestones/M013/slices/S03/tasks/T01-PLAN.md create mode 100644 .gsd/milestones/M013/slices/S03/tasks/T01-SUMMARY.md create mode 100644 .gsd/milestones/M013/slices/S03/tasks/T02-PLAN.md create mode 100644 backend/pipeline/quality/optimizer.py create mode 100644 backend/pipeline/quality/variant_generator.py create mode 120000 pipeline diff --git a/.gsd/KNOWLEDGE.md b/.gsd/KNOWLEDGE.md index 0d88342..a7b510c 100644 --- a/.gsd/KNOWLEDGE.md +++ b/.gsd/KNOWLEDGE.md @@ -222,3 +222,9 @@ **Context:** Single-pattern ILIKE search 
(e.g., `%keota snare%`) only matches when the exact phrase appears in one field. Users expect "keota snare" to find content where "keota" matches the creator and "snare" matches the title/tags. **Fix:** Tokenize by whitespace. For each token, generate OR conditions across all searchable fields (title, summary, tags, category, creator name). AND all per-token conditions. When a multi-token AND query returns zero results, fall back to partial_matches: query each token separately, score results by token coverage, return top 5 with a muted UI treatment. This gives exact matches priority while still providing useful results for imprecise queries. + +## Project-root symlink + sys.path bootstrap for nested Python packages + +**Context:** When a Python package lives under a subdirectory (e.g., `backend/pipeline/`), `python -m pipeline.quality` fails from the project root because `pipeline` isn't on `sys.path`. Task executors worked around this with `cd backend &&` prefix, but CI/verification gates may run from project root. + +**Fix:** Create a symlink at project root (`pipeline -> backend/pipeline`) so Python finds the package. Add a `sys.path` bootstrap in the package's `__init__.py` that uses `os.path.realpath(__file__)` to resolve through the symlink and insert the real parent directory (`backend/`) onto `sys.path`. This ensures sibling imports (e.g., `from config import ...`) resolve correctly. The `realpath()` call is critical — without it, the path resolves relative to the symlink location, not the real file location. 
diff --git a/.gsd/milestones/M013/M013-ROADMAP.md b/.gsd/milestones/M013/M013-ROADMAP.md index 29f314b..83bf8c5 100644 --- a/.gsd/milestones/M013/M013-ROADMAP.md +++ b/.gsd/milestones/M013/M013-ROADMAP.md @@ -7,6 +7,6 @@ A fully automated CLI tool that tests FYN-LLM fitness, scores pipeline output ac | ID | Slice | Risk | Depends | Done | After this | |----|-------|------|---------|------|------------| | S01 | General FYN-LLM Fitness Suite | medium | — | ✅ | Run `python -m pipeline.quality fitness` — outputs pass/fail for Mandelbrot question, JSON compliance, instruction following, and diverse prompt battery against live FYN-LLM | -| S02 | Stage 5 Quality Scorer & Voice Preservation Dial | high | S01 | ⬜ | Run scorer on a reference article — outputs composite score across 5 dimensions. Run same article at voice_level 0.2 vs 0.8 — voice preservation score differs meaningfully | +| S02 | Stage 5 Quality Scorer & Voice Preservation Dial | high | S01 | ✅ | Run scorer on a reference article — outputs composite score across 5 dimensions. 
Run same article at voice_level 0.2 vs 0.8 — voice preservation score differs meaningfully | | S03 | Prompt Variant Generator & Automated A/B Loop | high | S02 | ⬜ | Run `python -m pipeline.quality optimize --stage 5 --iterations 10` — generates prompt variants, scores each against reference articles, outputs leaderboard and score trajectory chart | | S04 | Expand to Pipeline Stages 2-4 | medium | S03 | ⬜ | Run `python -m pipeline.quality optimize --stage 3 --iterations 5` — optimizes extraction prompts with stage-appropriate scoring | diff --git a/.gsd/milestones/M013/slices/S02/S02-SUMMARY.md b/.gsd/milestones/M013/slices/S02/S02-SUMMARY.md new file mode 100644 index 0000000..3518d3c --- /dev/null +++ b/.gsd/milestones/M013/slices/S02/S02-SUMMARY.md @@ -0,0 +1,101 @@ +--- +id: S02 +parent: M013 +milestone: M013 +provides: + - ScoreRunner with score_page() and synthesize_and_score() methods + - VoiceDial class for 3-band prompt modification + - CLI score subcommand with --file, --slug, --voice-level + - 6-moment fixture file for offline testing +requires: + [] +affects: + - S03 +key_files: + - backend/pipeline/quality/scorer.py + - backend/pipeline/quality/voice_dial.py + - backend/pipeline/quality/__main__.py + - backend/pipeline/quality/__init__.py + - backend/pipeline/quality/fixtures/sample_moments.json + - pipeline +key_decisions: + - Hardcoded scoring rubric in scorer.py rather than external prompt file — faster iteration during quality toolkit development + - Three discrete voice bands (low/mid/high) at 0.33/0.67 boundaries instead of continuous interpolation — simpler to reason about and test + - Mid band returns base prompt unmodified since existing stage5 prompt already targets ~0.6 voice preservation + - Project-root symlink + __init__.py sys.path bootstrap to support running from any CWD +patterns_established: + - sys.path bootstrap in package __init__.py using os.path.realpath() to resolve through symlinks + - VoiceDial band-based prompt modification 
pattern reusable for other prompt dimension dials +observability_surfaces: + - Score report output with per-dimension scores and justification excerpts + - Band identification in voice-level output (band=low/mid/high) + - Clean connectivity error with endpoint URL on LLM failure +drill_down_paths: + - .gsd/milestones/M013/slices/S02/tasks/T01-SUMMARY.md + - .gsd/milestones/M013/slices/S02/tasks/T02-SUMMARY.md +duration: "" +verification_result: passed +completed_at: 2026-04-01T09:01:48.179Z +blocker_discovered: false +--- + +# S02: Stage 5 Quality Scorer & Voice Preservation Dial + +**Built 5-dimension LLM-as-judge scorer with CLI `score` subcommand and 3-band voice preservation dial that modifies Stage 5 synthesis prompts, runnable from both project root and backend/ directory.** + +## What Happened + +T01 created the scoring infrastructure: `ScoreRunner` with a `ScoreResult` dataclass covering 5 quality dimensions (structural, content_specificity, voice_preservation, readability, factual_fidelity), each scored 0.0–1.0 by an LLM judge via a hardcoded rubric prompt. The `score` subcommand was added to the existing `pipeline.quality` CLI with mutually exclusive `--file`/`--slug` inputs and an optional `--voice-level` parameter. A 6-moment fixture file (`sample_moments.json`) with realistic music production content provides offline testing data. + +T02 added the `VoiceDial` class implementing three discrete bands (low ≤0.33, mid 0.34–0.66, high ≥0.67) that modify the Stage 5 synthesis system prompt. Low band appends voice suppression instructions (neutral third-person, no direct quotes). Mid band passes the base prompt through unmodified (it already targets ~0.6 voice preservation). High band appends amplification instructions (maximize direct quotes, preserve personality). `ScoreRunner.synthesize_and_score()` chains re-synthesis through the modified prompt followed by scoring. 
+ +Post-task fix: The verification gate ran `python -m pipeline.quality score --help` from the project root, but the `pipeline` package lives under `backend/`. Added a `pipeline` symlink at project root pointing to `backend/pipeline`, and added a `sys.path` bootstrap in `backend/pipeline/quality/__init__.py` that resolves through the symlink to add `backend/` to `sys.path`. This ensures `config` and `pipeline.llm_client` imports resolve regardless of CWD. + +## Verification + +All slice verification checks pass from both project root and backend/ directory: + +1. `python -m pipeline.quality score --help` — shows --file, --slug, --voice-level args (exit 0) +2. `python -c "from pipeline.quality.scorer import ScoreRunner, ScoreResult; print('ok')"` — import succeeds (exit 0) +3. `python -c "from pipeline.quality.voice_dial import VoiceDial; vd = VoiceDial('base'); assert vd.modify(0.1) != vd.modify(0.5); assert vd.modify(0.5) != vd.modify(0.9)"` — three distinct bands confirmed (exit 0) +4. `python -m pipeline.quality score --file backend/pipeline/quality/fixtures/sample_moments.json` — clean connectivity error, exit 1, no traceback +5. `python -m pipeline.quality score --file backend/pipeline/quality/fixtures/sample_moments.json --voice-level 0.3` — re-synthesis path, clean connectivity error, exit 1, no traceback +6. Fixture validation: 6 moments with required fields present + +## Requirements Advanced + +None. + +## Requirements Validated + +None. + +## New Requirements Surfaced + +None. + +## Requirements Invalidated or Re-scoped + +None. + +## Deviations + +Added project-root `pipeline` symlink and `sys.path` bootstrap in `__init__.py` to support running `python -m pipeline.quality` from the project root (not just from `backend/`). This was not in the original plan but required by the verification gate. + +## Known Limitations + +`--slug` path prints "DB loading not yet implemented" and exits 1 (deferred by design). 
Voice-level path requires `prompts/stage5_synthesis.txt` to be reachable from CWD — works in Docker (WORKDIR=/app) and from `backend/`, but from project root the prompt file path resolves differently. + +## Follow-ups + +None. + +## Files Created/Modified + +- `backend/pipeline/quality/scorer.py` — New: ScoreResult dataclass + ScoreRunner with score_page() and synthesize_and_score() methods +- `backend/pipeline/quality/voice_dial.py` — New: VoiceDial class with 3-band prompt modification (low/mid/high) +- `backend/pipeline/quality/__main__.py` — Added score subcommand with --file, --slug, --voice-level args +- `backend/pipeline/quality/__init__.py` — Added sys.path bootstrap for project-root execution +- `backend/pipeline/quality/fixtures/sample_moments.json` — New: 6-moment fixture with realistic music production content +- `backend/pipeline/quality/fixtures/__init__.py` — New: empty package marker +- `pipeline` — New: symlink to backend/pipeline for project-root execution diff --git a/.gsd/milestones/M013/slices/S02/S02-UAT.md b/.gsd/milestones/M013/slices/S02/S02-UAT.md new file mode 100644 index 0000000..3334dbe --- /dev/null +++ b/.gsd/milestones/M013/slices/S02/S02-UAT.md @@ -0,0 +1,110 @@ +# S02: Stage 5 Quality Scorer & Voice Preservation Dial — UAT + +**Milestone:** M013 +**Written:** 2026-04-01T09:01:48.179Z + +## UAT: Stage 5 Quality Scorer & Voice Preservation Dial + +### Preconditions +- Python 3.12+ available +- Working directory: project root (`/home/aux/projects/content-to-kb-automator`) +- No live LLM endpoint required (tests verify error handling) + +### Test Case 1: CLI Help Output +**Steps:** +1. Run `python -m pipeline.quality score --help` + +**Expected:** +- Exit code 0 +- Output shows three arguments: `--file`, `--slug`, `--voice-level` +- `--file` and `--slug` shown as mutually exclusive (required group) +- `--voice-level` shown as optional with float type + +### Test Case 2: Score with File Input (No LLM) +**Steps:** +1. 
Run `python -m pipeline.quality score --file backend/pipeline/quality/fixtures/sample_moments.json` + +**Expected:** +- Exit code 1 (no LLM available) +- Output includes "Cannot reach LLM endpoint" with the endpoint URL +- No Python traceback in output +- Shows "Scoring page for 'KOAN Sound' (6 moments)..." + +### Test Case 3: Score with Voice Level (No LLM) +**Steps:** +1. Run `python -m pipeline.quality score --file backend/pipeline/quality/fixtures/sample_moments.json --voice-level 0.3` + +**Expected:** +- Exit code 1 (no LLM available) +- Output includes "Re-synthesizing + scoring" and "voice_level=0.3" +- Output includes "band=low" (0.3 falls in low band ≤0.33) +- No Python traceback + +### Test Case 4: Voice Dial Band Boundaries +**Steps:** +1. Run: +```python +from pipeline.quality.voice_dial import VoiceDial +vd = VoiceDial("base prompt text") +low = vd.modify(0.1) +mid = vd.modify(0.5) +high = vd.modify(0.9) +``` + +**Expected:** +- `low != mid` — low band appends voice suppression instructions +- `mid != high` — high band appends voice amplification instructions +- `mid == "base prompt text"` — mid band returns base prompt unmodified +- `low` contains words like "suppress", "neutral", or "third-person" +- `high` contains words like "quote", "direct", or "personality" + +### Test Case 5: Invalid Voice Level +**Steps:** +1. Run `python -m pipeline.quality score --file backend/pipeline/quality/fixtures/sample_moments.json --voice-level 1.5` + +**Expected:** +- Exit code 1 +- Error message: "--voice-level must be between 0.0 and 1.0" + +### Test Case 6: Missing Required Input +**Steps:** +1. Run `python -m pipeline.quality score` + +**Expected:** +- Exit code 2 (argparse error) +- Error message about requiring --file or --slug + +### Test Case 7: Slug Path (Deferred) +**Steps:** +1. 
Run `python -m pipeline.quality score --slug test-technique` + +**Expected:** +- Exit code 1 +- Output: "DB loading not yet implemented" + +### Test Case 8: Fixture File Validity +**Steps:** +1. Run: +```python +import json +with open("backend/pipeline/quality/fixtures/sample_moments.json") as f: + data = json.load(f) +``` + +**Expected:** +- `data["creator_name"]` is a non-empty string +- `data["moments"]` is a list with ≥5 entries +- Each moment has: `summary`, `transcript_excerpt`, `topic_tags`, `topic_category` +- At least one moment contains specific plugin/setting mentions in transcript_excerpt + +### Test Case 9: Import from Project Root +**Steps:** +1. From project root, run: +```python +from pipeline.quality.scorer import ScoreRunner, ScoreResult +from pipeline.quality.voice_dial import VoiceDial +``` + +**Expected:** +- Both imports succeed without error +- Works from project root (not just backend/) diff --git a/.gsd/milestones/M013/slices/S02/tasks/T02-VERIFY.json b/.gsd/milestones/M013/slices/S02/tasks/T02-VERIFY.json new file mode 100644 index 0000000..dc54fc9 --- /dev/null +++ b/.gsd/milestones/M013/slices/S02/tasks/T02-VERIFY.json @@ -0,0 +1,24 @@ +{ + "schemaVersion": 1, + "taskId": "T02", + "unitId": "M013/S02/T02", + "timestamp": 1775033827449, + "passed": false, + "discoverySource": "task-plan", + "checks": [ + { + "command": "cd backend", + "exitCode": 0, + "durationMs": 3, + "verdict": "pass" + }, + { + "command": "python -m pipeline.quality score --help", + "exitCode": 1, + "durationMs": 18, + "verdict": "fail" + } + ], + "retryAttempt": 1, + "maxRetries": 2 +} diff --git a/.gsd/milestones/M013/slices/S03/S03-PLAN.md b/.gsd/milestones/M013/slices/S03/S03-PLAN.md index b073232..16483c6 100644 --- a/.gsd/milestones/M013/slices/S03/S03-PLAN.md +++ b/.gsd/milestones/M013/slices/S03/S03-PLAN.md @@ -1,6 +1,87 @@ # S03: Prompt Variant Generator & Automated A/B Loop -**Goal:** Automated prompt optimization that runs unattended and identifies winning 
prompt variants +**Goal:** Automated prompt optimization loop that generates LLM-powered prompt variants for Stage 5 synthesis, scores each against reference articles, and outputs a ranked leaderboard with score trajectories. **Demo:** After this: Run `python -m pipeline.quality optimize --stage 5 --iterations 10` — generates prompt variants, scores each against reference articles, outputs leaderboard and score trajectory chart ## Tasks +- [x] **T01: Created PromptVariantGenerator (LLM-powered prompt mutation) and OptimizationLoop (iterative generate→score→select engine) with full error tolerance and progress reporting** — Create two new modules: `variant_generator.py` with an LLM-powered prompt mutation engine, and `optimizer.py` with an optimization loop that orchestrates generate→score→select cycles. + +## Context + +The existing `ScoreRunner.synthesize_and_score()` already chains Stage 5 synthesis through a modified prompt and scores the result. The `VoiceDial` class demonstrates the pattern of appending modifier text to the base prompt. This task builds the higher-level automation: an LLM generates prompt variants targeting weak dimensions, and a loop selects winners across iterations. + +## Key Interfaces + +- `LLMClient.complete(system_prompt, user_prompt, response_model, modality)` — drives both variant generation and synthesis +- `ScoreRunner.synthesize_and_score(moments, creator_name, voice_level)` — scores a variant (uses voice_level=0.5 for neutral baseline) +- `_load_prompt('stage5_synthesis.txt')` — loads the base prompt to optimize +- `_get_stage_config(5)` — returns (model_override, modality) for Stage 5 + +## Steps + +1. Create `backend/pipeline/quality/variant_generator.py`: + - Define a meta-prompt constant (`VARIANT_META_PROMPT`) that instructs the LLM to act as a prompt engineer. Given the current best prompt, its per-dimension scores, and the scoring rubric summary, produce a single modified variant targeting the weakest dimension(s). 
The meta-prompt MUST instruct the LLM to preserve the JSON output format section of the synthesis prompt unchanged. + - `PromptVariantGenerator.__init__(self, client: LLMClient)` — stores the LLM client + - `generate(self, base_prompt: str, scores: ScoreResult, n: int = 2) -> list[str]` — calls the LLM `n` times with the meta-prompt, each time asking for a variant. Validates each variant: must differ from base by ≥50 chars, must still contain the JSON format instruction markers (e.g. `SynthesisResult` or `"pages"` key reference). Returns list of valid variant prompt strings. Logs and skips invalid variants. + +2. Create `backend/pipeline/quality/optimizer.py`: + - `@dataclass OptimizationResult`: best_prompt (str), best_score (ScoreResult), history (list of dicts with iteration, variant_index, prompt_text, score fields), elapsed_seconds (float) + - `OptimizationLoop.__init__(self, client: LLMClient, stage: int, fixture_path: str, iterations: int, variants_per_iter: int)` — stores config, creates ScoreRunner and PromptVariantGenerator internally + - `run(self) -> OptimizationResult` — loads base prompt via `_load_prompt(f'stage{stage}_synthesis.txt')`, loads fixture data from JSON file, runs baseline `synthesize_and_score()`, then iterates: generate N variants → score each → keep best → repeat. Prints progress per iteration. Stores full history. Returns OptimizationResult. + - Handle LLM errors gracefully: if a variant's synthesis/scoring returns a ScoreResult with `error`, log it and skip that variant (don't abort the loop). + +3. Verify both modules import cleanly from project root. 
+
+## Must-Haves
+
+- Meta-prompt instructs LLM to preserve JSON output format section
+- Variant validation: minimum diff from base, format markers present
+- Invalid/errored variants are skipped, not fatal
+- Progress output per iteration
+- OptimizationResult contains full history for downstream reporting
+  - Estimate: 1.5h
+  - Files: backend/pipeline/quality/variant_generator.py, backend/pipeline/quality/optimizer.py
+  - Verify: python -c "from pipeline.quality.variant_generator import PromptVariantGenerator; print('generator ok')" && python -c "from pipeline.quality.optimizer import OptimizationLoop, OptimizationResult; print('optimizer ok')"
+- [ ] **T02: Wire optimize CLI subcommand with leaderboard and trajectory output** — Add the `optimize` subcommand to `__main__.py` that connects PromptVariantGenerator + OptimizationLoop to the CLI, and add formatted reporting: a leaderboard table and an ASCII score trajectory chart. Write results to a JSON file.
+
+## Context
+
+T01 produced `PromptVariantGenerator` and `OptimizationLoop` with `OptimizationResult`. This task wires them into the existing CLI (which already has `fitness` and `score` subcommands) and adds human-readable output.
+
+## Steps
+
+1. Add `optimize` subparser to `__main__.py`:
+   - `--stage` (int, default 5) — pipeline stage to optimize
+   - `--iterations` (int, default 10) — number of optimization iterations
+   - `--variants-per-iter` (int, default 2) — variants generated per iteration
+   - `--file` (str, default `backend/pipeline/quality/fixtures/sample_moments.json`) — path to moments JSON fixture; defaults to the bundled fixture so the roadmap demo command (`optimize --stage 5 --iterations 10`, no `--file`) runs as written
+   - `--output-dir` (str, default `backend/pipeline/quality/results/`) — where to write result JSON
+   - Validate --stage is 5 (others not yet supported, print message and exit 1 for other values)
+   - Create output dir if it doesn't exist
+   - Instantiate `OptimizationLoop` with parsed args, call `run()`, handle result
+
+2. 
Add reporting functions (can be in `__main__.py` or a small `reporting.py`): + - `print_leaderboard(result: OptimizationResult)` — formatted table showing top 5 variants by composite score, with per-dimension breakdown. Use the same visual style as `ScoreRunner.print_report()` (plain print with alignment, score bars). + - `print_trajectory(result: OptimizationResult)` — ASCII line chart of composite score across iterations. ~20 rows height, iteration index on x-axis. Simple text rendering (no external deps). + - Print both after the optimization loop completes. + +3. Write results JSON: + - Save `OptimizationResult` to `{output_dir}/optimize_stage{N}_{timestamp}.json` with: best_prompt, best_scores, full history, config (stage, iterations, variants_per_iter, fixture_path), elapsed_seconds. + - Create `backend/pipeline/quality/results/.gitkeep` so the directory is tracked. + +4. Verify the full CLI flow: + - `python -m pipeline.quality optimize --help` shows all args + - `python -m pipeline.quality optimize --stage 5 --iterations 1 --file backend/pipeline/quality/fixtures/sample_moments.json` runs and either produces output (if LLM reachable) or shows clean connectivity error + - `python -m pipeline.quality optimize --stage 3` prints 'only stage 5 supported' and exits 1 + +## Must-Haves + +- --stage, --iterations, --variants-per-iter, --file, --output-dir CLI args +- Stage validation (only 5 supported currently) +- Leaderboard table printed after loop +- ASCII trajectory chart printed after loop +- Results JSON written to output directory +- Clean error handling (no tracebacks on LLM connectivity failure) + - Estimate: 1h + - Files: backend/pipeline/quality/__main__.py, backend/pipeline/quality/results/.gitkeep + - Verify: python -m pipeline.quality optimize --help && python -m pipeline.quality optimize --stage 3 --iterations 1 --file backend/pipeline/quality/fixtures/sample_moments.json 2>&1 | grep -q 'stage 5' diff --git 
a/.gsd/milestones/M013/slices/S03/S03-RESEARCH.md b/.gsd/milestones/M013/slices/S03/S03-RESEARCH.md new file mode 100644 index 0000000..1a931f3 --- /dev/null +++ b/.gsd/milestones/M013/slices/S03/S03-RESEARCH.md @@ -0,0 +1,128 @@ +# S03 Research: Prompt Variant Generator & Automated A/B Loop + +## Summary + +This slice builds an automated optimization loop that generates prompt variants for Stage 5 synthesis, scores each against reference articles using the existing `ScoreRunner`, and outputs a leaderboard with score trajectories. The core deliverable is `python -m pipeline.quality optimize --stage 5 --iterations 10`. + +The existing infrastructure is solid: `ScoreRunner` handles scoring, `VoiceDial` demonstrates prompt modification patterns, `LLMClient` handles LLM calls with primary/fallback. The new work is: (1) an LLM-powered prompt variant generator, (2) an optimization loop orchestrator, (3) leaderboard + trajectory reporting. + +## Recommendation + +Build in three tasks: + +1. **Prompt Variant Generator** — `PromptVariantGenerator` class that uses the LLM to mutate a base prompt, producing N variants per iteration. This is the riskiest piece (LLM must produce valid, meaningfully different prompts). +2. **Optimization Loop** — `OptimizationLoop` class that orchestrates: load base prompt → generate variants → score each → select winners → repeat. Stores results in a simple JSON log file. +3. **CLI + Reporting** — Wire the `optimize` subcommand into `__main__.py`, add leaderboard table and score trajectory ASCII chart output. + +## Implementation Landscape + +### Existing Code to Build On + +| File | What it provides | How S03 uses it | +|------|-----------------|-----------------| +| `backend/pipeline/quality/scorer.py` | `ScoreRunner.score_page()`, `ScoreRunner.synthesize_and_score()`, `ScoreResult` dataclass | Core scoring for each variant. `synthesize_and_score()` already chains synthesis → scoring. 
| +| `backend/pipeline/quality/voice_dial.py` | `VoiceDial` class with band-based prompt modification | Pattern reference for prompt modification. S03's variant generator is more flexible — LLM-driven mutations vs fixed bands. | +| `backend/pipeline/quality/__main__.py` | CLI with `fitness` and `score` subcommands | Add `optimize` subcommand here. | +| `backend/pipeline/quality/__init__.py` | `sys.path` bootstrap for project-root execution | Already handles CWD resolution. | +| `backend/pipeline/stages.py` | `_load_prompt()`, `_get_stage_config()` | Load the base prompt template, get model/modality config. Already used by `synthesize_and_score()`. | +| `backend/pipeline/schemas.py` | `SynthesisResult`, `SynthesizedPage` | Parse synthesis output. Already used by `synthesize_and_score()`. | +| `backend/pipeline/llm_client.py` | `LLMClient.complete()`, `LLMClient.parse_response()` | Drive both variant generation and synthesis calls. | +| `backend/pipeline/quality/fixtures/sample_moments.json` | 6-moment fixture file | Default test data for optimization runs. | +| `prompts/stage5_synthesis.txt` | 129-line Stage 5 synthesis prompt | Base prompt to optimize. Path: `./prompts/` from backend CWD or project root via symlink. | + +### Prompt Variant Generation Strategy + +The generator should use the LLM itself to produce prompt variants. Approach: + +1. **Meta-prompt**: A system prompt instructing the LLM to act as a prompt engineer. Given the current best prompt and its scores, produce a variant that targets the weakest dimension(s). +2. **Mutation types**: The meta-prompt should guide the LLM toward specific mutation strategies: + - **Targeted improvement**: Focus on the lowest-scoring dimension (e.g., if `structural` is 0.4, modify section naming instructions) + - **Balanced tweak**: Small modifications across multiple dimensions + - **Creative restructure**: Reorganize prompt sections or add new constraints +3. 
**Constraint**: Variants must preserve the core JSON output format instructions (the synthesis must still produce valid `SynthesisResult` JSON). The meta-prompt must instruct the LLM not to modify the output format section. +4. **Output**: The variant generator returns the full modified prompt text as a string. + +Key risk: The LLM might produce variants that are too similar to the base or that break the output format. Mitigation: validate that the variant differs by at least N characters from the base, and that it still contains the JSON format instructions. + +### Optimization Loop Design + +``` +OptimizationLoop: + __init__(client, stage, fixture_path, iterations, variants_per_iter) + + run() → OptimizationResult: + base_prompt = _load_prompt(f"stage{stage}_synthesis.txt") + baseline_score = synthesize_and_score(base_prompt, moments) + + best = (base_prompt, baseline_score) + history = [baseline_score] + + for i in range(iterations): + variants = generator.generate(best.prompt, best.scores, n=variants_per_iter) + for variant in variants: + score = synthesize_and_score(variant, moments) + history.append(score) + if score.composite > best.composite: + best = (variant, score) + + return OptimizationResult(best, history, leaderboard) +``` + +Design decisions to note: +- **Variants per iteration**: Default 2-3 (each requires a synthesis + scoring LLM call, so cost is ~2 calls per variant per iteration). +- **Selection strategy**: Simple greedy — keep the best-scoring variant as the seed for next iteration. No population/crossover complexity. +- **Persistence**: Write a JSON log file after each iteration with prompt text, scores, and metadata. This allows resuming and provides the data for trajectory charts. +- **Stage parameter**: Initially only `--stage 5` works. The `--stage` flag is present for S04 expansion. + +### Reporting + +- **Leaderboard**: Top N variants by composite score, showing per-dimension breakdown. 
Use formatted terminal output (existing `print_report` pattern or `rich` library which is installed). +- **Score trajectory**: ASCII line chart showing composite score across iterations. Can use a simple text-based chart (20-30 lines of code) — no need for matplotlib. The iteration count is small (10-20) so ASCII is sufficient. +- **Output file**: Save full results to a JSON file at `backend/pipeline/quality/results/` for later analysis. + +### CLI Interface + +``` +python -m pipeline.quality optimize \ + --stage 5 \ + --iterations 10 \ + --variants-per-iter 2 \ + --file backend/pipeline/quality/fixtures/sample_moments.json \ + [--output-dir backend/pipeline/quality/results/] +``` + +Add to `__main__.py` alongside existing `fitness` and `score` subcommands. + +### Dependencies + +No new pip packages needed. `rich` is already installed for table output if desired, but plain formatted print (matching existing `print_report` style) is simpler and consistent. + +### File Plan + +| File | Action | Description | +|------|--------|-------------| +| `backend/pipeline/quality/variant_generator.py` | Create | `PromptVariantGenerator` class with LLM-powered prompt mutation | +| `backend/pipeline/quality/optimizer.py` | Create | `OptimizationLoop` class orchestrating generate→score→select cycle | +| `backend/pipeline/quality/__main__.py` | Modify | Add `optimize` subcommand | +| `backend/pipeline/quality/results/` | Create dir | Output directory for optimization run results | + +### Verification Strategy + +1. `python -m pipeline.quality optimize --help` — shows --stage, --iterations, --variants-per-iter, --file args +2. Import check: `python -c "from pipeline.quality.variant_generator import PromptVariantGenerator; print('ok')"` +3. Import check: `python -c "from pipeline.quality.optimizer import OptimizationLoop; print('ok')"` +4. 
Dry-run with fixture data (will fail at LLM call but should show clean error, not traceback): `python -m pipeline.quality optimize --stage 5 --iterations 1 --file backend/pipeline/quality/fixtures/sample_moments.json`
+5. Unit-level: `PromptVariantGenerator` meta-prompt construction can be tested without LLM by verifying prompt assembly
+
+### Risks
+
+1. **LLM variant quality**: The meta-prompt for generating variants is the trickiest part. If the LLM produces near-identical variants or breaks output format, the loop produces no improvement. Mitigation: validate variant diff and format preservation.
+2. **Cost per iteration**: Each variant needs synthesis + scoring = 2 LLM calls. 10 iterations × 2 variants × 2 calls each = 40 LLM calls total (plus one generation call per variant, ~60 calls overall). With a slow/rate-limited endpoint, this could take 10-20 minutes. Add progress output per iteration.
+3. **Prompt file path resolution**: Known issue from S02 — `_load_prompt` uses `settings.prompts_path` which defaults to `./prompts`. Works from backend/ but from project root needs the symlink. Already handled by existing symlink setup.
+
+### Natural Task Seams
+
+1. **T01: PromptVariantGenerator** — Self-contained class in new file. Testable via import + prompt assembly check. No dependency on optimizer.
+2. **T02: OptimizationLoop + CLI** — Depends on T01. Wires generator + scorer into loop, adds CLI subcommand, adds reporting/output.
+
+Two tasks is the natural decomposition. T01 is the riskiest (meta-prompt design), T02 is integration + reporting. 
diff --git a/.gsd/milestones/M013/slices/S03/tasks/T01-PLAN.md b/.gsd/milestones/M013/slices/S03/tasks/T01-PLAN.md new file mode 100644 index 0000000..57adce1 --- /dev/null +++ b/.gsd/milestones/M013/slices/S03/tasks/T01-PLAN.md @@ -0,0 +1,60 @@ +--- +estimated_steps: 25 +estimated_files: 2 +skills_used: [] +--- + +# T01: Build PromptVariantGenerator and OptimizationLoop engine + +Create two new modules: `variant_generator.py` with an LLM-powered prompt mutation engine, and `optimizer.py` with an optimization loop that orchestrates generate→score→select cycles. + +## Context + +The existing `ScoreRunner.synthesize_and_score()` already chains Stage 5 synthesis through a modified prompt and scores the result. The `VoiceDial` class demonstrates the pattern of appending modifier text to the base prompt. This task builds the higher-level automation: an LLM generates prompt variants targeting weak dimensions, and a loop selects winners across iterations. + +## Key Interfaces + +- `LLMClient.complete(system_prompt, user_prompt, response_model, modality)` — drives both variant generation and synthesis +- `ScoreRunner.synthesize_and_score(moments, creator_name, voice_level)` — scores a variant (uses voice_level=0.5 for neutral baseline) +- `_load_prompt('stage5_synthesis.txt')` — loads the base prompt to optimize +- `_get_stage_config(5)` — returns (model_override, modality) for Stage 5 + +## Steps + +1. Create `backend/pipeline/quality/variant_generator.py`: + - Define a meta-prompt constant (`VARIANT_META_PROMPT`) that instructs the LLM to act as a prompt engineer. Given the current best prompt, its per-dimension scores, and the scoring rubric summary, produce a single modified variant targeting the weakest dimension(s). The meta-prompt MUST instruct the LLM to preserve the JSON output format section of the synthesis prompt unchanged. 
+ - `PromptVariantGenerator.__init__(self, client: LLMClient)` — stores the LLM client + - `generate(self, base_prompt: str, scores: ScoreResult, n: int = 2) -> list[str]` — calls the LLM `n` times with the meta-prompt, each time asking for a variant. Validates each variant: must differ from base by ≥50 chars, must still contain the JSON format instruction markers (e.g. `SynthesisResult` or `"pages"` key reference). Returns list of valid variant prompt strings. Logs and skips invalid variants. + +2. Create `backend/pipeline/quality/optimizer.py`: + - `@dataclass OptimizationResult`: best_prompt (str), best_score (ScoreResult), history (list of dicts with iteration, variant_index, prompt_text, score fields), elapsed_seconds (float) + - `OptimizationLoop.__init__(self, client: LLMClient, stage: int, fixture_path: str, iterations: int, variants_per_iter: int)` — stores config, creates ScoreRunner and PromptVariantGenerator internally + - `run(self) -> OptimizationResult` — loads base prompt via `_load_prompt(f'stage{stage}_synthesis.txt')`, loads fixture data from JSON file, runs baseline `synthesize_and_score()`, then iterates: generate N variants → score each → keep best → repeat. Prints progress per iteration. Stores full history. Returns OptimizationResult. + - Handle LLM errors gracefully: if a variant's synthesis/scoring returns a ScoreResult with `error`, log it and skip that variant (don't abort the loop). + +3. Verify both modules import cleanly from project root. 
+ +## Must-Haves + +- Meta-prompt instructs LLM to preserve JSON output format section +- Variant validation: minimum diff from base, format markers present +- Invalid/errored variants are skipped, not fatal +- Progress output per iteration +- OptimizationResult contains full history for downstream reporting + +## Inputs + +- `backend/pipeline/quality/scorer.py` — ScoreRunner and ScoreResult classes used by optimizer +- `backend/pipeline/quality/voice_dial.py` — pattern reference for prompt modification +- `backend/pipeline/llm_client.py` — LLMClient used by both generator and optimizer +- `backend/pipeline/stages.py` — _load_prompt() and _get_stage_config() used by optimizer +- `backend/pipeline/quality/fixtures/sample_moments.json` — fixture format reference + +## Expected Output + +- `backend/pipeline/quality/variant_generator.py` — PromptVariantGenerator class with LLM-powered prompt mutation +- `backend/pipeline/quality/optimizer.py` — OptimizationLoop class with generate→score→select cycle and OptimizationResult dataclass + +## Verification + +python -c "from pipeline.quality.variant_generator import PromptVariantGenerator; print('generator ok')" && python -c "from pipeline.quality.optimizer import OptimizationLoop, OptimizationResult; print('optimizer ok')" diff --git a/.gsd/milestones/M013/slices/S03/tasks/T01-SUMMARY.md b/.gsd/milestones/M013/slices/S03/tasks/T01-SUMMARY.md new file mode 100644 index 0000000..023b865 --- /dev/null +++ b/.gsd/milestones/M013/slices/S03/tasks/T01-SUMMARY.md @@ -0,0 +1,78 @@ +--- +id: T01 +parent: S03 +milestone: M013 +provides: [] +requires: [] +affects: [] +key_files: ["backend/pipeline/quality/variant_generator.py", "backend/pipeline/quality/optimizer.py"] +key_decisions: ["Variant validation uses both length diff and line-level symmetric difference to catch trivial mutations", "OptimizationLoop bypasses VoiceDial — the loop owns the full prompt text directly"] +patterns_established: [] +drill_down_paths: [] 
+observability_surfaces: [] +duration: "" +verification_result: 'Both modules import cleanly: `python -c "from pipeline.quality.variant_generator import PromptVariantGenerator"` and `python -c "from pipeline.quality.optimizer import OptimizationLoop, OptimizationResult"` both exit 0.' +completed_at: 2026-04-01T09:07:58.463Z +blocker_discovered: false +--- + +# T01: Created PromptVariantGenerator (LLM-powered prompt mutation) and OptimizationLoop (iterative generate→score→select engine) with full error tolerance and progress reporting + +> Created PromptVariantGenerator (LLM-powered prompt mutation) and OptimizationLoop (iterative generate→score→select engine) with full error tolerance and progress reporting + +## What Happened +--- +id: T01 +parent: S03 +milestone: M013 +key_files: + - backend/pipeline/quality/variant_generator.py + - backend/pipeline/quality/optimizer.py +key_decisions: + - Variant validation uses both length diff and line-level symmetric difference to catch trivial mutations + - OptimizationLoop bypasses VoiceDial — the loop owns the full prompt text directly +duration: "" +verification_result: passed +completed_at: 2026-04-01T09:07:58.464Z +blocker_discovered: false +--- + +# T01: Created PromptVariantGenerator (LLM-powered prompt mutation) and OptimizationLoop (iterative generate→score→select engine) with full error tolerance and progress reporting + +**Created PromptVariantGenerator (LLM-powered prompt mutation) and OptimizationLoop (iterative generate→score→select engine) with full error tolerance and progress reporting** + +## What Happened + +Built two modules in backend/pipeline/quality/. variant_generator.py provides PromptVariantGenerator with a meta-prompt instructing the LLM to act as a prompt engineer — generates N variants targeting weakest dimensions, validates each (min diff + format markers), skips invalid ones. 
optimizer.py provides OptimizationLoop that orchestrates load→baseline score→iterate(generate→score→select)→report cycles, with OptimizationResult capturing full history for downstream leaderboard/charting. + +## Verification + +Both modules import cleanly: `python -c "from pipeline.quality.variant_generator import PromptVariantGenerator"` and `python -c "from pipeline.quality.optimizer import OptimizationLoop, OptimizationResult"` both exit 0. + +## Verification Evidence + +| # | Command | Exit Code | Verdict | Duration | +|---|---------|-----------|---------|----------| +| 1 | `python -c "from pipeline.quality.variant_generator import PromptVariantGenerator; print('generator ok')"` | 0 | ✅ pass | 500ms | +| 2 | `python -c "from pipeline.quality.optimizer import OptimizationLoop, OptimizationResult; print('optimizer ok')"` | 0 | ✅ pass | 500ms | + + +## Deviations + +OptimizationLoop._score_variant does its own synthesis call rather than delegating to ScoreRunner.synthesize_and_score() to avoid VoiceDial double-application since the loop owns the full prompt text directly. + +## Known Issues + +None. + +## Files Created/Modified + +- `backend/pipeline/quality/variant_generator.py` +- `backend/pipeline/quality/optimizer.py` + + +## Deviations +OptimizationLoop._score_variant does its own synthesis call rather than delegating to ScoreRunner.synthesize_and_score() to avoid VoiceDial double-application since the loop owns the full prompt text directly. + +## Known Issues +None. 
diff --git a/.gsd/milestones/M013/slices/S03/tasks/T02-PLAN.md b/.gsd/milestones/M013/slices/S03/tasks/T02-PLAN.md new file mode 100644 index 0000000..c2bc392 --- /dev/null +++ b/.gsd/milestones/M013/slices/S03/tasks/T02-PLAN.md @@ -0,0 +1,64 @@ +--- +estimated_steps: 31 +estimated_files: 2 +skills_used: [] +--- + +# T02: Wire optimize CLI subcommand with leaderboard and trajectory output + +Add the `optimize` subcommand to `__main__.py` that connects PromptVariantGenerator + OptimizationLoop to the CLI, and add formatted reporting: a leaderboard table and an ASCII score trajectory chart. Write results to a JSON file. + +## Context + +T01 produced `PromptVariantGenerator` and `OptimizationLoop` with `OptimizationResult`. This task wires them into the existing CLI (which already has `fitness` and `score` subcommands) and adds human-readable output. + +## Steps + +1. Add `optimize` subparser to `__main__.py`: + - `--stage` (int, default 5) — pipeline stage to optimize + - `--iterations` (int, default 10) — number of optimization iterations + - `--variants-per-iter` (int, default 2) — variants generated per iteration + - `--file` (str, required) — path to moments JSON fixture + - `--output-dir` (str, default `backend/pipeline/quality/results/`) — where to write result JSON + - Validate --stage is 5 (others not yet supported, print message and exit 1 for other values) + - Create output dir if it doesn't exist + - Instantiate `OptimizationLoop` with parsed args, call `run()`, handle result + +2. Add reporting functions (can be in `__main__.py` or a small `reporting.py`): + - `print_leaderboard(result: OptimizationResult)` — formatted table showing top 5 variants by composite score, with per-dimension breakdown. Use the same visual style as `ScoreRunner.print_report()` (plain print with alignment, score bars). + - `print_trajectory(result: OptimizationResult)` — ASCII line chart of composite score across iterations. ~20 rows height, iteration index on x-axis. 
Simple text rendering (no external deps). + - Print both after the optimization loop completes. + +3. Write results JSON: + - Save `OptimizationResult` to `{output_dir}/optimize_stage{N}_{timestamp}.json` with: best_prompt, best_scores, full history, config (stage, iterations, variants_per_iter, fixture_path), elapsed_seconds. + - Create `backend/pipeline/quality/results/.gitkeep` so the directory is tracked. + +4. Verify the full CLI flow: + - `python -m pipeline.quality optimize --help` shows all args + - `python -m pipeline.quality optimize --stage 5 --iterations 1 --file backend/pipeline/quality/fixtures/sample_moments.json` runs and either produces output (if LLM reachable) or shows clean connectivity error + - `python -m pipeline.quality optimize --stage 3` prints 'only stage 5 supported' and exits 1 + +## Must-Haves + +- --stage, --iterations, --variants-per-iter, --file, --output-dir CLI args +- Stage validation (only 5 supported currently) +- Leaderboard table printed after loop +- ASCII trajectory chart printed after loop +- Results JSON written to output directory +- Clean error handling (no tracebacks on LLM connectivity failure) + +## Inputs + +- `backend/pipeline/quality/variant_generator.py` — PromptVariantGenerator from T01 +- `backend/pipeline/quality/optimizer.py` — OptimizationLoop and OptimizationResult from T01 +- `backend/pipeline/quality/__main__.py` — existing CLI with fitness and score subcommands +- `backend/pipeline/quality/scorer.py` — ScoreRunner.print_report() style reference for formatting + +## Expected Output + +- `backend/pipeline/quality/__main__.py` — updated with optimize subcommand, leaderboard, trajectory chart, results JSON output +- `backend/pipeline/quality/results/.gitkeep` — tracked output directory + +## Verification + +python -m pipeline.quality optimize --help && python -m pipeline.quality optimize --stage 3 --iterations 1 --file backend/pipeline/quality/fixtures/sample_moments.json 2>&1 | grep -q 'stage 5' 
diff --git a/backend/pipeline/quality/__init__.py b/backend/pipeline/quality/__init__.py index e69de29..6474832 100644 --- a/backend/pipeline/quality/__init__.py +++ b/backend/pipeline/quality/__init__.py @@ -0,0 +1,11 @@ +"""FYN-LLM quality assurance toolkit.""" + +import os +import sys + +# Ensure backend/ is on sys.path so sibling modules (config, pipeline.llm_client) +# resolve when running from the project root via symlink. +_backend_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..") +_backend_abs = os.path.normpath(os.path.abspath(_backend_dir)) +if _backend_abs not in sys.path: + sys.path.insert(0, _backend_abs) diff --git a/backend/pipeline/quality/optimizer.py b/backend/pipeline/quality/optimizer.py new file mode 100644 index 0000000..7aebb85 --- /dev/null +++ b/backend/pipeline/quality/optimizer.py @@ -0,0 +1,364 @@ +"""Automated prompt optimization loop for Stage 5 synthesis. + +Orchestrates a generate→score→select cycle: +1. Score the current best prompt against reference fixtures +2. Generate N variants targeting weak dimensions +3. Score each variant +4. Keep the best scorer as the new baseline +5. Repeat for K iterations + +Usage (via CLI): + python -m pipeline.quality optimize --stage 5 --iterations 10 +""" +from __future__ import annotations + +import json +import logging +import time +from dataclasses import dataclass, field +from pathlib import Path + +from pipeline.llm_client import LLMClient +from pipeline.quality.scorer import DIMENSIONS, ScoreResult, ScoreRunner +from pipeline.quality.variant_generator import PromptVariantGenerator + +logger = logging.getLogger(__name__) + + +@dataclass +class OptimizationResult: + """Full result of an optimization run.""" + + best_prompt: str = "" + best_score: ScoreResult = field(default_factory=ScoreResult) + history: list[dict] = field(default_factory=list) + elapsed_seconds: float = 0.0 + + +class OptimizationLoop: + """Runs iterative prompt optimization for a pipeline stage. 
+ + Each iteration generates *variants_per_iter* prompt mutations, + scores each against reference fixture data, and keeps the + highest-composite-scoring variant as the new baseline. + + Parameters + ---------- + client: + LLMClient instance for LLM calls (synthesis + scoring + variant gen). + stage: + Pipeline stage number (currently only 5 is supported). + fixture_path: + Path to a JSON fixture file containing ``creator_name`` and ``moments``. + iterations: + Number of generate→score→select cycles. + variants_per_iter: + Number of variant prompts to generate per iteration. + """ + + def __init__( + self, + client: LLMClient, + stage: int, + fixture_path: str, + iterations: int = 5, + variants_per_iter: int = 2, + ) -> None: + self.client = client + self.stage = stage + self.fixture_path = fixture_path + self.iterations = iterations + self.variants_per_iter = variants_per_iter + + self.scorer = ScoreRunner(client) + self.generator = PromptVariantGenerator(client) + + def run(self) -> OptimizationResult: + """Execute the full optimization loop. + + Returns + ------- + OptimizationResult + Contains the best prompt, its scores, full iteration history, + and wall-clock elapsed time. 
+ """ + from pipeline.stages import _load_prompt + + t0 = time.monotonic() + + # Load base prompt + prompt_file = f"stage{self.stage}_synthesis.txt" + try: + base_prompt = _load_prompt(prompt_file) + except FileNotFoundError: + logger.error("Prompt file not found: %s", prompt_file) + return OptimizationResult( + best_prompt="", + best_score=ScoreResult(error=f"Prompt file not found: {prompt_file}"), + elapsed_seconds=round(time.monotonic() - t0, 2), + ) + + # Load fixture data + try: + fixture = self._load_fixture() + except (FileNotFoundError, json.JSONDecodeError, KeyError) as exc: + logger.error("Failed to load fixture: %s", exc) + return OptimizationResult( + best_prompt=base_prompt, + best_score=ScoreResult(error=f"Fixture load error: {exc}"), + elapsed_seconds=round(time.monotonic() - t0, 2), + ) + + moments = fixture["moments"] + creator_name = fixture["creator_name"] + history: list[dict] = [] + + # Score the baseline + print(f"\n{'='*60}") + print(f" PROMPT OPTIMIZATION — Stage {self.stage}") + print(f" Iterations: {self.iterations}, Variants/iter: {self.variants_per_iter}") + print(f"{'='*60}\n") + + print(" Scoring baseline prompt...") + best_score = self.scorer.synthesize_and_score( + moments=moments, + creator_name=creator_name, + voice_level=0.5, + ) + best_prompt = base_prompt + + history.append({ + "iteration": 0, + "variant_index": 0, + "prompt_text": base_prompt[:200] + "..." 
if len(base_prompt) > 200 else base_prompt, + "prompt_length": len(base_prompt), + "composite": best_score.composite, + "scores": {d: getattr(best_score, d) for d in DIMENSIONS}, + "error": best_score.error, + "label": "baseline", + }) + + if best_score.error: + print(f" ✗ Baseline scoring failed: {best_score.error}") + print(" Aborting optimization — fix the baseline first.\n") + return OptimizationResult( + best_prompt=best_prompt, + best_score=best_score, + history=history, + elapsed_seconds=round(time.monotonic() - t0, 2), + ) + + self._print_iteration_summary(0, best_score, is_baseline=True) + + # Iterate + for iteration in range(1, self.iterations + 1): + print(f"\n ── Iteration {iteration}/{self.iterations} ──") + + # Generate variants + variants = self.generator.generate( + base_prompt=best_prompt, + scores=best_score, + n=self.variants_per_iter, + ) + + if not variants: + print(" ⚠ No valid variants generated — skipping iteration") + continue + + # Score each variant + iteration_best_score = best_score + iteration_best_prompt = best_prompt + + for vi, variant_prompt in enumerate(variants): + print(f" Scoring variant {vi + 1}/{len(variants)}...") + + # Temporarily replace the base prompt with the variant for synthesis + score = self._score_variant( + variant_prompt, moments, creator_name, + ) + + history.append({ + "iteration": iteration, + "variant_index": vi + 1, + "prompt_text": variant_prompt[:200] + "..." 
if len(variant_prompt) > 200 else variant_prompt, + "prompt_length": len(variant_prompt), + "composite": score.composite, + "scores": {d: getattr(score, d) for d in DIMENSIONS}, + "error": score.error, + "label": f"iter{iteration}_v{vi+1}", + }) + + if score.error: + print(f" ✗ Variant {vi + 1} errored: {score.error}") + continue + + if score.composite > iteration_best_score.composite: + iteration_best_score = score + iteration_best_prompt = variant_prompt + print(f" ✓ New best: {score.composite:.3f} (was {best_score.composite:.3f})") + else: + print(f" · Score {score.composite:.3f} ≤ current best {iteration_best_score.composite:.3f}") + + # Update global best if this iteration improved + if iteration_best_score.composite > best_score.composite: + best_score = iteration_best_score + best_prompt = iteration_best_prompt + print(f" ★ Iteration {iteration} improved: {best_score.composite:.3f}") + else: + print(f" · No improvement in iteration {iteration}") + + self._print_iteration_summary(iteration, best_score) + + # Final report + elapsed = round(time.monotonic() - t0, 2) + self._print_final_report(best_score, history, elapsed) + + return OptimizationResult( + best_prompt=best_prompt, + best_score=best_score, + history=history, + elapsed_seconds=elapsed, + ) + + # ── Internal helpers ────────────────────────────────────────────────── + + def _load_fixture(self) -> dict: + """Load and validate the fixture JSON file.""" + path = Path(self.fixture_path) + if not path.exists(): + raise FileNotFoundError(f"Fixture not found: {path}") + data = json.loads(path.read_text(encoding="utf-8")) + + if "moments" not in data: + raise KeyError("Fixture must contain 'moments' key") + if "creator_name" not in data: + raise KeyError("Fixture must contain 'creator_name' key") + + return data + + def _score_variant( + self, + variant_prompt: str, + moments: list[dict], + creator_name: str, + ) -> ScoreResult: + """Score a variant prompt by running synthesis + scoring. 
+ + Uses the variant as a direct system prompt for synthesis, bypassing + VoiceDial (the optimization loop owns the full prompt text). + """ + from pipeline.schemas import SynthesisResult + from pipeline.stages import _get_stage_config + + import json as _json + import openai as _openai + + model_override, modality = _get_stage_config(self.stage) + + moments_json = _json.dumps(moments, indent=2) + user_prompt = f"{creator_name}\n\n{moments_json}\n" + + t0 = time.monotonic() + try: + raw = self.client.complete( + system_prompt=variant_prompt, + user_prompt=user_prompt, + response_model=SynthesisResult, + modality=modality, + model_override=model_override, + ) + elapsed_synth = round(time.monotonic() - t0, 2) + except (_openai.APIConnectionError, _openai.APITimeoutError) as exc: + elapsed_synth = round(time.monotonic() - t0, 2) + return ScoreResult( + elapsed_seconds=elapsed_synth, + error=f"Synthesis LLM error: {exc}", + ) + except Exception as exc: + elapsed_synth = round(time.monotonic() - t0, 2) + logger.exception("Unexpected error during variant synthesis") + return ScoreResult( + elapsed_seconds=elapsed_synth, + error=f"Unexpected synthesis error: {exc}", + ) + + # Parse synthesis + raw_text = str(raw).strip() + try: + synthesis = self.client.parse_response(raw_text, SynthesisResult) + except Exception as exc: + return ScoreResult( + elapsed_seconds=elapsed_synth, + error=f"Variant synthesis parse error: {exc}", + ) + + if not synthesis.pages: + return ScoreResult( + elapsed_seconds=elapsed_synth, + error="Variant synthesis returned no pages", + ) + + # Score the first page + page = synthesis.pages[0] + page_json = { + "title": page.title, + "creator_name": creator_name, + "summary": page.summary, + "body_sections": [ + {"heading": heading, "content": content} + for heading, content in page.body_sections.items() + ], + } + + result = self.scorer.score_page(page_json, moments) + result.elapsed_seconds = round(result.elapsed_seconds + elapsed_synth, 2) + return 
result + + def _print_iteration_summary( + self, + iteration: int, + score: ScoreResult, + is_baseline: bool = False, + ) -> None: + """Print a compact one-line summary of the current best scores.""" + label = "BASELINE" if is_baseline else f"ITER {iteration}" + dims = " ".join( + f"{d[:4]}={getattr(score, d):.2f}" for d in DIMENSIONS + ) + print(f" [{label}] composite={score.composite:.3f} {dims}") + + def _print_final_report( + self, + best_score: ScoreResult, + history: list[dict], + elapsed: float, + ) -> None: + """Print the final optimization summary.""" + print(f"\n{'='*60}") + print(" OPTIMIZATION COMPLETE") + print(f"{'='*60}") + print(f" Total time: {elapsed}s") + print(f" Iterations: {self.iterations}") + print(f" Variants scored: {len(history) - 1}") # minus baseline + + baseline_composite = history[0]["composite"] if history else 0.0 + improvement = best_score.composite - baseline_composite + + print(f"\n Baseline composite: {baseline_composite:.3f}") + print(f" Best composite: {best_score.composite:.3f}") + if improvement > 0: + print(f" Improvement: +{improvement:.3f}") + else: + print(f" Improvement: {improvement:.3f} (no gain)") + + print(f"\n Per-dimension best scores:") + for d in DIMENSIONS: + val = getattr(best_score, d) + bar = "█" * int(val * 20) + "░" * (20 - int(val * 20)) + print(f" {d.replace('_', ' ').title():25s} {val:.2f} {bar}") + + errored = sum(1 for h in history if h.get("error")) + if errored: + print(f"\n ⚠ {errored} variant(s) errored during scoring") + + print(f"{'='*60}\n") diff --git a/backend/pipeline/quality/variant_generator.py b/backend/pipeline/quality/variant_generator.py new file mode 100644 index 0000000..3a20adf --- /dev/null +++ b/backend/pipeline/quality/variant_generator.py @@ -0,0 +1,194 @@ +"""LLM-powered prompt variant generator for automated optimization. 
+ +Uses a meta-prompt to instruct the LLM to act as a prompt engineer, +analyzing per-dimension scores and producing targeted prompt mutations +that improve the weakest scoring dimensions while preserving the JSON +output format required by downstream parsing. +""" +from __future__ import annotations + +import logging + +from pipeline.llm_client import LLMClient +from pipeline.quality.scorer import DIMENSIONS, ScoreResult + +logger = logging.getLogger(__name__) + + +# ── Meta-prompt for variant generation ──────────────────────────────────────── + +VARIANT_META_PROMPT = """\ +You are an expert prompt engineer specializing in LLM-powered content synthesis. + +Your task: given a synthesis prompt and its quality evaluation scores, produce an +improved variant of the prompt that targets the weakest-scoring dimensions while +maintaining or improving the others. + +## Scoring Dimensions (each 0.0–1.0) + +- **structural** — Section naming, count (3-6), paragraph depth (2-5 per section) +- **content_specificity** — Concrete details: frequencies, time values, ratios, plugin names, dB values +- **voice_preservation** — Direct quotes preserved, opinions attributed to creator by name, personality retained +- **readability** — Cohesive article flow, related info merged, no redundancy or contradiction +- **factual_fidelity** — Every claim traceable to source material, no hallucinated specifics + +## Rules + +1. Focus your changes on the weakest 1-2 dimensions. Don't dilute the prompt by trying to fix everything. +2. Add specific, actionable instructions — not vague encouragements. +3. **CRITICAL: You MUST preserve the JSON output format section of the prompt EXACTLY as-is.** + The prompt contains instructions about outputting a JSON object with a specific schema + (SynthesisResult with "pages" containing title, summary, body_sections, etc.). + Do NOT modify, remove, or rephrase any part of the JSON format instructions. 
+ Your changes should target the prose synthesis guidelines only. +4. Keep the overall prompt length within 2x of the original. Don't bloat it. +5. Make substantive changes — rewording a sentence or adding one adjective is not enough. + +## Output + +Return ONLY the full modified prompt text. No explanation, no markdown fences, no preamble. +Just the complete prompt that could be used directly as a system prompt. +""" + + +# Format markers that must survive variant generation — if any of these +# are present in the base prompt, the variant must also contain them. +_FORMAT_MARKERS = ["SynthesisResult", '"pages"', "body_sections", "title", "summary"] + + +class PromptVariantGenerator: + """Generates prompt variants by asking an LLM to act as a prompt engineer. + + Given a base prompt and its evaluation scores, produces N mutated + variants targeting the weakest dimensions. + """ + + def __init__(self, client: LLMClient) -> None: + self.client = client + + def generate( + self, + base_prompt: str, + scores: ScoreResult, + n: int = 2, + ) -> list[str]: + """Generate up to *n* valid prompt variants. + + Each variant is produced by a separate LLM call with the meta-prompt. + Variants are validated: they must differ from the base by ≥50 characters + and must contain the JSON format instruction markers found in the base. + + Invalid variants are logged and skipped. + + Parameters + ---------- + base_prompt: + The current best synthesis prompt text. + scores: + ScoreResult from the most recent evaluation of *base_prompt*. + n: + Number of variants to attempt generating. + + Returns + ------- + list[str] + Valid variant prompt strings (may be fewer than *n*). 
+ """ + user_prompt = self._build_user_prompt(base_prompt, scores) + # Identify which format markers are actually present in the base + required_markers = [m for m in _FORMAT_MARKERS if m in base_prompt] + + variants: list[str] = [] + for i in range(n): + logger.info("Generating variant %d/%d...", i + 1, n) + try: + raw = self.client.complete( + system_prompt=VARIANT_META_PROMPT, + user_prompt=user_prompt, + response_model=None, # free-form text, not JSON + modality="chat", + ) + variant = str(raw).strip() + except Exception: + logger.exception("LLM error generating variant %d/%d", i + 1, n) + continue + + # Validate the variant + if not self._validate(variant, base_prompt, required_markers, i + 1): + continue + + variants.append(variant) + logger.info("Variant %d/%d accepted (%d chars)", i + 1, n, len(variant)) + + logger.info( + "Generated %d valid variant(s) out of %d attempts", len(variants), n + ) + return variants + + # ── Internal helpers ────────────────────────────────────────────────── + + def _build_user_prompt(self, base_prompt: str, scores: ScoreResult) -> str: + """Build the user message describing the current prompt and its scores.""" + # Build per-dimension score lines, sorted worst-first + dim_lines: list[str] = [] + dim_scores = [(d, getattr(scores, d, 0.0)) for d in DIMENSIONS] + dim_scores.sort(key=lambda x: x[1]) + + for dim, val in dim_scores: + justification = scores.justifications.get(dim, "") + label = dim.replace("_", " ").title() + line = f" {label}: {val:.2f}" + if justification: + line += f" — {justification}" + dim_lines.append(line) + + weakest = dim_scores[0][0].replace("_", " ").title() + second_weakest = dim_scores[1][0].replace("_", " ").title() if len(dim_scores) > 1 else weakest + + return ( + f"## Current Prompt\n\n{base_prompt}\n\n" + f"## Evaluation Scores (sorted weakest → strongest)\n\n" + + "\n".join(dim_lines) + + f"\n\n Composite: {scores.composite:.3f}\n\n" + f"## Priority\n\n" + f"The weakest dimensions are 
**{weakest}** and **{second_weakest}**. " + f"Focus your prompt modifications on improving these.\n\n" + f"Return the full modified prompt now." + ) + + def _validate( + self, + variant: str, + base_prompt: str, + required_markers: list[str], + index: int, + ) -> bool: + """Check a variant meets minimum quality gates.""" + if not variant: + logger.warning("Variant %d is empty — skipping", index) + return False + + # Must differ meaningfully from base + diff = abs(len(variant) - len(base_prompt)) + # Also check actual content difference via set-symmetric-difference of lines + base_lines = set(base_prompt.splitlines()) + variant_lines = set(variant.splitlines()) + changed_lines = len(base_lines.symmetric_difference(variant_lines)) + + if diff < 50 and changed_lines < 3: + logger.warning( + "Variant %d too similar to base (len_diff=%d, changed_lines=%d) — skipping", + index, diff, changed_lines, + ) + return False + + # Must preserve format markers + missing = [m for m in required_markers if m not in variant] + if missing: + logger.warning( + "Variant %d missing format markers %s — skipping", + index, missing, + ) + return False + + return True diff --git a/pipeline b/pipeline new file mode 120000 index 0000000..1369d83 --- /dev/null +++ b/pipeline @@ -0,0 +1 @@ +backend/pipeline \ No newline at end of file