fix: Variable ordering bug and stage 5 truncation recovery

Two fixes:

1. page_moment_indices was referenced before assignment in the page
   persist loop — moved assignment to top of loop body. This caused
   "cannot access local variable" errors on every stage 5 run.

2. Stage 5 now catches LLMTruncationError and splits the chunk in
   half for retry, instead of blindly retrying the same oversized
   prompt. This handles categories where synthesis output exceeds
   the model context window.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
jlightner 2026-04-01 01:51:28 -05:00
parent c0df8a7908
commit 8272da430b

View file

@@ -1049,9 +1049,10 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
for _, cls_info in moment_group: for _, cls_info in moment_group:
all_tags.update(cls_info.get("topic_tags", [])) all_tags.update(cls_info.get("topic_tags", []))
# ── Chunked synthesis ──────────────────────────────────── # ── Chunked synthesis with truncation recovery ─────────
if len(moment_group) <= chunk_size: if len(moment_group) <= chunk_size:
# Small group — single LLM call (original behavior) # Small group — try single LLM call first
try:
result = _synthesize_chunk( result = _synthesize_chunk(
moment_group, category, creator_name, moment_group, category, creator_name,
system_prompt, llm, model_override, modality, hard_limit, system_prompt, llm, model_override, modality, hard_limit,
@@ -1062,6 +1063,33 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
"Stage 5: category '%s'%d moments, %d page(s) from single call", "Stage 5: category '%s'%d moments, %d page(s) from single call",
category, len(moment_group), len(synthesized_pages), category, len(moment_group), len(synthesized_pages),
) )
except LLMTruncationError:
# Output too large for model context — split in half and retry
logger.warning(
"Stage 5: category '%s' truncated with %d moments. "
"Splitting into smaller chunks and retrying.",
category, len(moment_group),
)
half = max(1, len(moment_group) // 2)
chunk_pages = []
for sub_start in range(0, len(moment_group), half):
sub_chunk = moment_group[sub_start:sub_start + half]
sub_label = f"category:{category} recovery-chunk:{sub_start // half + 1}"
sub_result = _synthesize_chunk(
sub_chunk, category, creator_name,
system_prompt, llm, model_override, modality, hard_limit,
video_id, run_id, sub_label,
)
# Reindex moment_indices to global offsets
for p in sub_result.pages:
if p.moment_indices:
p.moment_indices = [idx + sub_start for idx in p.moment_indices]
chunk_pages.extend(sub_result.pages)
synthesized_pages = chunk_pages
logger.info(
"Stage 5: category '%s'%d page(s) from recovery split",
category, len(synthesized_pages),
)
else: else:
# Large group — split into chunks, synthesize each, then merge # Large group — split into chunks, synthesize each, then merge
num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size
@@ -1108,6 +1136,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
prior_page_ids = _load_prior_pages(video_id) prior_page_ids = _load_prior_pages(video_id)
for page_data in synthesized_pages: for page_data in synthesized_pages:
page_moment_indices = getattr(page_data, "moment_indices", None) or []
existing = None existing = None
# First: check by slug (most specific match) # First: check by slug (most specific match)
@@ -1202,7 +1231,6 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
pages_created += 1 pages_created += 1
# Link moments to the technique page using moment_indices # Link moments to the technique page using moment_indices
page_moment_indices = getattr(page_data, "moment_indices", None) or []
if page_moment_indices: if page_moment_indices:
# LLM specified which moments belong to this page # LLM specified which moments belong to this page