From 8272da430bba54b876fd277918a6df39686eb531 Mon Sep 17 00:00:00 2001
From: jlightner <jlightner@ub01.a.xpltd.co>
Date: Wed, 1 Apr 2026 01:51:28 -0500
Subject: [PATCH] fix: Variable ordering bug and stage 5 truncation recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes:

1. page_moment_indices was referenced before assignment in the page
   persist loop — moved assignment to top of loop body. This caused
   "cannot access local variable" errors on every stage 5 run.

2. Stage 5 now catches LLMTruncationError and splits the chunk in
   half for retry, instead of blindly retrying the same oversized
   prompt. This handles categories where synthesis output exceeds
   the model context window.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/pipeline/stages.py | 54 +++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/backend/pipeline/stages.py b/backend/pipeline/stages.py
index 1532c9f..740c179 100644
--- a/backend/pipeline/stages.py
+++ b/backend/pipeline/stages.py
@@ -1049,19 +1049,47 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
             for _, cls_info in moment_group:
                 all_tags.update(cls_info.get("topic_tags", []))
 
-            # ── Chunked synthesis ────────────────────────────────────
+            # ── Chunked synthesis with truncation recovery ─────────
             if len(moment_group) <= chunk_size:
-                # Small group — single LLM call (original behavior)
-                result = _synthesize_chunk(
-                    moment_group, category, creator_name,
-                    system_prompt, llm, model_override, modality, hard_limit,
-                    video_id, run_id, f"category:{category}",
-                )
-                synthesized_pages = list(result.pages)
-                logger.info(
-                    "Stage 5: category '%s' — %d moments, %d page(s) from single call",
-                    category, len(moment_group), len(synthesized_pages),
-                )
+                # Small group — try single LLM call first
+                try:
+                    result = _synthesize_chunk(
+                        moment_group, category, creator_name,
+                        system_prompt, llm, model_override, modality, hard_limit,
+                        video_id, run_id, f"category:{category}",
+                    )
+                    synthesized_pages = list(result.pages)
+                    logger.info(
+                        "Stage 5: category '%s' — %d moments, %d page(s) from single call",
+                        category, len(moment_group), len(synthesized_pages),
+                    )
+                except LLMTruncationError:
+                    # Output too large for model context — split in half and retry
+                    logger.warning(
+                        "Stage 5: category '%s' truncated with %d moments. "
+                        "Splitting into smaller chunks and retrying.",
+                        category, len(moment_group),
+                    )
+                    half = max(1, len(moment_group) // 2)
+                    chunk_pages = []
+                    for sub_start in range(0, len(moment_group), half):
+                        sub_chunk = moment_group[sub_start:sub_start + half]
+                        sub_label = f"category:{category} recovery-chunk:{sub_start // half + 1}"
+                        sub_result = _synthesize_chunk(
+                            sub_chunk, category, creator_name,
+                            system_prompt, llm, model_override, modality, hard_limit,
+                            video_id, run_id, sub_label,
+                        )
+                        # Reindex moment_indices to global offsets
+                        for p in sub_result.pages:
+                            if p.moment_indices:
+                                p.moment_indices = [idx + sub_start for idx in p.moment_indices]
+                        chunk_pages.extend(sub_result.pages)
+                    synthesized_pages = chunk_pages
+                    logger.info(
+                        "Stage 5: category '%s' — %d page(s) from recovery split",
+                        category, len(synthesized_pages),
+                    )
             else:
                 # Large group — split into chunks, synthesize each, then merge
                 num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size
@@ -1108,6 +1136,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
             prior_page_ids = _load_prior_pages(video_id)
 
             for page_data in synthesized_pages:
+                page_moment_indices = getattr(page_data, "moment_indices", None) or []
                 existing = None
 
                 # First: check by slug (most specific match)
@@ -1202,7 +1231,6 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
                 pages_created += 1
 
                 # Link moments to the technique page using moment_indices
-                page_moment_indices = getattr(page_data, "moment_indices", None) or []
 
                 if page_moment_indices:
                     # LLM specified which moments belong to this page