From 8272da430bba54b876fd277918a6df39686eb531 Mon Sep 17 00:00:00 2001 From: jlightner Date: Wed, 1 Apr 2026 01:51:28 -0500 Subject: [PATCH] fix: Variable ordering bug and stage 5 truncation recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: 1. page_moment_indices was referenced before assignment in the page persist loop — moved assignment to top of loop body. This caused "cannot access local variable" errors on every stage 5 run. 2. Stage 5 now catches LLMTruncationError and splits the chunk in half for retry, instead of blindly retrying the same oversized prompt. This handles categories where synthesis output exceeds the model context window. Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/pipeline/stages.py | 54 +++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/backend/pipeline/stages.py b/backend/pipeline/stages.py index 1532c9f..740c179 100644 --- a/backend/pipeline/stages.py +++ b/backend/pipeline/stages.py @@ -1049,19 +1049,47 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str: for _, cls_info in moment_group: all_tags.update(cls_info.get("topic_tags", [])) - # ── Chunked synthesis ──────────────────────────────────── + # ── Chunked synthesis with truncation recovery ───────── if len(moment_group) <= chunk_size: - # Small group — single LLM call (original behavior) - result = _synthesize_chunk( - moment_group, category, creator_name, - system_prompt, llm, model_override, modality, hard_limit, - video_id, run_id, f"category:{category}", - ) - synthesized_pages = list(result.pages) - logger.info( - "Stage 5: category '%s' — %d moments, %d page(s) from single call", - category, len(moment_group), len(synthesized_pages), - ) + # Small group — try single LLM call first + try: + result = _synthesize_chunk( + moment_group, category, creator_name, + system_prompt, llm, model_override, modality, hard_limit, + video_id, run_id, f"category:{category}", + ) + synthesized_pages = list(result.pages) + logger.info( + "Stage 5: category '%s' — %d moments, %d page(s) from single call", + category, len(moment_group), len(synthesized_pages), + ) + except LLMTruncationError: + # Output too large for model context — split in half and retry + logger.warning( + "Stage 5: category '%s' truncated with %d moments. " + "Splitting into smaller chunks and retrying.", + category, len(moment_group), + ) + half = max(1, len(moment_group) // 2) + chunk_pages = [] + for sub_start in range(0, len(moment_group), half): + sub_chunk = moment_group[sub_start:sub_start + half] + sub_label = f"category:{category} recovery-chunk:{sub_start // half + 1}" + sub_result = _synthesize_chunk( + sub_chunk, category, creator_name, + system_prompt, llm, model_override, modality, hard_limit, + video_id, run_id, sub_label, + ) + # Reindex moment_indices to global offsets + for p in sub_result.pages: + if p.moment_indices: + p.moment_indices = [idx + sub_start for idx in p.moment_indices] + chunk_pages.extend(sub_result.pages) + synthesized_pages = chunk_pages + logger.info( + "Stage 5: category '%s' — %d page(s) from recovery split", + category, len(synthesized_pages), + ) else: # Large group — split into chunks, synthesize each, then merge num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size @@ -1108,6 +1136,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str: prior_page_ids = _load_prior_pages(video_id) for page_data in synthesized_pages: + page_moment_indices = getattr(page_data, "moment_indices", None) or [] existing = None # First: check by slug (most specific match) @@ -1202,7 +1231,6 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str: pages_created += 1 # Link moments to the technique page using moment_indices - page_moment_indices = getattr(page_data, "moment_indices", None) or [] if page_moment_indices: # LLM specified which moments belong to this page