fix: Variable ordering bug and stage 5 truncation recovery

Two fixes:

1. page_moment_indices was referenced before assignment in the page
   persist loop — moved assignment to top of loop body. This caused
   "cannot access local variable" errors on every stage 5 run.

2. Stage 5 now catches LLMTruncationError and splits the chunk in
   half for retry, instead of blindly retrying the same oversized
   prompt. This handles categories where synthesis output exceeds
   the model context window.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
jlightner 2026-04-01 01:51:28 -05:00
parent c0df8a7908
commit 8272da430b

View file

@@ -1049,9 +1049,10 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
for _, cls_info in moment_group: for _, cls_info in moment_group:
all_tags.update(cls_info.get("topic_tags", [])) all_tags.update(cls_info.get("topic_tags", []))
# ── Chunked synthesis ──────────────────────────────────── # ── Chunked synthesis with truncation recovery ─────────
if len(moment_group) <= chunk_size: if len(moment_group) <= chunk_size:
# Small group — single LLM call (original behavior) # Small group — try single LLM call first
try:
result = _synthesize_chunk( result = _synthesize_chunk(
moment_group, category, creator_name, moment_group, category, creator_name,
system_prompt, llm, model_override, modality, hard_limit, system_prompt, llm, model_override, modality, hard_limit,
@@ -1062,6 +1063,33 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
"Stage 5: category '%s'%d moments, %d page(s) from single call", "Stage 5: category '%s'%d moments, %d page(s) from single call",
category, len(moment_group), len(synthesized_pages), category, len(moment_group), len(synthesized_pages),
) )
except LLMTruncationError:
# Output too large for model context — split in half and retry
logger.warning(
"Stage 5: category '%s' truncated with %d moments. "
"Splitting into smaller chunks and retrying.",
category, len(moment_group),
)
half = max(1, len(moment_group) // 2)
chunk_pages = []
for sub_start in range(0, len(moment_group), half):
sub_chunk = moment_group[sub_start:sub_start + half]
sub_label = f"category:{category} recovery-chunk:{sub_start // half + 1}"
sub_result = _synthesize_chunk(
sub_chunk, category, creator_name,
system_prompt, llm, model_override, modality, hard_limit,
video_id, run_id, sub_label,
)
# Reindex moment_indices to global offsets
for p in sub_result.pages:
if p.moment_indices:
p.moment_indices = [idx + sub_start for idx in p.moment_indices]
chunk_pages.extend(sub_result.pages)
synthesized_pages = chunk_pages
logger.info(
"Stage 5: category '%s'%d page(s) from recovery split",
category, len(synthesized_pages),
)
else: else:
# Large group — split into chunks, synthesize each, then merge # Large group — split into chunks, synthesize each, then merge
num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size
@@ -1108,6 +1136,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
prior_page_ids = _load_prior_pages(video_id) prior_page_ids = _load_prior_pages(video_id)
for page_data in synthesized_pages: for page_data in synthesized_pages:
page_moment_indices = getattr(page_data, "moment_indices", None) or []
existing = None existing = None
# First: check by slug (most specific match) # First: check by slug (most specific match)
@@ -1202,7 +1231,6 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
pages_created += 1 pages_created += 1
# Link moments to the technique page using moment_indices # Link moments to the technique page using moment_indices
page_moment_indices = getattr(page_data, "moment_indices", None) or []
if page_moment_indices: if page_moment_indices:
# LLM specified which moments belong to this page # LLM specified which moments belong to this page