fix: Variable ordering bug and stage 5 truncation recovery
Two fixes: 1. page_moment_indices was referenced before assignment in the page persist loop — moved assignment to top of loop body. This caused "cannot access local variable" errors on every stage 5 run. 2. Stage 5 now catches LLMTruncationError and splits the chunk in half for retry, instead of blindly retrying the same oversized prompt. This handles categories where synthesis output exceeds the model context window. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c0df8a7908
commit
8272da430b
1 changed files with 41 additions and 13 deletions
|
|
@ -1049,19 +1049,47 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
|
|||
for _, cls_info in moment_group:
|
||||
all_tags.update(cls_info.get("topic_tags", []))
|
||||
|
||||
# ── Chunked synthesis ────────────────────────────────────
|
||||
# ── Chunked synthesis with truncation recovery ─────────
|
||||
if len(moment_group) <= chunk_size:
|
||||
# Small group — single LLM call (original behavior)
|
||||
result = _synthesize_chunk(
|
||||
moment_group, category, creator_name,
|
||||
system_prompt, llm, model_override, modality, hard_limit,
|
||||
video_id, run_id, f"category:{category}",
|
||||
)
|
||||
synthesized_pages = list(result.pages)
|
||||
logger.info(
|
||||
"Stage 5: category '%s' — %d moments, %d page(s) from single call",
|
||||
category, len(moment_group), len(synthesized_pages),
|
||||
)
|
||||
# Small group — try single LLM call first
|
||||
try:
|
||||
result = _synthesize_chunk(
|
||||
moment_group, category, creator_name,
|
||||
system_prompt, llm, model_override, modality, hard_limit,
|
||||
video_id, run_id, f"category:{category}",
|
||||
)
|
||||
synthesized_pages = list(result.pages)
|
||||
logger.info(
|
||||
"Stage 5: category '%s' — %d moments, %d page(s) from single call",
|
||||
category, len(moment_group), len(synthesized_pages),
|
||||
)
|
||||
except LLMTruncationError:
|
||||
# Output too large for model context — split in half and retry
|
||||
logger.warning(
|
||||
"Stage 5: category '%s' truncated with %d moments. "
|
||||
"Splitting into smaller chunks and retrying.",
|
||||
category, len(moment_group),
|
||||
)
|
||||
half = max(1, len(moment_group) // 2)
|
||||
chunk_pages = []
|
||||
for sub_start in range(0, len(moment_group), half):
|
||||
sub_chunk = moment_group[sub_start:sub_start + half]
|
||||
sub_label = f"category:{category} recovery-chunk:{sub_start // half + 1}"
|
||||
sub_result = _synthesize_chunk(
|
||||
sub_chunk, category, creator_name,
|
||||
system_prompt, llm, model_override, modality, hard_limit,
|
||||
video_id, run_id, sub_label,
|
||||
)
|
||||
# Reindex moment_indices to global offsets
|
||||
for p in sub_result.pages:
|
||||
if p.moment_indices:
|
||||
p.moment_indices = [idx + sub_start for idx in p.moment_indices]
|
||||
chunk_pages.extend(sub_result.pages)
|
||||
synthesized_pages = chunk_pages
|
||||
logger.info(
|
||||
"Stage 5: category '%s' — %d page(s) from recovery split",
|
||||
category, len(synthesized_pages),
|
||||
)
|
||||
else:
|
||||
# Large group — split into chunks, synthesize each, then merge
|
||||
num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size
|
||||
|
|
@ -1108,6 +1136,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
|
|||
prior_page_ids = _load_prior_pages(video_id)
|
||||
|
||||
for page_data in synthesized_pages:
|
||||
page_moment_indices = getattr(page_data, "moment_indices", None) or []
|
||||
existing = None
|
||||
|
||||
# First: check by slug (most specific match)
|
||||
|
|
@ -1202,7 +1231,6 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
|
|||
pages_created += 1
|
||||
|
||||
# Link moments to the technique page using moment_indices
|
||||
page_moment_indices = getattr(page_data, "moment_indices", None) or []
|
||||
|
||||
if page_moment_indices:
|
||||
# LLM specified which moments belong to this page
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue