diff --git a/backend/pipeline/stages.py b/backend/pipeline/stages.py
index 1532c9f..740c179 100644
--- a/backend/pipeline/stages.py
+++ b/backend/pipeline/stages.py
@@ -1049,19 +1049,47 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
         for _, cls_info in moment_group:
             all_tags.update(cls_info.get("topic_tags", []))
 
-        # ── Chunked synthesis ────────────────────────────────────
+        # ── Chunked synthesis with truncation recovery ─────────
         if len(moment_group) <= chunk_size:
-            # Small group — single LLM call (original behavior)
-            result = _synthesize_chunk(
-                moment_group, category, creator_name,
-                system_prompt, llm, model_override, modality, hard_limit,
-                video_id, run_id, f"category:{category}",
-            )
-            synthesized_pages = list(result.pages)
-            logger.info(
-                "Stage 5: category '%s' — %d moments, %d page(s) from single call",
-                category, len(moment_group), len(synthesized_pages),
-            )
+            # Small group — try single LLM call first
+            try:
+                result = _synthesize_chunk(
+                    moment_group, category, creator_name,
+                    system_prompt, llm, model_override, modality, hard_limit,
+                    video_id, run_id, f"category:{category}",
+                )
+                synthesized_pages = list(result.pages)
+                logger.info(
+                    "Stage 5: category '%s' — %d moments, %d page(s) from single call",
+                    category, len(moment_group), len(synthesized_pages),
+                )
+            except LLMTruncationError:
+                # Output too large for model context — split in half and retry
+                logger.warning(
+                    "Stage 5: category '%s' truncated with %d moments. "
+                    "Splitting into smaller chunks and retrying.",
+                    category, len(moment_group),
+                )
+                half = max(1, len(moment_group) // 2)
+                chunk_pages = []
+                for sub_start in range(0, len(moment_group), half):
+                    sub_chunk = moment_group[sub_start:sub_start + half]
+                    sub_label = f"category:{category} recovery-chunk:{sub_start // half + 1}"
+                    sub_result = _synthesize_chunk(
+                        sub_chunk, category, creator_name,
+                        system_prompt, llm, model_override, modality, hard_limit,
+                        video_id, run_id, sub_label,
+                    )
+                    # Reindex moment_indices to global offsets
+                    for p in sub_result.pages:
+                        if p.moment_indices:
+                            p.moment_indices = [idx + sub_start for idx in p.moment_indices]
+                    chunk_pages.extend(sub_result.pages)
+                synthesized_pages = chunk_pages
+                logger.info(
+                    "Stage 5: category '%s' — %d page(s) from recovery split",
+                    category, len(synthesized_pages),
+                )
         else:
             # Large group — split into chunks, synthesize each, then merge
             num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size
@@ -1108,6 +1136,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
         prior_page_ids = _load_prior_pages(video_id)
 
         for page_data in synthesized_pages:
+            page_moment_indices = getattr(page_data, "moment_indices", None) or []
            existing = None
 
             # First: check by slug (most specific match)
@@ -1202,7 +1231,6 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
             pages_created += 1
 
             # Link moments to the technique page using moment_indices
-            page_moment_indices = getattr(page_data, "moment_indices", None) or []
             if page_moment_indices:
                 # LLM specified which moments belong to this page