fix: Inflate LLM token estimates and forward max_tokens on retry
Stage 4 classification output was being truncated (finish=length) because the 0.15x output ratio underestimated token needs. This commit inflates all stage output ratios, bumps the buffer from 20% to 50%, raises the max_tokens floor from 2048 to 4096, and fixes _safe_parse_llm_response to forward max_tokens on retry instead of falling back to the 65k default.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0817e6e957
commit
5984129e25
2 changed files with 13 additions and 11 deletions
|
|
@ -59,14 +59,14 @@ def strip_think_tags(text: str) -> str:
|
||||||
# Stage-specific output multipliers: estimated output tokens as a ratio of input tokens.
|
# Stage-specific output multipliers: estimated output tokens as a ratio of input tokens.
|
||||||
# These are empirically tuned based on observed pipeline behavior.
|
# These are empirically tuned based on observed pipeline behavior.
|
||||||
_STAGE_OUTPUT_RATIOS: dict[str, float] = {
|
_STAGE_OUTPUT_RATIOS: dict[str, float] = {
|
||||||
"stage2_segmentation": 0.3, # Compact topic groups — much smaller than input
|
"stage2_segmentation": 0.6, # Compact topic groups — smaller than input
|
||||||
"stage3_extraction": 1.2, # Detailed moments with summaries — can exceed input
|
"stage3_extraction": 2.0, # Detailed moments with summaries — can well exceed input
|
||||||
"stage4_classification": 0.15, # Index + category + tags per moment — very compact
|
"stage4_classification": 0.5, # Index + category + tags per moment — small but varies
|
||||||
"stage5_synthesis": 1.5, # Full prose technique pages — heaviest output
|
"stage5_synthesis": 2.5, # Full prose technique pages — heaviest output
|
||||||
}
|
}
|
||||||
|
|
||||||
# Minimum floor so we never send a trivially small max_tokens
|
# Minimum floor so we never send a trivially small max_tokens
|
||||||
_MIN_MAX_TOKENS = 2048
|
_MIN_MAX_TOKENS = 4096
|
||||||
|
|
||||||
|
|
||||||
def estimate_tokens(text: str) -> int:
|
def estimate_tokens(text: str) -> int:
|
||||||
|
|
@ -111,8 +111,8 @@ def estimate_max_tokens(
|
||||||
ratio = _STAGE_OUTPUT_RATIOS.get(stage or "", 1.0)
|
ratio = _STAGE_OUTPUT_RATIOS.get(stage or "", 1.0)
|
||||||
estimated_output = int(input_tokens * ratio)
|
estimated_output = int(input_tokens * ratio)
|
||||||
|
|
||||||
# Add a 20% buffer for JSON overhead and variability
|
# Add a 50% buffer for JSON overhead and variability
|
||||||
estimated_output = int(estimated_output * 1.2)
|
estimated_output = int(estimated_output * 1.5)
|
||||||
|
|
||||||
# Clamp to [_MIN_MAX_TOKENS, hard_limit]
|
# Clamp to [_MIN_MAX_TOKENS, hard_limit]
|
||||||
result = max(_MIN_MAX_TOKENS, min(estimated_output, hard_limit))
|
result = max(_MIN_MAX_TOKENS, min(estimated_output, hard_limit))
|
||||||
|
|
|
||||||
|
|
@ -263,6 +263,7 @@ def _safe_parse_llm_response(
|
||||||
user_prompt: str,
|
user_prompt: str,
|
||||||
modality: str = "chat",
|
modality: str = "chat",
|
||||||
model_override: str | None = None,
|
model_override: str | None = None,
|
||||||
|
max_tokens: int | None = None,
|
||||||
):
|
):
|
||||||
"""Parse LLM response with one retry on failure.
|
"""Parse LLM response with one retry on failure.
|
||||||
|
|
||||||
|
|
@ -284,6 +285,7 @@ def _safe_parse_llm_response(
|
||||||
retry_raw = llm.complete(
|
retry_raw = llm.complete(
|
||||||
system_prompt, nudge_prompt, response_model=model_cls,
|
system_prompt, nudge_prompt, response_model=model_cls,
|
||||||
modality=modality, model_override=model_override,
|
modality=modality, model_override=model_override,
|
||||||
|
max_tokens=max_tokens,
|
||||||
)
|
)
|
||||||
return llm.parse_response(retry_raw, model_cls)
|
return llm.parse_response(retry_raw, model_cls)
|
||||||
|
|
||||||
|
|
@ -340,7 +342,7 @@ def stage2_segmentation(self, video_id: str, run_id: str | None = None) -> str:
|
||||||
raw = llm.complete(system_prompt, user_prompt, response_model=SegmentationResult, on_complete=_make_llm_callback(video_id, "stage2_segmentation", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id),
|
raw = llm.complete(system_prompt, user_prompt, response_model=SegmentationResult, on_complete=_make_llm_callback(video_id, "stage2_segmentation", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id),
|
||||||
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
result = _safe_parse_llm_response(raw, SegmentationResult, llm, system_prompt, user_prompt,
|
result = _safe_parse_llm_response(raw, SegmentationResult, llm, system_prompt, user_prompt,
|
||||||
modality=modality, model_override=model_override)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
|
|
||||||
# Update topic_label on each segment row
|
# Update topic_label on each segment row
|
||||||
seg_by_index = {s.segment_index: s for s in segments}
|
seg_by_index = {s.segment_index: s for s in segments}
|
||||||
|
|
@ -432,7 +434,7 @@ def stage3_extraction(self, video_id: str, run_id: str | None = None) -> str:
|
||||||
raw = llm.complete(system_prompt, user_prompt, response_model=ExtractionResult, on_complete=_make_llm_callback(video_id, "stage3_extraction", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=topic_label),
|
raw = llm.complete(system_prompt, user_prompt, response_model=ExtractionResult, on_complete=_make_llm_callback(video_id, "stage3_extraction", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=topic_label),
|
||||||
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
result = _safe_parse_llm_response(raw, ExtractionResult, llm, system_prompt, user_prompt,
|
result = _safe_parse_llm_response(raw, ExtractionResult, llm, system_prompt, user_prompt,
|
||||||
modality=modality, model_override=model_override)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
|
|
||||||
# Create KeyMoment rows
|
# Create KeyMoment rows
|
||||||
for moment in result.moments:
|
for moment in result.moments:
|
||||||
|
|
@ -541,7 +543,7 @@ def stage4_classification(self, video_id: str, run_id: str | None = None) -> str
|
||||||
raw = llm.complete(system_prompt, user_prompt, response_model=ClassificationResult, on_complete=_make_llm_callback(video_id, "stage4_classification", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id),
|
raw = llm.complete(system_prompt, user_prompt, response_model=ClassificationResult, on_complete=_make_llm_callback(video_id, "stage4_classification", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id),
|
||||||
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
result = _safe_parse_llm_response(raw, ClassificationResult, llm, system_prompt, user_prompt,
|
result = _safe_parse_llm_response(raw, ClassificationResult, llm, system_prompt, user_prompt,
|
||||||
modality=modality, model_override=model_override)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
|
|
||||||
# Apply content_type overrides and prepare classification data for stage 5
|
# Apply content_type overrides and prepare classification data for stage 5
|
||||||
classification_data = []
|
classification_data = []
|
||||||
|
|
@ -786,7 +788,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
|
||||||
raw = llm.complete(system_prompt, user_prompt, response_model=SynthesisResult, on_complete=_make_llm_callback(video_id, "stage5_synthesis", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=category),
|
raw = llm.complete(system_prompt, user_prompt, response_model=SynthesisResult, on_complete=_make_llm_callback(video_id, "stage5_synthesis", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=category),
|
||||||
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
result = _safe_parse_llm_response(raw, SynthesisResult, llm, system_prompt, user_prompt,
|
result = _safe_parse_llm_response(raw, SynthesisResult, llm, system_prompt, user_prompt,
|
||||||
modality=modality, model_override=model_override)
|
modality=modality, model_override=model_override, max_tokens=max_tokens)
|
||||||
|
|
||||||
# Load prior pages from this video (snapshot taken before pipeline reset)
|
# Load prior pages from this video (snapshot taken before pipeline reset)
|
||||||
prior_page_ids = _load_prior_pages(video_id)
|
prior_page_ids = _load_prior_pages(video_id)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue