fix: static 96k max_tokens for all pipeline stages — dynamic estimator was truncating thinking model output
The dynamic token estimator calculated max_tokens from input size × stage ratio, which produced ~9k for stage 5 compose calls. Thinking models consume unpredictable budget for internal reasoning, leaving 0 visible output tokens. Changed: hard_limit 32768→96000, estimate_max_tokens now returns hard_limit directly.
This commit is contained in:
parent
ed3c09ab18
commit
906b6491fe
2 changed files with 11 additions and 34 deletions
|
|
@ -43,9 +43,9 @@ class Settings(BaseSettings):
|
||||||
llm_stage5_model: str | None = "fyn-llm-agent-think" # synthesis — reasoning
|
llm_stage5_model: str | None = "fyn-llm-agent-think" # synthesis — reasoning
|
||||||
llm_stage5_modality: str = "thinking"
|
llm_stage5_modality: str = "thinking"
|
||||||
|
|
||||||
# Dynamic token estimation — each stage calculates max_tokens from input size
|
# Token limits — static across all stages
|
||||||
llm_max_tokens_hard_limit: int = 32768 # Hard ceiling for dynamic estimator
|
llm_max_tokens_hard_limit: int = 96000 # Static ceiling applied to every stage (dynamic estimation removed)
|
||||||
llm_max_tokens: int = 32768 # Fallback when no estimate is provided (must not exceed hard_limit)
|
llm_max_tokens: int = 96000 # Fallback when no explicit max_tokens is passed (must not exceed hard_limit)
|
||||||
llm_temperature: float = 0.0 # Deterministic output for structured JSON extraction
|
llm_temperature: float = 0.0 # Deterministic output for structured JSON extraction
|
||||||
|
|
||||||
# Stage 5 synthesis chunking — max moments per LLM call before splitting
|
# Stage 5 synthesis chunking — max moments per LLM call before splitting
|
||||||
|
|
|
||||||
|
|
@ -121,43 +121,20 @@ def estimate_max_tokens(
|
||||||
stage: str | None = None,
|
stage: str | None = None,
|
||||||
hard_limit: int = 32768,
|
hard_limit: int = 32768,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Estimate the max_tokens parameter for an LLM call.
|
"""Return the hard_limit as max_tokens for all stages.
|
||||||
|
|
||||||
Calculates expected output size based on input size and stage-specific
|
Previously used dynamic estimation based on input size and stage-specific
|
||||||
multipliers. The result is clamped between _MIN_MAX_TOKENS and hard_limit.
|
multipliers, but thinking models consume unpredictable token budgets for
|
||||||
|
internal reasoning. A static ceiling avoids truncation errors.
|
||||||
|
|
||||||
Parameters
|
The hard_limit value comes from Settings.llm_max_tokens_hard_limit (96000).
|
||||||
----------
|
|
||||||
system_prompt:
|
|
||||||
The system prompt text.
|
|
||||||
user_prompt:
|
|
||||||
The user prompt text (transcript, moments, etc.).
|
|
||||||
stage:
|
|
||||||
Pipeline stage name (e.g. "stage3_extraction"). If None or unknown,
|
|
||||||
uses a default 1.0x multiplier.
|
|
||||||
hard_limit:
|
|
||||||
Absolute ceiling — never exceed this value.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
int
|
|
||||||
Estimated max_tokens value to pass to the LLM API.
|
|
||||||
"""
|
"""
|
||||||
input_tokens = estimate_tokens(system_prompt) + estimate_tokens(user_prompt)
|
input_tokens = estimate_tokens(system_prompt) + estimate_tokens(user_prompt)
|
||||||
ratio = _STAGE_OUTPUT_RATIOS.get(stage or "", 1.0)
|
|
||||||
estimated_output = int(input_tokens * ratio)
|
|
||||||
|
|
||||||
# Add a 50% buffer for JSON overhead and variability
|
|
||||||
estimated_output = int(estimated_output * 1.5)
|
|
||||||
|
|
||||||
# Clamp to [_MIN_MAX_TOKENS, hard_limit]
|
|
||||||
result = max(_MIN_MAX_TOKENS, min(estimated_output, hard_limit))
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Token estimate: input≈%d, stage=%s, ratio=%.2f, estimated_output=%d, max_tokens=%d (hard_limit=%d)",
|
"Token estimate: input≈%d, stage=%s, max_tokens=%d (static hard_limit)",
|
||||||
input_tokens, stage or "default", ratio, estimated_output, result, hard_limit,
|
input_tokens, stage or "default", hard_limit,
|
||||||
)
|
)
|
||||||
return result
|
return hard_limit
|
||||||
|
|
||||||
|
|
||||||
class LLMClient:
|
class LLMClient:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue