From 906b6491fe00a711f4266dcb5895aeb5ae69aee4 Mon Sep 17 00:00:00 2001
From: jlightner
Date: Fri, 3 Apr 2026 08:18:28 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20static=2096k=20max=5Ftokens=20for=20all?=
 =?UTF-8?q?=20pipeline=20stages=20=E2=80=94=20dynamic=20estimator=20was=20?=
 =?UTF-8?q?truncating=20thinking=20model=20output?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dynamic token estimator calculated max_tokens from input size × stage
ratio, which produced ~9k for stage 5 compose calls. Thinking models
consume unpredictable budget for internal reasoning, leaving 0 visible
output tokens.

Changed: hard_limit 32768→96000, estimate_max_tokens now returns
hard_limit directly.
---
 backend/config.py              |  6 +++---
 backend/pipeline/llm_client.py | 39 +++++++---------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/backend/config.py b/backend/config.py
index 9d27fe9..de6e9f0 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -43,9 +43,9 @@ class Settings(BaseSettings):
     llm_stage5_model: str | None = "fyn-llm-agent-think"  # synthesis — reasoning
     llm_stage5_modality: str = "thinking"
 
-    # Dynamic token estimation — each stage calculates max_tokens from input size
-    llm_max_tokens_hard_limit: int = 32768  # Hard ceiling for dynamic estimator
-    llm_max_tokens: int = 32768  # Fallback when no estimate is provided (must not exceed hard_limit)
+    # Token limits — static across all stages
+    llm_max_tokens_hard_limit: int = 96000  # Hard ceiling for dynamic estimator
+    llm_max_tokens: int = 96000  # Fallback when no estimate is provided (must not exceed hard_limit)
     llm_temperature: float = 0.0  # Deterministic output for structured JSON extraction
 
     # Stage 5 synthesis chunking — max moments per LLM call before splitting
diff --git a/backend/pipeline/llm_client.py b/backend/pipeline/llm_client.py
index 6642a55..e07877a 100644
--- a/backend/pipeline/llm_client.py
+++ b/backend/pipeline/llm_client.py
@@ -121,43 +121,20 @@ def estimate_max_tokens(
     stage: str | None = None,
     hard_limit: int = 32768,
 ) -> int:
-    """Estimate the max_tokens parameter for an LLM call.
+    """Return the hard_limit as max_tokens for all stages.
 
-    Calculates expected output size based on input size and stage-specific
-    multipliers. The result is clamped between _MIN_MAX_TOKENS and hard_limit.
+    Previously used dynamic estimation based on input size and stage-specific
+    multipliers, but thinking models consume unpredictable token budgets for
+    internal reasoning. A static ceiling avoids truncation errors.
 
-    Parameters
-    ----------
-    system_prompt:
-        The system prompt text.
-    user_prompt:
-        The user prompt text (transcript, moments, etc.).
-    stage:
-        Pipeline stage name (e.g. "stage3_extraction"). If None or unknown,
-        uses a default 1.0x multiplier.
-    hard_limit:
-        Absolute ceiling — never exceed this value.
-
-    Returns
-    -------
-    int
-        Estimated max_tokens value to pass to the LLM API.
+    The hard_limit comes from Settings.llm_max_tokens_hard_limit (96000); NOTE: the signature default is still 32768.
     """
     input_tokens = estimate_tokens(system_prompt) + estimate_tokens(user_prompt)
-    ratio = _STAGE_OUTPUT_RATIOS.get(stage or "", 1.0)
-    estimated_output = int(input_tokens * ratio)
-
-    # Add a 50% buffer for JSON overhead and variability
-    estimated_output = int(estimated_output * 1.5)
-
-    # Clamp to [_MIN_MAX_TOKENS, hard_limit]
-    result = max(_MIN_MAX_TOKENS, min(estimated_output, hard_limit))
-
     logger.info(
-        "Token estimate: input≈%d, stage=%s, ratio=%.2f, estimated_output=%d, max_tokens=%d (hard_limit=%d)",
-        input_tokens, stage or "default", ratio, estimated_output, result, hard_limit,
+        "Token estimate: input≈%d, stage=%s, max_tokens=%d (static hard_limit)",
+        input_tokens, stage or "default", hard_limit,
     )
-    return result
+    return hard_limit
 
 
 class LLMClient: