From 906b6491fe00a711f4266dcb5895aeb5ae69aee4 Mon Sep 17 00:00:00 2001
From: jlightner
Date: Fri, 3 Apr 2026 08:18:28 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20static=2096k=20max=5Ftokens=20for=20all?=
 =?UTF-8?q?=20pipeline=20stages=20=E2=80=94=20dynamic=20estimator=20was=20?=
 =?UTF-8?q?truncating=20thinking=20model=20output?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dynamic token estimator calculated max_tokens from input size × stage
ratio, which produced ~9k for stage 5 compose calls. Thinking models
consume unpredictable budget for internal reasoning, leaving 0 visible
output tokens.

Changed: hard_limit 32768→96000, estimate_max_tokens now returns
hard_limit directly.
---
 backend/config.py              |  6 +++---
 backend/pipeline/llm_client.py | 39 +++++++---------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/backend/config.py b/backend/config.py
index 9d27fe9..de6e9f0 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -43,9 +43,9 @@ class Settings(BaseSettings):
     llm_stage5_model: str | None = "fyn-llm-agent-think"  # synthesis — reasoning
     llm_stage5_modality: str = "thinking"
 
-    # Dynamic token estimation — each stage calculates max_tokens from input size
-    llm_max_tokens_hard_limit: int = 32768  # Hard ceiling for dynamic estimator
-    llm_max_tokens: int = 32768  # Fallback when no estimate is provided (must not exceed hard_limit)
+    # Token limits — static across all stages
+    llm_max_tokens_hard_limit: int = 96000  # Hard ceiling for dynamic estimator
+    llm_max_tokens: int = 96000  # Fallback when no estimate is provided (must not exceed hard_limit)
     llm_temperature: float = 0.0  # Deterministic output for structured JSON extraction
 
     # Stage 5 synthesis chunking — max moments per LLM call before splitting
diff --git a/backend/pipeline/llm_client.py b/backend/pipeline/llm_client.py
index 6642a55..e07877a 100644
--- a/backend/pipeline/llm_client.py
+++ b/backend/pipeline/llm_client.py
@@ -121,43 +121,20 @@ def estimate_max_tokens(
     stage: str | None = None,
     hard_limit: int = 32768,
 ) -> int:
-    """Estimate the max_tokens parameter for an LLM call.
+    """Return the hard_limit as max_tokens for all stages.
 
-    Calculates expected output size based on input size and stage-specific
-    multipliers. The result is clamped between _MIN_MAX_TOKENS and hard_limit.
+    Previously used dynamic estimation based on input size and stage-specific
+    multipliers, but thinking models consume unpredictable token budgets for
+    internal reasoning. A static ceiling avoids truncation errors.
 
-    Parameters
-    ----------
-    system_prompt:
-        The system prompt text.
-    user_prompt:
-        The user prompt text (transcript, moments, etc.).
-    stage:
-        Pipeline stage name (e.g. "stage3_extraction"). If None or unknown,
-        uses a default 1.0x multiplier.
-    hard_limit:
-        Absolute ceiling — never exceed this value.
-
-    Returns
-    -------
-    int
-        Estimated max_tokens value to pass to the LLM API.
+    The hard_limit comes from Settings.llm_max_tokens_hard_limit (96000); NOTE: the signature default is still 32768.
     """
     input_tokens = estimate_tokens(system_prompt) + estimate_tokens(user_prompt)
-    ratio = _STAGE_OUTPUT_RATIOS.get(stage or "", 1.0)
-    estimated_output = int(input_tokens * ratio)
-
-    # Add a 50% buffer for JSON overhead and variability
-    estimated_output = int(estimated_output * 1.5)
-
-    # Clamp to [_MIN_MAX_TOKENS, hard_limit]
-    result = max(_MIN_MAX_TOKENS, min(estimated_output, hard_limit))
-
     logger.info(
-        "Token estimate: input≈%d, stage=%s, ratio=%.2f, estimated_output=%d, max_tokens=%d (hard_limit=%d)",
-        input_tokens, stage or "default", ratio, estimated_output, result, hard_limit,
+        "Token estimate: input≈%d, stage=%s, max_tokens=%d (static hard_limit)",
+        input_tokens, stage or "default", hard_limit,
     )
-    return result
+    return hard_limit
 
 
 class LLMClient: