fix: Pipeline LLM audit — temperature=0, realistic token ratios, structured request_params

Audit findings & fixes:
- temperature was never set (API defaulted to 1.0) → now explicit 0.0 for deterministic JSON
- llm_max_tokens=65536 exceeded hard_limit=32768 → aligned to 32768
- Output ratio estimates were 5-30x too high (based on actual pipeline data):
  stage2: 0.6→0.05, stage3: 2.0→0.3, stage4: 0.5→0.3, stage5: 2.5→0.8
- request_params now structured as api_params (what's sent to LLM) vs pipeline_config
  (internal estimator settings) — no more ambiguous 'hard_limit' in request params
- temperature=0.0 sent on both primary and fallback endpoints
This commit is contained in:
jlightner 2026-04-01 07:20:09 +00:00
parent d58194ff96
commit fd1fd6c6f9
3 changed files with 51 additions and 12 deletions

View file

@@ -45,7 +45,8 @@ class Settings(BaseSettings):
# Dynamic token estimation — each stage calculates max_tokens from input size
llm_max_tokens_hard_limit: int = 32768 # Hard ceiling for dynamic estimator
llm_max_tokens: int = 65536 # Fallback when no estimate is provided
llm_max_tokens: int = 32768 # Fallback when no estimate is provided (must not exceed hard_limit)
llm_temperature: float = 0.0 # Deterministic output for structured JSON extraction
# Stage 5 synthesis chunking — max moments per LLM call before splitting
synthesis_chunk_size: int = 30

View file

@@ -89,12 +89,16 @@ def strip_think_tags(text: str) -> str:
# ── Token estimation ─────────────────────────────────────────────────────────
# Stage-specific output multipliers: estimated output tokens as a ratio of input tokens.
# These are empirically tuned based on observed pipeline behavior.
# Tuned from actual pipeline data (KCL Ep 31 audit, April 2026):
# stage2: actual compl/prompt = 680/39312 = 0.017 → use 0.05 with buffer
# stage3: actual compl/prompt ≈ 1000/7000 = 0.14 → use 0.3 with buffer
# stage4: actual compl/prompt = 740/3736 = 0.20 → use 0.3 with buffer
# stage5: actual compl/prompt ≈ 2500/7000 = 0.36 → use 0.8 with buffer
_STAGE_OUTPUT_RATIOS: dict[str, float] = {
"stage2_segmentation": 0.6, # Compact topic groups — smaller than input
"stage3_extraction": 2.0, # Detailed moments with summaries — can well exceed input
"stage4_classification": 0.5, # Index + category + tags per moment — small but varies
"stage5_synthesis": 2.5, # Full prose technique pages — heaviest output
"stage2_segmentation": 0.05, # Compact topic groups — much smaller than input
"stage3_extraction": 0.3, # Key moments with summaries — moderate
"stage4_classification": 0.3, # Tags + categories per moment
"stage5_synthesis": 0.8, # Full prose technique pages — heaviest output
}
# Minimum floor so we never send a trivially small max_tokens
@@ -235,13 +239,15 @@ class LLMClient:
primary_model = model_override or self.settings.llm_model
fallback_model = self.settings.llm_fallback_model
effective_max_tokens = max_tokens if max_tokens is not None else self.settings.llm_max_tokens
effective_temperature = self.settings.llm_temperature
logger.info(
"LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d",
"LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d, temperature=%.1f",
primary_model,
modality,
response_model.__name__ if response_model else None,
effective_max_tokens,
effective_temperature,
)
# --- Try primary endpoint ---
@@ -250,6 +256,7 @@
model=primary_model,
messages=messages,
max_tokens=effective_max_tokens,
temperature=effective_temperature,
**kwargs,
)
raw = response.choices[0].message.content or ""
@@ -296,6 +303,7 @@
model=fallback_model,
messages=messages,
max_tokens=effective_max_tokens,
temperature=effective_temperature,
**kwargs,
)
raw = response.choices[0].message.content or ""

View file

@@ -185,6 +185,36 @@ def _make_llm_callback(
)
return callback
def _build_request_params(
    max_tokens: int,
    model_override: str | None,
    modality: str,
    response_model: str,
    hard_limit: int,
) -> dict:
    """Build the request_params dict for pipeline event logging.

    Separates actual API params (sent to the LLM) from internal config
    (used by our estimator only) so the debug JSON is unambiguous.
    """
    settings = get_settings()
    # What actually goes over the wire to the LLM endpoint.
    if modality == "chat":
        response_format = "json_object"
    else:
        response_format = "none (thinking mode)"
    api_params = {
        "max_tokens": max_tokens,
        "model": model_override or settings.llm_model,
        "temperature": settings.llm_temperature,
        "response_format": response_format,
    }
    # Internal estimator/pipeline settings, logged for debugging only.
    pipeline_config = {
        "modality": modality,
        "response_model": response_model,
        "estimator_hard_limit": hard_limit,
        "fallback_max_tokens": settings.llm_max_tokens,
    }
    return {"api_params": api_params, "pipeline_config": pipeline_config}
# ── Helpers ──────────────────────────────────────────────────────────────────
_engine = None
@@ -376,7 +406,7 @@ def stage2_segmentation(self, video_id: str, run_id: str | None = None) -> str:
hard_limit = get_settings().llm_max_tokens_hard_limit
max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage2_segmentation", hard_limit=hard_limit)
logger.info("Stage 2 using model=%s, modality=%s, max_tokens=%d", model_override or "default", modality, max_tokens)
_s2_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SegmentationResult", "hard_limit": hard_limit}
_s2_request_params = _build_request_params(max_tokens, model_override, modality, "SegmentationResult", hard_limit)
raw = llm.complete(system_prompt, user_prompt, response_model=SegmentationResult, on_complete=_make_llm_callback(video_id, "stage2_segmentation", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, request_params=_s2_request_params),
modality=modality, model_override=model_override, max_tokens=max_tokens)
result = _safe_parse_llm_response(raw, SegmentationResult, llm, system_prompt, user_prompt,
@@ -469,7 +499,7 @@ def stage3_extraction(self, video_id: str, run_id: str | None = None) -> str:
)
max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage3_extraction", hard_limit=hard_limit)
_s3_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ExtractionResult", "hard_limit": hard_limit}
_s3_request_params = _build_request_params(max_tokens, model_override, modality, "ExtractionResult", hard_limit)
raw = llm.complete(system_prompt, user_prompt, response_model=ExtractionResult, on_complete=_make_llm_callback(video_id, "stage3_extraction", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=topic_label, request_params=_s3_request_params),
modality=modality, model_override=model_override, max_tokens=max_tokens)
result = _safe_parse_llm_response(raw, ExtractionResult, llm, system_prompt, user_prompt,
@@ -567,7 +597,7 @@ def _classify_moment_batch(
video_id, "stage4_classification",
system_prompt=system_prompt, user_prompt=user_prompt,
run_id=run_id, context_label=batch_label,
request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ClassificationResult", "hard_limit": hard_limit},
request_params=_build_request_params(max_tokens, model_override, modality, "ClassificationResult", hard_limit),
),
modality=modality, model_override=model_override,
max_tokens=max_tokens,
@@ -877,7 +907,7 @@ def _synthesize_chunk(
video_id, "stage5_synthesis",
system_prompt=system_prompt, user_prompt=user_prompt,
run_id=run_id, context_label=chunk_label,
request_params={"max_tokens": estimated_input, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit},
request_params=_build_request_params(estimated_input, model_override, modality, "SynthesisResult", hard_limit),
),
modality=modality, model_override=model_override, max_tokens=estimated_input,
)
@@ -959,7 +989,7 @@ def _merge_pages_by_slug(
system_prompt=merge_system_prompt,
user_prompt=merge_user_prompt,
run_id=run_id, context_label=f"merge:{slug}",
request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit},
request_params=_build_request_params(max_tokens, model_override, modality, "SynthesisResult", hard_limit),
),
modality=modality, model_override=model_override,
max_tokens=max_tokens,