diff --git a/backend/config.py b/backend/config.py
index 17d5bc9..9d27fe9 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -45,7 +45,8 @@ class Settings(BaseSettings):
 
     # Dynamic token estimation — each stage calculates max_tokens from input size
     llm_max_tokens_hard_limit: int = 32768   # Hard ceiling for dynamic estimator
-    llm_max_tokens: int = 65536              # Fallback when no estimate is provided
+    llm_max_tokens: int = 32768              # Fallback when no estimate is provided (must not exceed hard_limit)
+    llm_temperature: float = 0.0             # Deterministic output for structured JSON extraction
 
     # Stage 5 synthesis chunking — max moments per LLM call before splitting
     synthesis_chunk_size: int = 30
diff --git a/backend/pipeline/llm_client.py b/backend/pipeline/llm_client.py
index 5ec49e1..6642a55 100644
--- a/backend/pipeline/llm_client.py
+++ b/backend/pipeline/llm_client.py
@@ -89,12 +89,16 @@ def strip_think_tags(text: str) -> str:
 # ── Token estimation ─────────────────────────────────────────────────────────
 
 # Stage-specific output multipliers: estimated output tokens as a ratio of input tokens.
-# These are empirically tuned based on observed pipeline behavior.
+# Tuned from actual pipeline data (KCL Ep 31 audit, April 2026):
+#   stage2: actual completion/prompt tokens = 680/39312 = 0.017 → use 0.05 with buffer
+#   stage3: actual completion/prompt tokens ≈ 1000/7000 = 0.14 → use 0.3 with buffer
+#   stage4: actual completion/prompt tokens = 740/3736 = 0.20 → use 0.3 with buffer
+#   stage5: actual completion/prompt tokens ≈ 2500/7000 = 0.36 → use 0.8 with buffer
 _STAGE_OUTPUT_RATIOS: dict[str, float] = {
-    "stage2_segmentation": 0.6,    # Compact topic groups — smaller than input
-    "stage3_extraction": 2.0,      # Detailed moments with summaries — can well exceed input
-    "stage4_classification": 0.5,  # Index + category + tags per moment — small but varies
-    "stage5_synthesis": 2.5,       # Full prose technique pages — heaviest output
+    "stage2_segmentation": 0.05,   # Compact topic groups — much smaller than input
+    "stage3_extraction": 0.3,      # Key moments with summaries — moderate
+    "stage4_classification": 0.3,  # Tags + categories per moment
+    "stage5_synthesis": 0.8,       # Full prose technique pages — heaviest output
 }
 
 # Minimum floor so we never send a trivially small max_tokens
@@ -235,13 +239,15 @@ class LLMClient:
         primary_model = model_override or self.settings.llm_model
         fallback_model = self.settings.llm_fallback_model
         effective_max_tokens = max_tokens if max_tokens is not None else self.settings.llm_max_tokens
+        effective_temperature = self.settings.llm_temperature
 
         logger.info(
-            "LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d",
+            "LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d, temperature=%.1f",
             primary_model,
             modality,
             response_model.__name__ if response_model else None,
             effective_max_tokens,
+            effective_temperature,
         )
 
         # --- Try primary endpoint ---
@@ -250,6 +256,7 @@ class LLMClient:
                 model=primary_model,
                 messages=messages,
                 max_tokens=effective_max_tokens,
+                temperature=effective_temperature,
                 **kwargs,
             )
             raw = response.choices[0].message.content or ""
@@ -296,6 +303,7 @@ class LLMClient:
                 model=fallback_model,
                 messages=messages,
                 max_tokens=effective_max_tokens,
+                temperature=effective_temperature,
                 **kwargs,
             )
             raw = response.choices[0].message.content or ""
diff --git a/backend/pipeline/stages.py b/backend/pipeline/stages.py
index 4a4a788..db9d62f 100644
--- a/backend/pipeline/stages.py
+++ b/backend/pipeline/stages.py
@@ -185,6 +185,36 @@ def _make_llm_callback(
         )
     return callback
 
+
+def _build_request_params(
+    max_tokens: int,
+    model_override: str | None,
+    modality: str,
+    response_model: str,
+    hard_limit: int,
+) -> dict:
+    """Assemble the ``request_params`` payload recorded with pipeline events.
+
+    Values describing the LLM call go under ``api_params``; values used only
+    by our own estimator go under ``pipeline_config``, so debug JSON is clear.
+    """
+    cfg = get_settings()
+    # An explicit per-call override wins over the configured default model.
+    api_params = {
+        "max_tokens": max_tokens,
+        "model": model_override or cfg.llm_model,
+        "temperature": cfg.llm_temperature,
+        "response_format": "json_object" if modality == "chat" else "none (thinking mode)",
+    }
+    pipeline_config = {
+        "modality": modality,
+        "response_model": response_model,
+        "estimator_hard_limit": hard_limit,
+        "fallback_max_tokens": cfg.llm_max_tokens,
+    }
+    return {"api_params": api_params, "pipeline_config": pipeline_config}
+
+
 # ── Helpers ──────────────────────────────────────────────────────────────────
 
 _engine = None
@@ -376,7 +406,7 @@ def stage2_segmentation(self, video_id: str, run_id: str | None = None) -> str:
         hard_limit = get_settings().llm_max_tokens_hard_limit
         max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage2_segmentation", hard_limit=hard_limit)
         logger.info("Stage 2 using model=%s, modality=%s, max_tokens=%d", model_override or "default", modality, max_tokens)
-        _s2_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SegmentationResult", "hard_limit": hard_limit}
+        _s2_request_params = _build_request_params(max_tokens, model_override, modality, "SegmentationResult", hard_limit)
         raw = llm.complete(system_prompt, user_prompt, response_model=SegmentationResult, on_complete=_make_llm_callback(video_id, "stage2_segmentation", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, request_params=_s2_request_params),
                            modality=modality, model_override=model_override, max_tokens=max_tokens)
         result = _safe_parse_llm_response(raw, SegmentationResult, llm, system_prompt, user_prompt,
@@ -469,7 +499,7 @@ def stage3_extraction(self, video_id: str, run_id: str | None = None) -> str:
             )
 
             max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage3_extraction", hard_limit=hard_limit)
-            _s3_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ExtractionResult", "hard_limit": hard_limit}
+            _s3_request_params = _build_request_params(max_tokens, model_override, modality, "ExtractionResult", hard_limit)
             raw = llm.complete(system_prompt, user_prompt, response_model=ExtractionResult, on_complete=_make_llm_callback(video_id, "stage3_extraction", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=topic_label, request_params=_s3_request_params),
                                modality=modality, model_override=model_override, max_tokens=max_tokens)
             result = _safe_parse_llm_response(raw, ExtractionResult, llm, system_prompt, user_prompt,
@@ -567,7 +597,7 @@ def _classify_moment_batch(
             video_id, "stage4_classification",
             system_prompt=system_prompt, user_prompt=user_prompt,
             run_id=run_id, context_label=batch_label,
-            request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ClassificationResult", "hard_limit": hard_limit},
+            request_params=_build_request_params(max_tokens, model_override, modality, "ClassificationResult", hard_limit),
         ),
         modality=modality, model_override=model_override,
         max_tokens=max_tokens,
@@ -877,7 +907,7 @@ def _synthesize_chunk(
             video_id, "stage5_synthesis",
             system_prompt=system_prompt, user_prompt=user_prompt,
             run_id=run_id, context_label=chunk_label,
-            request_params={"max_tokens": estimated_input, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit},
+            request_params=_build_request_params(estimated_input, model_override, modality, "SynthesisResult", hard_limit),
         ),
         modality=modality, model_override=model_override, max_tokens=estimated_input,
     )
@@ -959,7 +989,7 @@ def _merge_pages_by_slug(
                 system_prompt=merge_system_prompt,
                 user_prompt=merge_user_prompt,
                 run_id=run_id, context_label=f"merge:{slug}",
-                request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit},
+                request_params=_build_request_params(max_tokens, model_override, modality, "SynthesisResult", hard_limit),
             ),
             modality=modality, model_override=model_override,
             max_tokens=max_tokens,