fix: Pipeline LLM audit — temperature=0, realistic token ratios, structured request_params

Audit findings & fixes:
- temperature was never set (API defaulted to 1.0) → now explicit 0.0 for deterministic JSON
- llm_max_tokens=65536 exceeded hard_limit=32768 → aligned to 32768
- Output ratio estimates were 5-30x too high (based on actual pipeline data):
  stage2: 0.6→0.05, stage3: 2.0→0.3, stage4: 0.5→0.3, stage5: 2.5→0.8
- request_params now structured as api_params (what's sent to LLM) vs pipeline_config
  (internal estimator settings) — no more ambiguous 'hard_limit' in request params
- temperature=0.0 sent on both primary and fallback endpoints
This commit is contained in:
jlightner 2026-04-01 07:20:09 +00:00
parent d58194ff96
commit fd1fd6c6f9
3 changed files with 51 additions and 12 deletions

View file

@@ -45,7 +45,8 @@ class Settings(BaseSettings):
# Dynamic token estimation — each stage calculates max_tokens from input size
llm_max_tokens_hard_limit: int = 32768 # Hard ceiling for dynamic estimator
llm_max_tokens: int = 65536 # Fallback when no estimate is provided
llm_max_tokens: int = 32768 # Fallback when no estimate is provided (must not exceed hard_limit)
llm_temperature: float = 0.0 # Deterministic output for structured JSON extraction
# Stage 5 synthesis chunking — max moments per LLM call before splitting
synthesis_chunk_size: int = 30

View file

@@ -89,12 +89,16 @@ def strip_think_tags(text: str) -> str:
# ── Token estimation ─────────────────────────────────────────────────────────
# Stage-specific output multipliers: estimated output tokens as a ratio of input tokens.
# These are empirically tuned based on observed pipeline behavior.
# Tuned from actual pipeline data (KCL Ep 31 audit, April 2026):
# stage2: actual compl/prompt = 680/39312 = 0.017 → use 0.05 with buffer
# stage3: actual compl/prompt ≈ 1000/7000 = 0.14 → use 0.3 with buffer
# stage4: actual compl/prompt = 740/3736 = 0.20 → use 0.3 with buffer
# stage5: actual compl/prompt ≈ 2500/7000 = 0.36 → use 0.8 with buffer
_STAGE_OUTPUT_RATIOS: dict[str, float] = {
"stage2_segmentation": 0.6, # Compact topic groups — smaller than input
"stage3_extraction": 2.0, # Detailed moments with summaries — can well exceed input
"stage4_classification": 0.5, # Index + category + tags per moment — small but varies
"stage5_synthesis": 2.5, # Full prose technique pages — heaviest output
"stage2_segmentation": 0.05, # Compact topic groups — much smaller than input
"stage3_extraction": 0.3, # Key moments with summaries — moderate
"stage4_classification": 0.3, # Tags + categories per moment
"stage5_synthesis": 0.8, # Full prose technique pages — heaviest output
}
# Minimum floor so we never send a trivially small max_tokens
@@ -235,13 +239,15 @@ class LLMClient:
primary_model = model_override or self.settings.llm_model
fallback_model = self.settings.llm_fallback_model
effective_max_tokens = max_tokens if max_tokens is not None else self.settings.llm_max_tokens
effective_temperature = self.settings.llm_temperature
logger.info(
"LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d",
"LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d, temperature=%.1f",
primary_model,
modality,
response_model.__name__ if response_model else None,
effective_max_tokens,
effective_temperature,
)
# --- Try primary endpoint ---
@@ -250,6 +256,7 @@
model=primary_model,
messages=messages,
max_tokens=effective_max_tokens,
temperature=effective_temperature,
**kwargs,
)
raw = response.choices[0].message.content or ""
@@ -296,6 +303,7 @@
model=fallback_model,
messages=messages,
max_tokens=effective_max_tokens,
temperature=effective_temperature,
**kwargs,
)
raw = response.choices[0].message.content or ""

View file

@@ -185,6 +185,36 @@ def _make_llm_callback(
)
return callback
def _build_request_params(
    max_tokens: int,
    model_override: str | None,
    modality: str,
    response_model: str,
    hard_limit: int,
) -> dict:
    """Build the request_params dict for pipeline event logging.

    Separates actual API params (sent to the LLM) from internal config
    (used by our estimator only) so the debug JSON is unambiguous.
    """
    settings = get_settings()
    # What actually goes over the wire to the LLM endpoint.
    if modality == "chat":
        response_format = "json_object"
    else:
        response_format = "none (thinking mode)"
    api_params = {
        "max_tokens": max_tokens,
        "model": model_override or settings.llm_model,
        "temperature": settings.llm_temperature,
        "response_format": response_format,
    }
    # Internal estimator/pipeline settings, logged for debugging only.
    pipeline_config = {
        "modality": modality,
        "response_model": response_model,
        "estimator_hard_limit": hard_limit,
        "fallback_max_tokens": settings.llm_max_tokens,
    }
    return {"api_params": api_params, "pipeline_config": pipeline_config}
# ── Helpers ──────────────────────────────────────────────────────────────────
_engine = None
@@ -376,7 +406,7 @@ def stage2_segmentation(self, video_id: str, run_id: str | None = None) -> str:
hard_limit = get_settings().llm_max_tokens_hard_limit
max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage2_segmentation", hard_limit=hard_limit)
logger.info("Stage 2 using model=%s, modality=%s, max_tokens=%d", model_override or "default", modality, max_tokens)
_s2_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SegmentationResult", "hard_limit": hard_limit}
_s2_request_params = _build_request_params(max_tokens, model_override, modality, "SegmentationResult", hard_limit)
raw = llm.complete(system_prompt, user_prompt, response_model=SegmentationResult, on_complete=_make_llm_callback(video_id, "stage2_segmentation", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, request_params=_s2_request_params),
modality=modality, model_override=model_override, max_tokens=max_tokens)
result = _safe_parse_llm_response(raw, SegmentationResult, llm, system_prompt, user_prompt,
@@ -469,7 +499,7 @@ def stage3_extraction(self, video_id: str, run_id: str | None = None) -> str:
)
max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage3_extraction", hard_limit=hard_limit)
_s3_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ExtractionResult", "hard_limit": hard_limit}
_s3_request_params = _build_request_params(max_tokens, model_override, modality, "ExtractionResult", hard_limit)
raw = llm.complete(system_prompt, user_prompt, response_model=ExtractionResult, on_complete=_make_llm_callback(video_id, "stage3_extraction", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=topic_label, request_params=_s3_request_params),
modality=modality, model_override=model_override, max_tokens=max_tokens)
result = _safe_parse_llm_response(raw, ExtractionResult, llm, system_prompt, user_prompt,
@@ -567,7 +597,7 @@ def _classify_moment_batch(
video_id, "stage4_classification",
system_prompt=system_prompt, user_prompt=user_prompt,
run_id=run_id, context_label=batch_label,
request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ClassificationResult", "hard_limit": hard_limit},
request_params=_build_request_params(max_tokens, model_override, modality, "ClassificationResult", hard_limit),
),
modality=modality, model_override=model_override,
max_tokens=max_tokens,
@@ -877,7 +907,7 @@ def _synthesize_chunk(
video_id, "stage5_synthesis",
system_prompt=system_prompt, user_prompt=user_prompt,
run_id=run_id, context_label=chunk_label,
request_params={"max_tokens": estimated_input, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit},
request_params=_build_request_params(estimated_input, model_override, modality, "SynthesisResult", hard_limit),
),
modality=modality, model_override=model_override, max_tokens=estimated_input,
)
@@ -959,7 +989,7 @@ def _merge_pages_by_slug(
system_prompt=merge_system_prompt,
user_prompt=merge_user_prompt,
run_id=run_id, context_label=f"merge:{slug}",
request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit},
request_params=_build_request_params(max_tokens, model_override, modality, "SynthesisResult", hard_limit),
),
modality=modality, model_override=model_override,
max_tokens=max_tokens,