diff --git a/backend/config.py b/backend/config.py index 17d5bc9..9d27fe9 100644 --- a/backend/config.py +++ b/backend/config.py @@ -45,7 +45,8 @@ class Settings(BaseSettings): # Dynamic token estimation — each stage calculates max_tokens from input size llm_max_tokens_hard_limit: int = 32768 # Hard ceiling for dynamic estimator - llm_max_tokens: int = 65536 # Fallback when no estimate is provided + llm_max_tokens: int = 32768 # Fallback when no estimate is provided (must not exceed hard_limit) + llm_temperature: float = 0.0 # Deterministic output for structured JSON extraction # Stage 5 synthesis chunking — max moments per LLM call before splitting synthesis_chunk_size: int = 30 diff --git a/backend/pipeline/llm_client.py b/backend/pipeline/llm_client.py index 5ec49e1..6642a55 100644 --- a/backend/pipeline/llm_client.py +++ b/backend/pipeline/llm_client.py @@ -89,12 +89,16 @@ def strip_think_tags(text: str) -> str: # ── Token estimation ───────────────────────────────────────────────────────── # Stage-specific output multipliers: estimated output tokens as a ratio of input tokens. -# These are empirically tuned based on observed pipeline behavior. 
+# Tuned from actual pipeline data (KCL Ep 31 audit, April 2026): +# stage2: actual compl/prompt = 680/39312 = 0.017 → use 0.05 with buffer +# stage3: actual compl/prompt ≈ 1000/7000 = 0.14 → use 0.3 with buffer +# stage4: actual compl/prompt = 740/3736 = 0.20 → use 0.3 with buffer +# stage5: actual compl/prompt ≈ 2500/7000 = 0.36 → use 0.8 with buffer _STAGE_OUTPUT_RATIOS: dict[str, float] = { - "stage2_segmentation": 0.6, # Compact topic groups — smaller than input - "stage3_extraction": 2.0, # Detailed moments with summaries — can well exceed input - "stage4_classification": 0.5, # Index + category + tags per moment — small but varies - "stage5_synthesis": 2.5, # Full prose technique pages — heaviest output + "stage2_segmentation": 0.05, # Compact topic groups — much smaller than input + "stage3_extraction": 0.3, # Key moments with summaries — moderate + "stage4_classification": 0.3, # Tags + categories per moment + "stage5_synthesis": 0.8, # Full prose technique pages — heaviest output } # Minimum floor so we never send a trivially small max_tokens @@ -235,13 +239,15 @@ class LLMClient: primary_model = model_override or self.settings.llm_model fallback_model = self.settings.llm_fallback_model effective_max_tokens = max_tokens if max_tokens is not None else self.settings.llm_max_tokens + effective_temperature = self.settings.llm_temperature logger.info( - "LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d", + "LLM request: model=%s, modality=%s, response_model=%s, max_tokens=%d, temperature=%.1f", primary_model, modality, response_model.__name__ if response_model else None, effective_max_tokens, + effective_temperature, ) # --- Try primary endpoint --- @@ -250,6 +256,7 @@ class LLMClient: model=primary_model, messages=messages, max_tokens=effective_max_tokens, + temperature=effective_temperature, **kwargs, ) raw = response.choices[0].message.content or "" @@ -296,6 +303,7 @@ class LLMClient: model=fallback_model, messages=messages, 
def _build_request_params(
    max_tokens: int,
    model_override: str | None,
    modality: str,
    response_model: str,
    hard_limit: int,
) -> dict:
    """Assemble the ``request_params`` payload recorded with a pipeline LLM event.

    The payload is split into two sections so the debug JSON is unambiguous
    about which values describe the request and which are estimator
    bookkeeping:

    * ``api_params`` -- values describing the LLM request itself.
    * ``pipeline_config`` -- internal settings used only by our estimator.

    Args:
        max_tokens: Token budget chosen for this call.
        model_override: Explicit model name; when falsy, the configured
            ``llm_model`` setting is reported instead.
        modality: Request modality. ``"chat"`` is reported as JSON-object
            output; any other value is reported as thinking mode.
        response_model: Name of the schema the response is parsed into.
        hard_limit: Ceiling passed to the dynamic token estimator.

    Returns:
        A dict holding the ``api_params`` and ``pipeline_config`` sub-dicts.
    """
    cfg = get_settings()

    # Human-readable label only; this function does not build the request.
    if modality == "chat":
        response_format_label = "json_object"
    else:
        response_format_label = "none (thinking mode)"

    api_section = {
        "max_tokens": max_tokens,
        # Same fallback the client applies when no override is supplied.
        "model": model_override or cfg.llm_model,
        "temperature": cfg.llm_temperature,
        "response_format": response_format_label,
    }
    config_section = {
        "modality": modality,
        "response_model": response_model,
        "estimator_hard_limit": hard_limit,
        "fallback_max_tokens": cfg.llm_max_tokens,
    }

    return {
        "api_params": api_section,
        "pipeline_config": config_section,
    }
response_model=SegmentationResult, on_complete=_make_llm_callback(video_id, "stage2_segmentation", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, request_params=_s2_request_params), modality=modality, model_override=model_override, max_tokens=max_tokens) result = _safe_parse_llm_response(raw, SegmentationResult, llm, system_prompt, user_prompt, @@ -469,7 +499,7 @@ def stage3_extraction(self, video_id: str, run_id: str | None = None) -> str: ) max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage3_extraction", hard_limit=hard_limit) - _s3_request_params = {"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ExtractionResult", "hard_limit": hard_limit} + _s3_request_params = _build_request_params(max_tokens, model_override, modality, "ExtractionResult", hard_limit) raw = llm.complete(system_prompt, user_prompt, response_model=ExtractionResult, on_complete=_make_llm_callback(video_id, "stage3_extraction", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=topic_label, request_params=_s3_request_params), modality=modality, model_override=model_override, max_tokens=max_tokens) result = _safe_parse_llm_response(raw, ExtractionResult, llm, system_prompt, user_prompt, @@ -567,7 +597,7 @@ def _classify_moment_batch( video_id, "stage4_classification", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=batch_label, - request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "ClassificationResult", "hard_limit": hard_limit}, + request_params=_build_request_params(max_tokens, model_override, modality, "ClassificationResult", hard_limit), ), modality=modality, model_override=model_override, max_tokens=max_tokens, @@ -877,7 +907,7 @@ def _synthesize_chunk( video_id, "stage5_synthesis", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, 
context_label=chunk_label, - request_params={"max_tokens": estimated_input, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit}, + request_params=_build_request_params(estimated_input, model_override, modality, "SynthesisResult", hard_limit), ), modality=modality, model_override=model_override, max_tokens=estimated_input, ) @@ -959,7 +989,7 @@ def _merge_pages_by_slug( system_prompt=merge_system_prompt, user_prompt=merge_user_prompt, run_id=run_id, context_label=f"merge:{slug}", - request_params={"max_tokens": max_tokens, "model_override": model_override, "modality": modality, "response_model": "SynthesisResult", "hard_limit": hard_limit}, + request_params=_build_request_params(max_tokens, model_override, modality, "SynthesisResult", hard_limit), ), modality=modality, model_override=model_override, max_tokens=max_tokens,