From cf759f37394f95525716654f6b1fbe04ba221d8c Mon Sep 17 00:00:00 2001
From: jlightner
Date: Mon, 30 Mar 2026 04:08:29 +0000
Subject: [PATCH] fix: Add max_tokens=16384 to LLM requests (OpenWebUI defaults
 to 1000, truncating pipeline JSON)

---
 .env.example                   | 3 +++
 backend/config.py              | 3 +++
 backend/pipeline/llm_client.py | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/.env.example b/.env.example
index ac7a180..1bc2026 100644
--- a/.env.example
+++ b/.env.example
@@ -29,6 +29,9 @@ LLM_FALLBACK_MODEL=fyn-qwen35-chat
 #LLM_STAGE5_MODEL=fyn-qwen35-thinking
 #LLM_STAGE5_MODALITY=thinking
 
+# Max tokens for LLM responses (OpenWebUI defaults to 1000 — pipeline needs much more)
+LLM_MAX_TOKENS=16384
+
 # Embedding endpoint (Ollama container in the compose stack)
 EMBEDDING_API_URL=http://chrysopedia-ollama:11434/v1
 EMBEDDING_MODEL=nomic-embed-text
diff --git a/backend/config.py b/backend/config.py
index 8a2b9a9..db37b87 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -43,6 +43,9 @@ class Settings(BaseSettings):
     llm_stage5_model: str | None = None  # synthesis — thinking model recommended
     llm_stage5_modality: str = "chat"
 
+    # Max tokens for LLM responses (OpenWebUI defaults to 1000 which truncates pipeline JSON)
+    llm_max_tokens: int = 16384
+
     # Embedding endpoint
     embedding_api_url: str = "http://localhost:11434/v1"
     embedding_model: str = "nomic-embed-text"
diff --git a/backend/pipeline/llm_client.py b/backend/pipeline/llm_client.py
index 3c6671a..3ecf1df 100644
--- a/backend/pipeline/llm_client.py
+++ b/backend/pipeline/llm_client.py
@@ -136,6 +136,7 @@ class LLMClient:
         response = self._primary.chat.completions.create(
             model=primary_model,
             messages=messages,
+            max_tokens=self.settings.llm_max_tokens,
             **kwargs,
         )
         raw = response.choices[0].message.content or ""
@@ -156,6 +157,7 @@
         response = self._fallback.chat.completions.create(
             model=fallback_model,
             messages=messages,
+            max_tokens=self.settings.llm_max_tokens,
             **kwargs,
         )
         raw = response.choices[0].message.content or ""