"""Synchronous LLM client with primary/fallback endpoint logic. Uses the OpenAI-compatible API (works with Ollama, vLLM, OpenWebUI, etc.). Celery tasks run synchronously, so this uses ``openai.OpenAI`` (not Async). Supports two modalities: - **chat**: Standard JSON mode with ``response_format: {"type": "json_object"}`` - **thinking**: For reasoning models that emit ``...`` blocks before their answer. Skips ``response_format``, appends JSON instructions to the system prompt, and strips think tags from the response. """ from __future__ import annotations import logging import re from typing import TypeVar import openai from pydantic import BaseModel from config import Settings logger = logging.getLogger(__name__) T = TypeVar("T", bound=BaseModel) # ── Think-tag stripping ────────────────────────────────────────────────────── _THINK_PATTERN = re.compile(r".*?", re.DOTALL) def strip_think_tags(text: str) -> str: """Remove ``...`` blocks from LLM output. Thinking/reasoning models often prefix their JSON with a reasoning trace wrapped in ```` tags. This strips all such blocks (including multiline and multiple occurrences) and returns the cleaned text. Handles: - Single ``...`` block - Multiple blocks in one response - Multiline content inside think tags - Responses with no think tags (passthrough) - Empty input (passthrough) """ if not text: return text cleaned = _THINK_PATTERN.sub("", text) return cleaned.strip() class LLMClient: """Sync LLM client that tries a primary endpoint and falls back on failure.""" def __init__(self, settings: Settings) -> None: self.settings = settings self._primary = openai.OpenAI( base_url=settings.llm_api_url, api_key=settings.llm_api_key, ) self._fallback = openai.OpenAI( base_url=settings.llm_fallback_url, api_key=settings.llm_api_key, ) # ── Core completion ────────────────────────────────────────────────── def complete( self, system_prompt: str, user_prompt: str, response_model: type[BaseModel] | None = None, modality: str = "chat", model_override: str | None = None, ) -> str: """Send a chat completion request, falling back on connection/timeout errors. Parameters ---------- system_prompt: System message content. user_prompt: User message content. response_model: If provided and modality is "chat", ``response_format`` is set to ``{"type": "json_object"}``. For "thinking" modality, JSON instructions are appended to the system prompt instead. modality: Either "chat" (default) or "thinking". Thinking modality skips response_format and strips ```` tags from output. model_override: Model name to use instead of the default. If None, uses the configured default for the endpoint. Returns ------- str Raw completion text from the model (think tags stripped if thinking). """ kwargs: dict = {} effective_system = system_prompt if modality == "thinking": # Thinking models often don't support response_format: json_object. # Instead, append explicit JSON instructions to the system prompt. if response_model is not None: json_schema_hint = ( "\n\nYou MUST respond with ONLY valid JSON. " "No markdown code fences, no explanation, no preamble — " "just the raw JSON object." ) effective_system = system_prompt + json_schema_hint else: # Chat modality — use standard JSON mode if response_model is not None: kwargs["response_format"] = {"type": "json_object"} messages = [ {"role": "system", "content": effective_system}, {"role": "user", "content": user_prompt}, ] primary_model = model_override or self.settings.llm_model fallback_model = self.settings.llm_fallback_model logger.info( "LLM request: model=%s, modality=%s, response_model=%s", primary_model, modality, response_model.__name__ if response_model else None, ) # --- Try primary endpoint --- try: response = self._primary.chat.completions.create( model=primary_model, messages=messages, **kwargs, ) raw = response.choices[0].message.content or "" if modality == "thinking": raw = strip_think_tags(raw) return raw except (openai.APIConnectionError, openai.APITimeoutError) as exc: logger.warning( "Primary LLM endpoint failed (%s: %s), trying fallback at %s", type(exc).__name__, exc, self.settings.llm_fallback_url, ) # --- Try fallback endpoint --- try: response = self._fallback.chat.completions.create( model=fallback_model, messages=messages, **kwargs, ) raw = response.choices[0].message.content or "" if modality == "thinking": raw = strip_think_tags(raw) return raw except (openai.APIConnectionError, openai.APITimeoutError, openai.APIError) as exc: logger.error( "Fallback LLM endpoint also failed (%s: %s). Giving up.", type(exc).__name__, exc, ) raise # ── Response parsing ───────────────────────────────────────────────── def parse_response(self, text: str, model: type[T]) -> T: """Parse raw LLM output as JSON and validate against a Pydantic model. Parameters ---------- text: Raw JSON string from the LLM. model: Pydantic model class to validate against. Returns ------- T Validated Pydantic model instance. Raises ------ pydantic.ValidationError If the JSON doesn't match the schema. ValueError If the text is not valid JSON. """ try: return model.model_validate_json(text) except Exception: logger.error( "Failed to parse LLM response as %s. Response text: %.500s", model.__name__, text, ) raise