"""Synchronous embedding client using the OpenAI-compatible /v1/embeddings API. Uses ``openai.OpenAI`` (sync) since Celery tasks run synchronously. Handles connection failures gracefully — embedding is non-blocking for the pipeline. """ from __future__ import annotations import logging import openai from config import Settings logger = logging.getLogger(__name__) class EmbeddingClient: """Sync embedding client backed by an OpenAI-compatible /v1/embeddings endpoint.""" def __init__(self, settings: Settings) -> None: self.settings = settings self._client = openai.OpenAI( base_url=settings.embedding_api_url, api_key=settings.llm_api_key, ) def embed(self, texts: list[str]) -> list[list[float]]: """Generate embedding vectors for a batch of texts. Parameters ---------- texts: List of strings to embed. Returns ------- list[list[float]] Embedding vectors. Returns empty list on connection/timeout errors so the pipeline can continue without embeddings. """ if not texts: return [] try: response = self._client.embeddings.create( model=self.settings.embedding_model, input=texts, ) except (openai.APIConnectionError, openai.APITimeoutError) as exc: logger.warning( "Embedding API unavailable (%s: %s). Skipping %d texts.", type(exc).__name__, exc, len(texts), ) return [] except openai.APIError as exc: logger.warning( "Embedding API error (%s: %s). Skipping %d texts.", type(exc).__name__, exc, len(texts), ) return [] vectors = [item.embedding for item in response.data] # Validate dimensions expected_dim = self.settings.embedding_dimensions for i, vec in enumerate(vectors): if len(vec) != expected_dim: logger.warning( "Embedding dimension mismatch at index %d: expected %d, got %d. " "Returning empty list.", i, expected_dim, len(vec), ) return [] logger.info( "Generated %d embeddings (dim=%d) using model=%s", len(vectors), expected_dim, self.settings.embedding_model, ) return vectors