chrysopedia/backend/pipeline/embedding_client.py
jlightner 4b0914b12b fix: restore complete project tree from ub01 canonical state
Auto-mode commit 7aa33cd accidentally deleted 78 files (14,814 lines) during M005
execution. Subsequent commits rebuilt some frontend files but backend/, alembic/,
tests/, whisper/, docker configs, and prompts were never restored in this repo.

This commit restores the full project tree by syncing from ub01's working directory,
which has all M001-M007 features running in production containers.

Restored: backend/ (config, models, routers, database, redis, search_service, worker),
alembic/ (6 migrations), docker/ (Dockerfiles, nginx, compose), prompts/ (4 stages),
tests/, whisper/, README.md, .env.example, chrysopedia-spec.md
2026-03-31 02:10:41 +00:00

88 lines
2.6 KiB
Python

"""Synchronous embedding client using the OpenAI-compatible /v1/embeddings API.
Uses ``openai.OpenAI`` (sync) since Celery tasks run synchronously.
Handles connection failures gracefully — embedding is non-blocking for the pipeline.
"""
from __future__ import annotations
import logging
import openai
from config import Settings
logger = logging.getLogger(__name__)
class EmbeddingClient:
    """Sync embedding client backed by an OpenAI-compatible /v1/embeddings endpoint.

    API failures and malformed responses are logged and reported to the caller
    as an empty list instead of being raised, so a pipeline step can continue
    without embeddings.
    """

    def __init__(self, settings: Settings) -> None:
        """Create the underlying ``openai.OpenAI`` client from *settings*.

        Parameters
        ----------
        settings:
            Provides ``embedding_api_url`` and ``llm_api_key`` for the client,
            and ``embedding_model`` / ``embedding_dimensions`` which are read
            later by :meth:`embed`.
        """
        self.settings = settings
        self._client = openai.OpenAI(
            base_url=settings.embedding_api_url,
            api_key=settings.llm_api_key,
        )

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embedding vectors for a batch of texts.

        Parameters
        ----------
        texts:
            List of strings to embed. An empty list short-circuits without
            calling the API.

        Returns
        -------
        list[list[float]]
            One embedding vector per input text, in response order. Returns an
            empty list when the API call fails, when the number of returned
            vectors differs from ``len(texts)``, or when any vector's length
            differs from ``settings.embedding_dimensions`` — so the pipeline
            can continue without embeddings.
        """
        if not texts:
            return []

        try:
            response = self._client.embeddings.create(
                model=self.settings.embedding_model,
                input=texts,
            )
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            # Transport-level failures get their own "unavailable" message.
            logger.warning(
                "Embedding API unavailable (%s: %s). Skipping %d texts.",
                type(exc).__name__,
                exc,
                len(texts),
            )
            return []
        except openai.APIError as exc:
            # Any other error reported by the openai client.
            logger.warning(
                "Embedding API error (%s: %s). Skipping %d texts.",
                type(exc).__name__,
                exc,
                len(texts),
            )
            return []

        vectors = [item.embedding for item in response.data]

        # A result that does not line up one-to-one with ``texts`` cannot be
        # matched back to its inputs, so treat it like any other bad response.
        if len(vectors) != len(texts):
            logger.warning(
                "Embedding count mismatch: expected %d, got %d. "
                "Returning empty list.",
                len(texts),
                len(vectors),
            )
            return []

        # Validate dimensions: one wrong-sized vector rejects the whole batch.
        expected_dim = self.settings.embedding_dimensions
        for i, vec in enumerate(vectors):
            if len(vec) != expected_dim:
                logger.warning(
                    "Embedding dimension mismatch at index %d: expected %d, got %d. "
                    "Returning empty list.",
                    i,
                    expected_dim,
                    len(vec),
                )
                return []

        logger.info(
            "Generated %d embeddings (dim=%d) using model=%s",
            len(vectors),
            expected_dim,
            self.settings.embedding_model,
        )
        return vectors