Auto-mode commit 7aa33cd accidentally deleted 78 files (14,814 lines) during M005
execution. Subsequent commits rebuilt some frontend files but backend/, alembic/,
tests/, whisper/, docker configs, and prompts were never restored in this repo.
This commit restores the full project tree by syncing from ub01's working directory,
which has all M001-M007 features running in production containers.
Restored: backend/ (config, models, routers, database, redis, search_service, worker),
alembic/ (6 migrations), docker/ (Dockerfiles, nginx, compose), prompts/ (4 stages),
tests/, whisper/, README.md, .env.example, chrysopedia-spec.md
88 lines
2.6 KiB
Python
88 lines
2.6 KiB
Python
"""Synchronous embedding client using the OpenAI-compatible /v1/embeddings API.
|
|
|
|
Uses ``openai.OpenAI`` (sync) since Celery tasks run synchronously.
|
|
Handles connection failures gracefully — embedding is non-blocking for the pipeline.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
import openai
|
|
|
|
from config import Settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class EmbeddingClient:
    """Sync embedding client backed by an OpenAI-compatible /v1/embeddings endpoint.

    Failures never propagate out of :meth:`embed`: API errors and malformed
    responses are logged as warnings and reported to the caller as an empty
    list, so the calling pipeline can carry on without embeddings.
    """

    def __init__(self, settings: Settings) -> None:
        """Create the underlying ``openai.OpenAI`` client.

        Parameters
        ----------
        settings:
            Application settings. ``embedding_api_url`` and ``llm_api_key``
            are read here; ``embedding_model`` and ``embedding_dimensions``
            are read on every :meth:`embed` call.
        """
        self.settings = settings
        self._client = openai.OpenAI(
            base_url=settings.embedding_api_url,
            api_key=settings.llm_api_key,
        )

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embedding vectors for a batch of texts.

        Parameters
        ----------
        texts:
            List of strings to embed.

        Returns
        -------
        list[list[float]]
            One embedding vector per input text. Returns an empty list when
            ``texts`` is empty, on connection/timeout or other API errors,
            when the number of returned vectors differs from the number of
            inputs, or when any vector does not have
            ``settings.embedding_dimensions`` entries — so the pipeline can
            continue without embeddings.
        """
        if not texts:
            # Nothing to embed; skip the network round trip entirely.
            return []

        try:
            response = self._client.embeddings.create(
                model=self.settings.embedding_model,
                input=texts,
            )
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            logger.warning(
                "Embedding API unavailable (%s: %s). Skipping %d texts.",
                type(exc).__name__,
                exc,
                len(texts),
            )
            return []
        except openai.APIError as exc:
            logger.warning(
                "Embedding API error (%s: %s). Skipping %d texts.",
                type(exc).__name__,
                exc,
                len(texts),
            )
            return []

        vectors = [item.embedding for item in response.data]

        # Callers pair texts[i] with vectors[i]; a short or long response
        # would silently misalign them, so treat it like any other failure.
        if len(vectors) != len(texts):
            logger.warning(
                "Embedding count mismatch: sent %d texts, got %d vectors. "
                "Returning empty list.",
                len(texts),
                len(vectors),
            )
            return []

        # Validate dimensions: a single wrong-sized vector rejects the batch.
        expected_dim = self.settings.embedding_dimensions
        for i, vec in enumerate(vectors):
            if len(vec) != expected_dim:
                logger.warning(
                    "Embedding dimension mismatch at index %d: expected %d, got %d. "
                    "Returning empty list.",
                    i,
                    expected_dim,
                    len(vec),
                )
                return []

        logger.info(
            "Generated %d embeddings (dim=%d) using model=%s",
            len(vectors),
            expected_dim,
            self.settings.embedding_model,
        )
        return vectors