From 899ab742a8cf6f3a97c80ddfc104be1bd5b56475 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sat, 4 Apr 2026 14:31:28 +0000 Subject: [PATCH] =?UTF-8?q?test:=20Added=20automatic=20primary=E2=86=92fal?= =?UTF-8?q?lback=20LLM=20endpoint=20switching=20in=20ChatSe=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/chat_service.py" - "backend/tests/test_chat.py" - "docker-compose.yml" GSD-Task: S08/T01 --- .gsd/milestones/M025/M025-ROADMAP.md | 2 +- .../milestones/M025/slices/S07/S07-SUMMARY.md | 85 +++++++++++++ .gsd/milestones/M025/slices/S07/S07-UAT.md | 53 ++++++++ .../M025/slices/S07/tasks/T02-VERIFY.json | 16 +++ .gsd/milestones/M025/slices/S08/S08-PLAN.md | 32 ++++- .../M025/slices/S08/S08-RESEARCH.md | 96 ++++++++++++++ .../M025/slices/S08/tasks/T01-PLAN.md | 39 ++++++ .../M025/slices/S08/tasks/T01-SUMMARY.md | 79 ++++++++++++ .../M025/slices/S08/tasks/T02-PLAN.md | 32 +++++ backend/chat_service.py | 47 ++++++- backend/tests/test_chat.py | 118 ++++++++++++++++++ docker-compose.yml | 2 + 12 files changed, 597 insertions(+), 4 deletions(-) create mode 100644 .gsd/milestones/M025/slices/S07/S07-SUMMARY.md create mode 100644 .gsd/milestones/M025/slices/S07/S07-UAT.md create mode 100644 .gsd/milestones/M025/slices/S07/tasks/T02-VERIFY.json create mode 100644 .gsd/milestones/M025/slices/S08/S08-RESEARCH.md create mode 100644 .gsd/milestones/M025/slices/S08/tasks/T01-PLAN.md create mode 100644 .gsd/milestones/M025/slices/S08/tasks/T01-SUMMARY.md create mode 100644 .gsd/milestones/M025/slices/S08/tasks/T02-PLAN.md diff --git a/.gsd/milestones/M025/M025-ROADMAP.md b/.gsd/milestones/M025/M025-ROADMAP.md index 156e3ca..1dbf6be 100644 --- a/.gsd/milestones/M025/M025-ROADMAP.md +++ b/.gsd/milestones/M025/M025-ROADMAP.md @@ -12,7 +12,7 @@ Production hardening, mobile polish, creator onboarding, and formal validation. 
| S04 | [B] Rate Limiting + Cost Management | low | — | ✅ | Chat requests limited per-user and per-creator. Token usage dashboard in admin. | | S05 | [B] AI Transparency Page | low | — | ✅ | Creator sees all entities, relationships, and technique pages derived from their content | | S06 | [B] Graph Backend Evaluation | low | — | ✅ | Benchmark report: NetworkX vs Neo4j at current and projected entity counts | -| S07 | [A] Data Export (GDPR-Style) | medium | — | ⬜ | Creator downloads a ZIP with all derived content, entities, and relationships | +| S07 | [A] Data Export (GDPR-Style) | medium | — | ✅ | Creator downloads a ZIP with all derived content, entities, and relationships | | S08 | [B] Load Testing + Fallback Resilience | medium | — | ⬜ | 10 concurrent chat sessions maintain acceptable latency. DGX down → Ollama fallback works. | | S09 | [B] Prompt Optimization Pass | low | — | ⬜ | Chat quality reviewed across creators. Personality fidelity assessed. | | S10 | Requirement Validation (R015, R037-R041) | low | — | ⬜ | R015, R037, R038, R039, R041 formally validated and signed off | diff --git a/.gsd/milestones/M025/slices/S07/S07-SUMMARY.md b/.gsd/milestones/M025/slices/S07/S07-SUMMARY.md new file mode 100644 index 0000000..5d0225a --- /dev/null +++ b/.gsd/milestones/M025/slices/S07/S07-SUMMARY.md @@ -0,0 +1,85 @@ +--- +id: S07 +parent: M025 +milestone: M025 +provides: + - GET /creator/export endpoint returning ZIP archive of all creator-owned data + - Export My Data button on CreatorDashboard +requires: + [] +affects: + [] +key_files: + - backend/routers/creator_dashboard.py + - backend/tests/test_export.py + - frontend/src/pages/CreatorDashboard.tsx + - frontend/src/pages/CreatorDashboard.module.css + - frontend/src/api/creator-dashboard.ts +key_decisions: + - In-memory ZIP via io.BytesIO — per-creator datasets are small enough that disk streaming isn't needed + - Column introspection via __table__.columns for serialization — adapts automatically to schema 
changes + - Blob download via hidden anchor + URL.createObjectURL — standard browser download pattern +patterns_established: + - Authenticated blob download pattern: fetch with Bearer token → response.blob() → object URL → hidden anchor click → URL.revokeObjectURL +observability_surfaces: + - Structured logging on export start (creator_id) and completion (file count, approximate size) +drill_down_paths: + - .gsd/milestones/M025/slices/S07/tasks/T01-SUMMARY.md + - .gsd/milestones/M025/slices/S07/tasks/T02-SUMMARY.md +duration: "" +verification_result: passed +completed_at: 2026-04-04T14:20:43.400Z +blocker_discovered: false +--- + +# S07: [A] Data Export (GDPR-Style) + +**Creator can download a ZIP archive of all their derived content, entities, and relationships via an authenticated endpoint, with a one-click button on the dashboard.** + +## What Happened + +Added a GDPR-style data export feature spanning backend and frontend. The backend endpoint `GET /creator/export` queries 12 creator-owned tables (creators, source_videos, key_moments, technique_pages, technique_page_versions, related_technique_links, video_consents, consent_audit_log, posts, post_attachments, highlight_candidates, generated_shorts), serializes each to JSON with UUID/datetime handling via `default=str`, and packages them into an in-memory ZIP archive with `export_metadata.json` containing timestamp and creator_id. Related technique links include both directions (outgoing and incoming). The endpoint reuses the established auth pattern from the transparency endpoint. + +The frontend adds an "Export My Data" button to CreatorDashboard.tsx with loading spinner during download, inline error display on failure, and browser download via hidden anchor + object URL on success. The `exportCreatorData()` function lives in `creator-dashboard.ts` alongside other dashboard API functions. 
+ +9 standalone ASGI tests cover ZIP validity, JSON content correctness, UUID/datetime serialization, cross-references, metadata fields, auth requirement (401), and creator-link requirement (404). + +## Verification + +Backend: `cd backend && python -m pytest tests/test_export.py -v` — 9/9 pass (ZIP structure, JSON validity, serialization, auth, 404). Frontend: `cd frontend && npm run build` — clean build, 0 errors. + +## Requirements Advanced + +None. + +## Requirements Validated + +None. + +## New Requirements Surfaced + +None. + +## Requirements Invalidated or Re-scoped + +None. + +## Deviations + +ZIP contains 13 files (12 data tables + metadata) rather than the 10 mentioned in the plan's must-haves. The plan description already referenced all 12 tables; the must-have count was understated. Export function placed in creator-dashboard.ts rather than a separate creator.ts — keeps dashboard API functions co-located. + +## Known Limitations + +Binary attachments (uploaded files, generated short videos) are not included in the export — only metadata. The export_metadata.json notes this. Filename uses creator_id rather than slug. + +## Follow-ups + +None. 
+ +## Files Created/Modified + +- `backend/routers/creator_dashboard.py` — Added GET /creator/export endpoint with ZIP archive generation for 12 tables +- `backend/tests/test_export.py` — 9 standalone ASGI tests for export endpoint +- `frontend/src/pages/CreatorDashboard.tsx` — Added Export My Data button with loading/error states +- `frontend/src/pages/CreatorDashboard.module.css` — Export button styling matching dashboard design +- `frontend/src/api/creator-dashboard.ts` — Added exportCreatorData() blob download function diff --git a/.gsd/milestones/M025/slices/S07/S07-UAT.md b/.gsd/milestones/M025/slices/S07/S07-UAT.md new file mode 100644 index 0000000..73dca03 --- /dev/null +++ b/.gsd/milestones/M025/slices/S07/S07-UAT.md @@ -0,0 +1,53 @@ +# S07: [A] Data Export (GDPR-Style) — UAT + +**Milestone:** M025 +**Written:** 2026-04-04T14:20:43.401Z + +## UAT: Data Export (GDPR-Style) + +### Preconditions +- Logged in as a user linked to a creator account +- Creator has at least one source video processed through the pipeline (producing key moments, technique pages, etc.) + +### Test 1: Export button visible on dashboard +1. Navigate to Creator Dashboard +2. **Expected:** "Export My Data" button is visible in the dashboard UI +3. Button has a download icon and matches dashboard styling + +### Test 2: Successful export download +1. Click "Export My Data" button +2. **Expected:** Button shows loading state (spinner, disabled) +3. Wait for download to complete +4. **Expected:** Browser downloads a file named `chrysopedia-export-{creator_id}.zip` +5. Open the ZIP file +6. **Expected:** Contains 13 files: `creators.json`, `source_videos.json`, `key_moments.json`, `technique_pages.json`, `technique_page_versions.json`, `related_technique_links.json`, `video_consents.json`, `consent_audit_log.json`, `posts.json`, `post_attachments.json`, `highlight_candidates.json`, `generated_shorts.json`, `export_metadata.json` +7. Open `export_metadata.json` +8. 
**Expected:** Contains `exported_at` (ISO timestamp), `creator_id`, and a note about binary attachments not being included + +### Test 3: JSON content validity +1. From the downloaded ZIP, open `creators.json` +2. **Expected:** Valid JSON array with one entry containing the creator's name, slug, and other fields +3. Open `technique_pages.json` +4. **Expected:** Valid JSON array. UUID fields are strings (not objects). Datetime fields are ISO-formatted strings. + +### Test 4: Related links include cross-references +1. Open `related_technique_links.json` from the ZIP +2. **Expected:** Includes links where this creator's technique pages are the source AND links where they are the target + +### Test 5: Auth required +1. Open browser dev tools, clear auth token +2. Try accessing `GET /api/v1/creator/export` directly +3. **Expected:** 401 Unauthorized response + +### Test 6: Non-creator user gets 404 +1. Log in as a user that is NOT linked to any creator +2. Navigate to `/api/v1/creator/export` +3. **Expected:** 404 response (no creator record found) + +### Test 7: Error state on failure +1. Simulate a backend failure (e.g., stop the API mid-request or use network throttling to cause timeout) +2. 
**Expected:** Button returns to normal state, inline error message displayed to user + +### Edge Cases +- **Empty creator (no videos):** Export should still succeed with empty arrays in each JSON file +- **Large dataset:** Export completes without timeout for creators with many videos/moments diff --git a/.gsd/milestones/M025/slices/S07/tasks/T02-VERIFY.json b/.gsd/milestones/M025/slices/S07/tasks/T02-VERIFY.json new file mode 100644 index 0000000..f57209d --- /dev/null +++ b/.gsd/milestones/M025/slices/S07/tasks/T02-VERIFY.json @@ -0,0 +1,16 @@ +{ + "schemaVersion": 1, + "taskId": "T02", + "unitId": "M025/S07/T02", + "timestamp": 1775312370686, + "passed": true, + "discoverySource": "task-plan", + "checks": [ + { + "command": "cd frontend", + "exitCode": 0, + "durationMs": 8, + "verdict": "pass" + } + ] +} diff --git a/.gsd/milestones/M025/slices/S08/S08-PLAN.md b/.gsd/milestones/M025/slices/S08/S08-PLAN.md index 231f56e..b1c6bb9 100644 --- a/.gsd/milestones/M025/slices/S08/S08-PLAN.md +++ b/.gsd/milestones/M025/slices/S08/S08-PLAN.md @@ -1,6 +1,36 @@ # S08: [B] Load Testing + Fallback Resilience -**Goal:** Load test concurrent chat sessions and verify fallback resilience +**Goal:** ChatService survives primary LLM endpoint failure via automatic fallback. Load test script proves 10 concurrent chat sessions maintain acceptable latency. **Demo:** After this: 10 concurrent chat sessions maintain acceptable latency. DGX down → Ollama fallback works. ## Tasks +- [x] **T01: Added automatic primary→fallback LLM endpoint switching in ChatService with two unit tests covering APIConnectionError and InternalServerError scenarios** — Add automatic fallback from primary to secondary LLM endpoint in ChatService, matching the pattern already used by the sync LLMClient in pipeline/llm_client.py. 
When the primary openai.AsyncOpenAI client fails with APIConnectionError, APITimeoutError, or InternalServerError during streaming, retry the entire create() call with a fallback client pointing at settings.llm_fallback_url + settings.llm_fallback_model. Add the fallback_used field to the SSE done event. Update docker-compose.yml to pass LLM_FALLBACK_URL=http://chrysopedia-ollama:11434/v1 to the API container. Write unit tests for both APIConnectionError and InternalServerError fallback scenarios. + +Steps: +1. Read `backend/chat_service.py` and `backend/pipeline/llm_client.py` to understand the existing fallback pattern. +2. In ChatService.__init__, create `self._fallback_openai = openai.AsyncOpenAI(base_url=settings.llm_fallback_url, api_key=settings.llm_api_key)`. +3. In stream_response(), wrap the `self._openai.chat.completions.create(...)` call and its async iteration in a try/except for `(openai.APIConnectionError, openai.APITimeoutError, openai.InternalServerError)`. On catch, log WARNING with `chat_llm_fallback` prefix including the error type and message, then retry the same create() call using `self._fallback_openai` and `self.settings.llm_fallback_model`. Track `fallback_used = True`. +4. Add `fallback_used` (bool) to the done event data dict: `{"cascade_tier": ..., "conversation_id": ..., "fallback_used": fallback_used}`. +5. Update the model name logged in the usage log call — when fallback is used, pass `self.settings.llm_fallback_model` instead of `self.settings.llm_model`. +6. In `docker-compose.yml`, add `LLM_FALLBACK_URL: http://chrysopedia-ollama:11434/v1` and `LLM_FALLBACK_MODEL: fyn-llm-agent-chat` to the chrysopedia-api environment block. +7. In `backend/tests/test_chat.py`, add two test functions: + - `test_chat_fallback_on_connection_error`: mock primary openai to raise `openai.APIConnectionError`, mock fallback openai to return streaming chunks. Assert SSE events include tokens and done event has `fallback_used: true`. 
+ - `test_chat_fallback_on_internal_server_error`: same but with `openai.InternalServerError`. +8. Run `cd backend && python -m pytest tests/test_chat.py -v -k fallback` — both tests pass. + - Estimate: 45m + - Files: backend/chat_service.py, backend/tests/test_chat.py, docker-compose.yml + - Verify: cd backend && python -m pytest tests/test_chat.py -v -k fallback +- [ ] **T02: Write async load test script for 10 concurrent chat sessions** — Create a standalone Python script that fires 10 concurrent chat requests to the SSE endpoint, parses streaming events to measure time-to-first-token (TTFT) and total response time, and reports p50/p95/max latency statistics. Uses httpx (already a project dependency) + asyncio. No external load testing tools needed. + +Steps: +1. Create `scripts/load_test_chat.py` with argparse accepting `--url` (default http://localhost:8096), `--concurrency` (default 10), `--query` (default 'What are common compression techniques?'). +2. Implement an async function `run_single_chat(client, url, query)` that: POSTs to `{url}/api/v1/chat` with `{"query": query}`, reads the SSE stream line-by-line, records timestamp of first `event: token` line (TTFT), records total time when stream ends, returns a result dict with ttft_ms, total_ms, token_count, error (if any). +3. Implement `run_load_test(url, concurrency, query)` that creates an httpx.AsyncClient with timeout=60s, fires `concurrency` concurrent `run_single_chat` calls via asyncio.gather, collects results. +4. Compute and print statistics: for both TTFT and total time, show min/p50/p95/max. Show error count. Show per-request summary table. +5. Add `--auth-token` optional flag for authenticated requests (sets Authorization header) to avoid IP rate limit (10/hour default). Document in script docstring that running 10 requests from one IP will hit the rate limit unless authenticated or rate limit is raised. +6. Add `--output` flag to write results as JSON to a file. +7. 
Validate the script runs: `python scripts/load_test_chat.py --help` exits 0. +8. Test SSE parsing logic with a small inline unit test or a `--dry-run` flag that uses a mock response. + - Estimate: 40m + - Files: scripts/load_test_chat.py + - Verify: python scripts/load_test_chat.py --help && echo 'Script OK' diff --git a/.gsd/milestones/M025/slices/S08/S08-RESEARCH.md b/.gsd/milestones/M025/slices/S08/S08-RESEARCH.md new file mode 100644 index 0000000..a4e0cb4 --- /dev/null +++ b/.gsd/milestones/M025/slices/S08/S08-RESEARCH.md @@ -0,0 +1,96 @@ +# S08 Research: Load Testing + Fallback Resilience + +## Summary + +Two independent deliverables: (1) a load test proving 10 concurrent chat sessions maintain acceptable latency, and (2) adding LLM fallback logic to `ChatService` so chat survives when the primary LLM endpoint (DGX/OpenWebUI) is down. The load test is straightforward Python scripting with `httpx` (already a dependency). The fallback is a targeted code change in one file. + +## Recommendation + +**Targeted research.** The pipeline's sync `LLMClient` already has working fallback logic — the async `ChatService` just needs the same pattern adapted for `openai.AsyncOpenAI`. Load testing uses `httpx` + `asyncio` against the live endpoint. No new libraries needed. + +## Implementation Landscape + +### 1. Fallback Resilience (the riskier piece — build first) + +**Current state:** `ChatService.__init__()` creates a single `openai.AsyncOpenAI` client pointing at `settings.llm_api_url`. When streaming fails, it yields `event: error` and returns. No retry, no fallback. + +**What exists in the pipeline:** `LLMClient` (sync, `backend/pipeline/llm_client.py`) creates two clients — `self._primary` and `self._fallback` — and catches `(openai.APIConnectionError, openai.APITimeoutError)` on the primary before retrying on the fallback. This is the exact pattern to replicate. 
+ +**Production config (from container env):** +- `LLM_API_URL=https://chat.forgetyour.name/api` (OpenWebUI → DGX backend) +- `LLM_FALLBACK_URL=https://chat.forgetyour.name/api` (currently same URL — needs changing to Ollama) +- `LLM_FALLBACK_MODEL=fyn-llm-agent-chat` + +**Key observation:** Both primary and fallback currently point to the same URL. For the fallback to be useful, the compose config needs `LLM_FALLBACK_URL=http://chrysopedia-ollama:11434/v1` and `LLM_FALLBACK_MODEL` set to a model that Ollama actually has loaded. This is a config change in docker-compose.yml or .env. + +**Implementation in `chat_service.py`:** +- Add `self._fallback_openai = openai.AsyncOpenAI(base_url=settings.llm_fallback_url, api_key=settings.llm_api_key)` +- In `stream_response()`, wrap the streaming `create()` call in a try/except for `(openai.APIConnectionError, openai.APITimeoutError, openai.InternalServerError)`. +- On catch, log a warning, then retry with `self._fallback_openai` and `self.settings.llm_fallback_model`. +- The `event: done` payload should include a `fallback_used: true` field so the frontend/logs can distinguish. +- **Important:** Also catch `openai.InternalServerError` — the current production failure is a 500 from OpenWebUI, not a connection error. + +**Files to modify:** +- `backend/chat_service.py` — add fallback client + retry logic in `stream_response()` +- `docker-compose.yml` — set `LLM_FALLBACK_URL` to Ollama endpoint + +**Verification:** +- Unit test: mock primary to raise `APIConnectionError`, assert fallback is called and SSE events still stream. +- Integration: with DGX endpoint unreachable, chat should still work via Ollama. + +### 2. Load Testing (10 concurrent chat sessions) + +**No load testing tools installed** on either aux or ub01. No need for k6/locust/wrk — `httpx` + `asyncio` is sufficient for 10 concurrent connections to a streaming SSE endpoint. 
+ +**Architecture:** +- Single Python script (`tests/load_test_chat.py` or `scripts/load_test_chat.py`) +- Uses `httpx.AsyncClient` with `stream=True` to POST to `/api/v1/chat` +- Fires 10 concurrent requests via `asyncio.gather()` +- Measures: time-to-first-token (TTFT), total response time, error rate +- Reports p50/p95/max latencies + +**Target endpoint:** `http://ub01:8096/api/v1/chat` (production, through nginx) + +**Rate limiting concern:** Default `rate_limit_ip_per_hour=10` means 10 concurrent requests from one IP will exhaust the limit. The load test needs to either: +- Temporarily increase the rate limit, or +- Use authenticated users (30/hour default), or +- Run from within Docker network (bypass nginx, hit API directly) + +**Recommended approach:** Run against `http://chrysopedia-api:8000/api/v1/chat` from inside the Docker network (via `docker exec` or a temporary container on the same network) to avoid nginx buffering artifacts. Alternatively, temporarily set `RATE_LIMIT_IP_PER_HOUR=100` for the test. + +**"Acceptable latency" target:** The slice says "maintain acceptable latency." R015 sets a 30-second retrieval target for search-to-read. Chat is not R015-scoped but a reasonable bar: TTFT < 5s, total completion < 30s for a typical query with 10 concurrent users. The key metric is degradation — does latency at 10 concurrent differ meaningfully from latency at 1? + +**SSE parsing:** The load test needs to parse SSE events from the stream to measure TTFT (time from request to first `event: token`). The format is `event: \ndata: \n\n`. + +**Bottleneck analysis:** +- Single uvicorn worker (no `--workers` flag in Dockerfile CMD) — all 10 requests share one event loop. Async FastAPI + async openai client should handle this fine, but if any blocking call exists, it will serialize. +- Embedding calls (via Ollama) for search context retrieval could be the bottleneck — Ollama processes sequentially. 
+- LLM streaming is the longest phase — 10 concurrent streams to OpenWebUI/DGX should be fine if the backend supports it. + +### Natural Task Decomposition + +1. **T01: Add fallback to ChatService** — modify `chat_service.py` to create a fallback async client and retry on primary failure. Add unit test. ~30min. +2. **T02: Configure Ollama fallback in deployment** — set `LLM_FALLBACK_URL` and `LLM_FALLBACK_MODEL` in docker-compose.yml. Verify Ollama has the model loaded. ~15min. +3. **T03: Write load test script** — Python asyncio script that fires 10 concurrent chat requests, parses SSE, reports TTFT/total latency/error metrics. ~30min. +4. **T04: Run load test + document results** — Execute against production, capture results, write brief report. ~20min. + +T01 and T02 can be done in parallel. T03 is independent. T04 depends on T01+T02+T03. + +## Constraints & Risks + +- **Ollama model availability:** Need to verify `chrysopedia-ollama` has `fyn-llm-agent-chat` or an equivalent model. If not, a model pull is needed first. +- **Rate limiter:** Load test will hit IP rate limits at default settings. Must plan around this. +- **Single worker:** The API runs a single uvicorn worker. This is fine for async I/O-bound work but any CPU-bound processing (JSON parsing, Pydantic validation) will serialize under load. +- **Current LLM endpoint is down:** `https://chat.forgetyour.name/api` returned 500 during research. This makes fallback resilience immediately relevant — it's a real production issue right now. 
+ +## Key Files + +| File | Role | +|------|------| +| `backend/chat_service.py` | Chat service — needs fallback logic | +| `backend/pipeline/llm_client.py` | Sync LLM client with working fallback pattern to replicate | +| `backend/config.py` | Settings — already has `llm_fallback_url` and `llm_fallback_model` | +| `backend/routers/chat.py` | Chat router — no changes needed | +| `backend/tests/test_chat.py` | Existing chat tests — add fallback test | +| `docker-compose.yml` | Deployment config — needs `LLM_FALLBACK_URL` env var | +| `docker/Dockerfile.api` | Single uvicorn worker — context for load test expectations | diff --git a/.gsd/milestones/M025/slices/S08/tasks/T01-PLAN.md b/.gsd/milestones/M025/slices/S08/tasks/T01-PLAN.md new file mode 100644 index 0000000..1dbd898 --- /dev/null +++ b/.gsd/milestones/M025/slices/S08/tasks/T01-PLAN.md @@ -0,0 +1,39 @@ +--- +estimated_steps: 12 +estimated_files: 3 +skills_used: [] +--- + +# T01: Add LLM fallback client to ChatService with unit tests + +Add automatic fallback from primary to secondary LLM endpoint in ChatService, matching the pattern already used by the sync LLMClient in pipeline/llm_client.py. When the primary openai.AsyncOpenAI client fails with APIConnectionError, APITimeoutError, or InternalServerError during streaming, retry the entire create() call with a fallback client pointing at settings.llm_fallback_url + settings.llm_fallback_model. Add the fallback_used field to the SSE done event. Update docker-compose.yml to pass LLM_FALLBACK_URL=http://chrysopedia-ollama:11434/v1 to the API container. Write unit tests for both APIConnectionError and InternalServerError fallback scenarios. + +Steps: +1. Read `backend/chat_service.py` and `backend/pipeline/llm_client.py` to understand the existing fallback pattern. +2. In ChatService.__init__, create `self._fallback_openai = openai.AsyncOpenAI(base_url=settings.llm_fallback_url, api_key=settings.llm_api_key)`. +3. 
In stream_response(), wrap the `self._openai.chat.completions.create(...)` call and its async iteration in a try/except for `(openai.APIConnectionError, openai.APITimeoutError, openai.InternalServerError)`. On catch, log WARNING with `chat_llm_fallback` prefix including the error type and message, then retry the same create() call using `self._fallback_openai` and `self.settings.llm_fallback_model`. Track `fallback_used = True`. +4. Add `fallback_used` (bool) to the done event data dict: `{"cascade_tier": ..., "conversation_id": ..., "fallback_used": fallback_used}`. +5. Update the model name logged in the usage log call — when fallback is used, pass `self.settings.llm_fallback_model` instead of `self.settings.llm_model`. +6. In `docker-compose.yml`, add `LLM_FALLBACK_URL: http://chrysopedia-ollama:11434/v1` and `LLM_FALLBACK_MODEL: fyn-llm-agent-chat` to the chrysopedia-api environment block. +7. In `backend/tests/test_chat.py`, add two test functions: + - `test_chat_fallback_on_connection_error`: mock primary openai to raise `openai.APIConnectionError`, mock fallback openai to return streaming chunks. Assert SSE events include tokens and done event has `fallback_used: true`. + - `test_chat_fallback_on_internal_server_error`: same but with `openai.InternalServerError`. +8. Run `cd backend && python -m pytest tests/test_chat.py -v -k fallback` — both tests pass. 
+ +## Inputs + +- ``backend/chat_service.py` — current ChatService with single openai client, no fallback` +- ``backend/pipeline/llm_client.py` — reference pattern for primary/fallback logic` +- ``backend/config.py` — Settings with llm_fallback_url and llm_fallback_model fields` +- ``backend/tests/test_chat.py` — existing chat tests with standalone ASGI client pattern` +- ``docker-compose.yml` — deployment config, needs LLM_FALLBACK_URL env var` + +## Expected Output + +- ``backend/chat_service.py` — ChatService with fallback AsyncOpenAI client and retry logic in stream_response` +- ``backend/tests/test_chat.py` — two new test functions for fallback on APIConnectionError and InternalServerError` +- ``docker-compose.yml` — LLM_FALLBACK_URL and LLM_FALLBACK_MODEL in chrysopedia-api environment` + +## Verification + +cd backend && python -m pytest tests/test_chat.py -v -k fallback diff --git a/.gsd/milestones/M025/slices/S08/tasks/T01-SUMMARY.md b/.gsd/milestones/M025/slices/S08/tasks/T01-SUMMARY.md new file mode 100644 index 0000000..11b9bd9 --- /dev/null +++ b/.gsd/milestones/M025/slices/S08/tasks/T01-SUMMARY.md @@ -0,0 +1,79 @@ +--- +id: T01 +parent: S08 +milestone: M025 +provides: [] +requires: [] +affects: [] +key_files: ["backend/chat_service.py", "backend/tests/test_chat.py", "docker-compose.yml"] +key_decisions: ["Catch APIConnectionError, APITimeoutError, and InternalServerError on primary create() then retry with fallback — matches sync LLMClient pattern"] +patterns_established: [] +drill_down_paths: [] +observability_surfaces: [] +duration: "" +verification_result: "Ran cd backend && python -m pytest tests/test_chat.py -v -k fallback — 5 passed. Ran full suite — 26/26 passed." 
+completed_at: 2026-04-04T14:31:10.052Z +blocker_discovered: false +--- + +# T01: Added automatic primary→fallback LLM endpoint switching in ChatService with two unit tests covering APIConnectionError and InternalServerError scenarios + +> Added automatic primary→fallback LLM endpoint switching in ChatService with two unit tests covering APIConnectionError and InternalServerError scenarios + +## What Happened +--- +id: T01 +parent: S08 +milestone: M025 +key_files: + - backend/chat_service.py + - backend/tests/test_chat.py + - docker-compose.yml +key_decisions: + - Catch APIConnectionError, APITimeoutError, and InternalServerError on primary create() then retry with fallback — matches sync LLMClient pattern +duration: "" +verification_result: passed +completed_at: 2026-04-04T14:31:10.053Z +blocker_discovered: false +--- + +# T01: Added automatic primary→fallback LLM endpoint switching in ChatService with two unit tests covering APIConnectionError and InternalServerError scenarios + +**Added automatic primary→fallback LLM endpoint switching in ChatService with two unit tests covering APIConnectionError and InternalServerError scenarios** + +## What Happened + +Added a _fallback_openai AsyncOpenAI client to ChatService.__init__ using settings.llm_fallback_url. Wrapped the primary streaming create() call in a try/except for (APIConnectionError, APITimeoutError, InternalServerError). On catch, logs WARNING with chat_llm_fallback prefix, resets accumulated response, and retries the entire streaming call using the fallback client and settings.llm_fallback_model. If fallback also fails, emits SSE error event. The fallback_used boolean is included in the done event and the usage log records the actual model used. Added LLM_FALLBACK_URL and LLM_FALLBACK_MODEL to docker-compose.yml API environment. Wrote two test functions with side_effect mock factory accounting for SearchService's AsyncOpenAI call ordering. 
+
+## Verification
+
+Ran cd backend && python -m pytest tests/test_chat.py -v -k fallback — 5 passed. Ran full suite — 26/26 passed.
+
+## Verification Evidence
+
+| # | Command | Exit Code | Verdict | Duration |
+|---|---------|-----------|---------|----------|
+| 1 | `cd backend && python -m pytest tests/test_chat.py -v -k fallback` | 0 | ✅ pass | 5600ms |
+| 2 | `cd backend && python -m pytest tests/test_chat.py -v` | 0 | ✅ pass | 4400ms |
+
+
+## Deviations
+
+Test mock factory uses call_count=2/3 instead of 1/2 because patching chat_service.openai.AsyncOpenAI intercepts SearchService's constructor call as well (shared module object).
+
+## Known Issues
+
+None.
+
+## Files Created/Modified
+
+- `backend/chat_service.py`
+- `backend/tests/test_chat.py`
+- `docker-compose.yml`
diff --git a/.gsd/milestones/M025/slices/S08/tasks/T02-PLAN.md b/.gsd/milestones/M025/slices/S08/tasks/T02-PLAN.md
new file mode 100644
index 0000000..b8de233
--- /dev/null
+++ b/.gsd/milestones/M025/slices/S08/tasks/T02-PLAN.md
@@ -0,0 +1,32 @@
+---
+estimated_steps: 10
+estimated_files: 1
+skills_used: []
+---
+
+# T02: Write async load test script for 10 concurrent chat sessions
+
+Create a standalone Python script that fires 10 concurrent chat requests to the SSE endpoint, parses streaming events to measure time-to-first-token (TTFT) and total response time, and reports p50/p95/max latency statistics. Uses httpx (already a project dependency) + asyncio. No external load testing tools needed.
+
+Steps:
+1. Create `scripts/load_test_chat.py` with argparse accepting `--url` (default http://localhost:8096), `--concurrency` (default 10), `--query` (default 'What are common compression techniques?').
+2. Implement an async function `run_single_chat(client, url, query)` that: POSTs to `{url}/api/v1/chat` with `{"query": query}`, reads the SSE stream line-by-line, records timestamp of first `event: token` line (TTFT), records total time when stream ends, returns a result dict with ttft_ms, total_ms, token_count, error (if any).
+3. Implement `run_load_test(url, concurrency, query)` that creates an httpx.AsyncClient with timeout=60s, fires `concurrency` concurrent `run_single_chat` calls via asyncio.gather, collects results.
+4. Compute and print statistics: for both TTFT and total time, show min/p50/p95/max. Show error count. Show per-request summary table.
+5. Add `--auth-token` optional flag for authenticated requests (sets Authorization header) to avoid IP rate limit (10/hour default). Document in script docstring that running 10 requests from one IP will hit the rate limit unless authenticated or rate limit is raised.
+6. Add `--output` flag to write results as JSON to a file.
+7. Validate the script runs: `python scripts/load_test_chat.py --help` exits 0.
+8. Test SSE parsing logic with a small inline unit test or a `--dry-run` flag that uses a mock response.
+ +## Inputs + +- ``backend/routers/chat.py` — chat endpoint contract (POST /api/v1/chat, SSE response format)` +- ``backend/chat_service.py` — SSE event protocol (sources, token, done, error)` + +## Expected Output + +- ``scripts/load_test_chat.py` — standalone async load test script with SSE parsing, latency measurement, and statistics reporting` + +## Verification + +python scripts/load_test_chat.py --help && echo 'Script OK' diff --git a/backend/chat_service.py b/backend/chat_service.py index 6e27a93..ca68013 100644 --- a/backend/chat_service.py +++ b/backend/chat_service.py @@ -59,6 +59,10 @@ class ChatService: base_url=settings.llm_api_url, api_key=settings.llm_api_key, ) + self._fallback_openai = openai.AsyncOpenAI( + base_url=settings.llm_fallback_url, + api_key=settings.llm_api_key, + ) self._redis = redis async def _load_history(self, conversation_id: str) -> list[dict[str, str]]: @@ -244,6 +248,7 @@ class ChatService: accumulated_response = "" usage_data: dict[str, int] | None = None + fallback_used = False try: stream = await self._openai.chat.completions.create( @@ -269,6 +274,44 @@ class ChatService: accumulated_response += text yield _sse("token", text) + except (openai.APIConnectionError, openai.APITimeoutError, openai.InternalServerError) as exc: + logger.warning( + "chat_llm_fallback primary failed (%s: %s), retrying with fallback at %s", + type(exc).__name__, exc, self.settings.llm_fallback_url, + ) + fallback_used = True + accumulated_response = "" + usage_data = None + + try: + stream = await self._fallback_openai.chat.completions.create( + model=self.settings.llm_fallback_model, + messages=messages, + stream=True, + stream_options={"include_usage": True}, + temperature=temperature, + max_tokens=2048, + ) + + async for chunk in stream: + if hasattr(chunk, "usage") and chunk.usage is not None: + usage_data = { + "prompt_tokens": chunk.usage.prompt_tokens or 0, + "completion_tokens": chunk.usage.completion_tokens or 0, + "total_tokens": 
chunk.usage.total_tokens or 0, + } + choice = chunk.choices[0] if chunk.choices else None + if choice and choice.delta and choice.delta.content: + text = choice.delta.content + accumulated_response += text + yield _sse("token", text) + + except Exception: + tb = traceback.format_exc() + logger.error("chat_llm_error fallback also failed query=%r cid=%s\n%s", query, conversation_id, tb) + yield _sse("error", {"message": "LLM generation failed"}) + return + except Exception: tb = traceback.format_exc() logger.error("chat_llm_error query=%r cid=%s\n%s", query, conversation_id, tb) @@ -301,7 +344,7 @@ class ChatService: query=query, usage=usage_data, cascade_tier=cascade_tier, - model=self.settings.llm_model, + model=self.settings.llm_fallback_model if fallback_used else self.settings.llm_model, latency_ms=latency_ms, ) @@ -311,7 +354,7 @@ class ChatService: query, creator, cascade_tier, len(sources), latency_ms, conversation_id, usage_data.get("total_tokens", 0), ) - yield _sse("done", {"cascade_tier": cascade_tier, "conversation_id": conversation_id}) + yield _sse("done", {"cascade_tier": cascade_tier, "conversation_id": conversation_id, "fallback_used": fallback_used}) # ── Helpers ────────────────────────────────────────────────────────────────── diff --git a/backend/tests/test_chat.py b/backend/tests/test_chat.py index 16aeb0b..b257bd2 100644 --- a/backend/tests/test_chat.py +++ b/backend/tests/test_chat.py @@ -20,6 +20,7 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest import pytest_asyncio +import openai from httpx import ASGITransport, AsyncClient # Ensure backend/ is on sys.path @@ -958,3 +959,120 @@ async def test_personality_weight_string_returns_422(chat_client): json={"query": "test", "personality_weight": "high"}, ) assert resp.status_code == 422 + + +# ── LLM fallback tests ────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_chat_fallback_on_connection_error(chat_client): + """When primary 
LLM raises APIConnectionError, fallback client serves the response.""" + search_result = _fake_search_result() + + # Primary client raises on create() + mock_primary = MagicMock() + mock_primary.chat.completions.create = AsyncMock( + side_effect=openai.APIConnectionError(request=MagicMock()), + ) + + # Fallback client succeeds + mock_fallback = MagicMock() + mock_fallback.chat.completions.create = AsyncMock( + return_value=_mock_openai_stream(["fallback ", "answer"]), + ) + + # AsyncOpenAI is called 3 times in ChatService.__init__: + # 1. SearchService (irrelevant, search is mocked) + # 2. self._openai (primary) + # 3. self._fallback_openai (fallback) + call_count = 0 + + def _make_client(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 2: + return mock_primary + if call_count == 3: + return mock_fallback + return MagicMock() + + with ( + patch("chat_service.SearchService.search", new_callable=AsyncMock, return_value=search_result), + patch("chat_service.openai.AsyncOpenAI", side_effect=_make_client), + ): + resp = await chat_client.post("/api/v1/chat", json={"query": "test fallback"}) + + assert resp.status_code == 200 + events = _parse_sse(resp.text) + event_types = [e["event"] for e in events] + + assert "sources" in event_types + assert "token" in event_types + assert "done" in event_types + assert "error" not in event_types + + # Verify tokens came from fallback + token_texts = [e["data"] for e in events if e["event"] == "token"] + combined = "".join(token_texts) + assert "fallback answer" in combined + + # Done event should have fallback_used=True + done_data = next(e for e in events if e["event"] == "done")["data"] + assert done_data["fallback_used"] is True + + +@pytest.mark.asyncio +async def test_chat_fallback_on_internal_server_error(chat_client): + """When primary LLM raises InternalServerError, fallback client serves the response.""" + search_result = _fake_search_result() + + # Primary client raises InternalServerError on create() 
+ mock_primary = MagicMock() + mock_primary.chat.completions.create = AsyncMock( + side_effect=openai.InternalServerError( + message="GPU OOM", + response=MagicMock(status_code=500), + body=None, + ), + ) + + # Fallback client succeeds + mock_fallback = MagicMock() + mock_fallback.chat.completions.create = AsyncMock( + return_value=_mock_openai_stream(["recovered ", "response"]), + ) + + call_count = 0 + + def _make_client(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 2: + return mock_primary + if call_count == 3: + return mock_fallback + return MagicMock() + + with ( + patch("chat_service.SearchService.search", new_callable=AsyncMock, return_value=search_result), + patch("chat_service.openai.AsyncOpenAI", side_effect=_make_client), + ): + resp = await chat_client.post("/api/v1/chat", json={"query": "test ise fallback"}) + + assert resp.status_code == 200 + events = _parse_sse(resp.text) + event_types = [e["event"] for e in events] + + assert "sources" in event_types + assert "token" in event_types + assert "done" in event_types + assert "error" not in event_types + + # Verify tokens from fallback + token_texts = [e["data"] for e in events if e["event"] == "token"] + combined = "".join(token_texts) + assert "recovered response" in combined + + # Done event should have fallback_used=True + done_data = next(e for e in events if e["event"] == "done")["data"] + assert done_data["fallback_used"] is True diff --git a/docker-compose.yml b/docker-compose.yml index c349d97..41e4f4a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -121,6 +121,8 @@ services: REDIS_URL: redis://chrysopedia-redis:6379/0 QDRANT_URL: http://chrysopedia-qdrant:6333 EMBEDDING_API_URL: http://chrysopedia-ollama:11434/v1 + LLM_FALLBACK_URL: http://chrysopedia-ollama:11434/v1 + LLM_FALLBACK_MODEL: fyn-llm-agent-chat PROMPTS_PATH: /prompts volumes: - /vmPool/r/services/chrysopedia_data:/data