feat: Built Redis sliding-window rate limiter, ChatUsageLog model with…

- "backend/rate_limiter.py" - "backend/models.py" - "backend/routers/chat.py" - "backend/chat_service.py" - "backend/config.py" - "alembic/versions/031_add_chat_usage_log.py" GSD-Task: S04/T01
2026-04-04 13:36:29 +00:00 · 2026-04-04 13:36:29 +00:00 · 638477cc8e
commit 638477cc8e
parent a0e228d5b4
6 changed files with 352 additions and 7 deletions
--- a/alembic/versions/031_add_chat_usage_log.py
+++ b/alembic/versions/031_add_chat_usage_log.py
@ -0,0 +1,40 @@
 """add_chat_usage_log
 Revision ID: 031_chat_usage_log
 Revises: 030_onboarding
 Create Date: 2026-04-04
 """
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects.postgresql import UUID
 # revision identifiers
 revision = "031_chat_usage_log"
 down_revision = "030_onboarding"
 branch_labels = None
 depends_on = None
 def upgrade() -> None:
    """Create the ``chat_usage_log`` table and the index on ``created_at``."""
    table_name = "chat_usage_log"

    # Column definitions are listed in the order they appear in the table.
    columns = [
        sa.Column("id", UUID(as_uuid=True), primary_key=True, server_default=sa.func.gen_random_uuid()),
        sa.Column("user_id", UUID(as_uuid=True), sa.ForeignKey("users.id", ondelete="SET NULL"), nullable=True),
        sa.Column("client_ip", sa.String(45), nullable=True),
        sa.Column("creator_slug", sa.String(255), nullable=True),
        sa.Column("query", sa.Text(), nullable=False),
        sa.Column("prompt_tokens", sa.Integer(), nullable=False, server_default="0"),
        sa.Column("completion_tokens", sa.Integer(), nullable=False, server_default="0"),
        sa.Column("total_tokens", sa.Integer(), nullable=False, server_default="0"),
        sa.Column("cascade_tier", sa.String(50), nullable=True),
        sa.Column("model", sa.String(100), nullable=True),
        sa.Column("latency_ms", sa.Float(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()),
    ]
    op.create_table(table_name, *columns)

    op.create_index("ix_chat_usage_log_created_at", table_name, ["created_at"])
 def downgrade() -> None:
    """Drop the ``created_at`` index first, then the ``chat_usage_log`` table."""
    table_name = "chat_usage_log"
    op.drop_index("ix_chat_usage_log_created_at", table_name=table_name)
    op.drop_table(table_name)
--- a/backend/chat_service.py
+++ b/backend/chat_service.py
@ -127,6 +127,46 @@ class ChatService:
        voice_block = _build_personality_block(creator_name, profile, weight)
        return system_prompt + "\n\n" + voice_block
    async def _log_usage(
        self,
        db: AsyncSession,
        user_id: Any | None,
        client_ip: str | None,
        creator_slug: str | None,
        query: str,
        usage: dict[str, int],
        cascade_tier: str,
        model: str,
        latency_ms: float,
    ) -> None:
        """Insert one ChatUsageLog row for a chat request.

        Non-fatal: the insert is awaited, but any failure is logged and
        swallowed so that usage accounting never breaks the chat response.

        Args:
            db: Session used to add and commit the row (rolled back on error).
            user_id: Authenticated user's id, or None for anonymous requests.
            client_ip: Caller's IP address as a string, or None.
            creator_slug: Creator filter applied to the request, or None.
            query: The user's query; only the first 2000 characters are stored.
            usage: Token counts under the keys ``prompt_tokens``,
                ``completion_tokens`` and ``total_tokens``; missing keys
                are stored as 0.
            cascade_tier: Value stored in the ``cascade_tier`` column.
            model: Name of the LLM model that served the request.
            latency_ms: Request latency in milliseconds.
        """
        try:
            from models import ChatUsageLog

            # client_ip can originate from a request header; keep it within
            # the 45-character column so an oversized value cannot make the
            # INSERT fail and lose the whole usage row.
            stored_ip = client_ip[:45] if client_ip else client_ip

            log_entry = ChatUsageLog(
                user_id=user_id,
                client_ip=stored_ip,
                creator_slug=creator_slug,
                query=query[:2000],  # truncate very long queries
                prompt_tokens=usage.get("prompt_tokens", 0),
                completion_tokens=usage.get("completion_tokens", 0),
                total_tokens=usage.get("total_tokens", 0),
                cascade_tier=cascade_tier,
                model=model,
                latency_ms=latency_ms,
            )
            db.add(log_entry)
            await db.commit()
        except Exception:
            logger.error(
                "chat_usage_log_insert_error user=%s ip=%s",
                user_id, client_ip, exc_info=True,
            )
            try:
                await db.rollback()
            except Exception:
                # Still best-effort, but leave a trace: a failed rollback
                # means the session may be unusable for later work.
                logger.warning(
                    "chat_usage_log_rollback_error user=%s ip=%s",
                    user_id, client_ip, exc_info=True,
                )
    async def stream_response(
        self,
        query: str,
@ -134,6 +174,8 @@ class ChatService:
        creator: str | None = None,
        conversation_id: str | None = None,
        personality_weight: float = 0.0,
        user_id: Any | None = None,
        client_ip: str | None = None,
    ) -> AsyncIterator[str]:
        """Yield SSE-formatted events for a chat query.
@ -201,17 +243,26 @@ class ChatService:
        messages.append({"role": "user", "content": query})
        accumulated_response = ""
        usage_data: dict[str, int] | None = None
        try:
            stream = await self._openai.chat.completions.create(
                model=self.settings.llm_model,
                messages=messages,
                stream=True,
                stream_options={"include_usage": True},
                temperature=temperature,
                max_tokens=2048,
            )
            async for chunk in stream:
                # The final chunk with stream_options carries usage in chunk.usage
                if hasattr(chunk, "usage") and chunk.usage is not None:
                    usage_data = {
                        "prompt_tokens": chunk.usage.prompt_tokens or 0,
                        "completion_tokens": chunk.usage.completion_tokens or 0,
                        "total_tokens": chunk.usage.total_tokens or 0,
                    }
                choice = chunk.choices[0] if chunk.choices else None
                if choice and choice.delta and choice.delta.content:
                    text = choice.delta.content
@ -227,11 +278,38 @@ class ChatService:
        # ── 4. Save conversation history ────────────────────────────────
        await self._save_history(conversation_id, history, query, accumulated_response)
-        # ── 5. Done event ───────────────────────────────────────────────
+        # ── 5. Log token usage ──────────────────────────────────────────
        latency_ms = (time.monotonic() - start) * 1000
        # Fallback: estimate tokens from character counts if stream_options not available
        if usage_data is None:
            prompt_chars = sum(len(m.get("content", "")) for m in messages)
            est_prompt = prompt_chars // 4
            est_completion = len(accumulated_response) // 4
            usage_data = {
                "prompt_tokens": est_prompt,
                "completion_tokens": est_completion,
                "total_tokens": est_prompt + est_completion,
            }
            logger.warning("chat_usage_estimated cid=%s (stream_options usage not available)", conversation_id)
        await self._log_usage(
            db=db,
            user_id=user_id,
            client_ip=client_ip,
            creator_slug=creator,
            query=query,
            usage=usage_data,
            cascade_tier=cascade_tier,
            model=self.settings.llm_model,
            latency_ms=latency_ms,
        )
        # ── 6. Done event ───────────────────────────────────────────────
        logger.info(
-            "chat_done query=%r creator=%r cascade_tier=%s source_count=%d latency_ms=%.1f cid=%s",
+            "chat_done query=%r creator=%r cascade_tier=%s source_count=%d latency_ms=%.1f cid=%s tokens=%d",
            query, creator, cascade_tier, len(sources), latency_ms, conversation_id,
            usage_data.get("total_tokens", 0),
        )
        yield _sse("done", {"cascade_tier": cascade_tier, "conversation_id": conversation_id})
--- a/backend/config.py
+++ b/backend/config.py
@ -91,6 +91,11 @@ class Settings(BaseSettings):
    smtp_from_address: str = ""
    smtp_tls: bool = True
    # Rate limiting (per hour)
    rate_limit_user_per_hour: int = 30
    rate_limit_ip_per_hour: int = 10
    rate_limit_creator_per_hour: int = 60
    # Git commit SHA (set at Docker build time or via env var)
    git_commit_sha: str = "unknown"
--- a/backend/models.py
+++ b/backend/models.py
@ -902,3 +902,31 @@ class GeneratedShort(Base):
    # relationships
    highlight_candidate: Mapped[HighlightCandidate] = sa_relationship()
 # ── Chat Usage Tracking ──────────────────────────────────────────────────────
 class ChatUsageLog(Base):
    """Per-request token usage log for chat completions.

    Append-only table — one row per chat request. Used for cost tracking,
    rate limit analytics, and the admin usage dashboard.

    Only ``query``, the three token counts and ``created_at`` are
    non-nullable; the identity and request-metadata columns are nullable.
    """
    __tablename__ = "chat_usage_log"
    id: Mapped[uuid.UUID] = _uuid_pk()
    # Nullable FK to users.id; ON DELETE SET NULL keeps the log row when the
    # referenced user is deleted.
    user_id: Mapped[uuid.UUID | None] = mapped_column(
        ForeignKey("users.id", ondelete="SET NULL"), nullable=True,
    )
    # NOTE(review): 45 chars is presumably sized for a textual IPv6 address — confirm.
    client_ip: Mapped[str | None] = mapped_column(String(45), nullable=True)
    creator_slug: Mapped[str | None] = mapped_column(String(255), nullable=True)
    query: Mapped[str] = mapped_column(Text, nullable=False)
    # Token counts; each defaults to 0 on the Python side when not supplied.
    prompt_tokens: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    completion_tokens: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    total_tokens: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    cascade_tier: Mapped[str | None] = mapped_column(String(50), nullable=True)
    model: Mapped[str | None] = mapped_column(String(100), nullable=True)
    latency_ms: Mapped[float | None] = mapped_column(Float, nullable=True)
    # Indexed; filled by the Python-side default (_now) or, failing that,
    # by the database server default (func.now()).
    created_at: Mapped[datetime] = mapped_column(
        default=_now, server_default=func.now(), index=True,
    )
--- a/backend/rate_limiter.py
+++ b/backend/rate_limiter.py
@ -0,0 +1,116 @@
 """Redis sliding-window rate limiter using sorted sets.
 Each rate limit key is a Redis sorted set where members are unique
 request identifiers (timestamps with microseconds) and scores are
 Unix timestamps. On each check, expired entries are pruned, the
 current request is added, and the count determines whether the
 request is allowed.
 Fail-open: If Redis is unavailable, requests are allowed through
 with a WARNING log.
 """
 from __future__ import annotations
 import logging
 import time
 from dataclasses import dataclass
 import redis.asyncio as aioredis
 logger = logging.getLogger("chrysopedia.rate_limiter")
 _KEY_PREFIX = "chrysopedia:ratelimit"
@dataclass
 class RateLimitResult:
    """Result of a rate limit check.

    Attributes:
        allowed: True when the request may proceed.
        remaining: Requests still available in the current window;
            0 when the request was rejected.
        retry_after: Seconds to wait before retrying; 0 when allowed.
    """
    allowed: bool
    remaining: int
    retry_after: int  # seconds until the window slides enough to allow a request; 0 if allowed
 class RateLimiter:
    """Sliding-window rate limiter backed by Redis sorted sets.

    Each bucket is a sorted set whose members are per-request identifiers
    and whose scores are the Unix timestamps of those requests.

    Usage::
        limiter = RateLimiter(redis)
        result = await limiter.check_rate_limit("user:abc123", limit=30, window_seconds=3600)
        if not result.allowed:
            return 429, result.retry_after
    """

    def __init__(self, redis: aioredis.Redis) -> None:
        """Store the Redis client used for all rate limit buckets."""
        self._redis = redis

    @staticmethod
    def key(scope: str, identifier: str) -> str:
        """Build a namespaced Redis key for a rate limit bucket."""
        return f"{_KEY_PREFIX}:{scope}:{identifier}"

    async def check_rate_limit(
        self,
        key: str,
        limit: int,
        window_seconds: int = 3600,
    ) -> RateLimitResult:
        """Check whether a request is within the rate limit and record it.

        The request is reserved and counted in a single MULTI/EXEC
        transaction, so concurrent callers cannot all observe a count
        below the limit and be admitted together:

        - ZREMRANGEBYSCORE prunes entries older than the window
        - ZADD tentatively records the current request
        - ZCARD counts entries, including the current one
        - EXPIRE refreshes the key TTL so idle buckets disappear

        If the count exceeds ``limit`` the tentative entry is removed
        again and the request is rejected, so rejected requests do not
        consume quota.

        Args:
            key: Redis key of the bucket (see :meth:`key`).
            limit: Maximum number of requests allowed per window.
            window_seconds: Length of the sliding window in seconds.

        Returns:
            A RateLimitResult with allowed/remaining/retry_after.
            On Redis errors, fails open (allowed=True, remaining=limit).
        """
        import uuid  # function-scope: only needed to build unique members

        now = time.time()
        window_start = now - window_seconds
        # A random suffix guarantees a distinct member per call; equal
        # members would overwrite each other in the sorted set and
        # under-count requests.
        member = f"{now}:{uuid.uuid4().hex}"

        try:
            pipe = self._redis.pipeline(transaction=True)
            pipe.zremrangebyscore(key, "-inf", window_start)
            pipe.zadd(key, {member: now})
            pipe.zcard(key)
            # TTL slightly longer than the window so the key auto-expires
            pipe.expire(key, window_seconds + 60)
            results = await pipe.execute()

            count_with_current: int = results[2]

            if count_with_current > limit:
                # Over limit — withdraw the reservation and read the oldest
                # surviving entry to work out when a slot frees up.
                # NOTE: until this second transaction runs, the tentative
                # entry is visible to other callers; the only effect is a
                # briefly conservative count.
                undo = self._redis.pipeline(transaction=True)
                undo.zrem(key, member)
                undo.zrange(key, 0, 0, withscores=True)
                undo_results = await undo.execute()

                oldest = undo_results[1]
                if oldest:
                    oldest_score = oldest[0][1]
                    retry_after = int(oldest_score + window_seconds - now) + 1
                    retry_after = max(retry_after, 1)
                else:
                    retry_after = window_seconds

                return RateLimitResult(
                    allowed=False,
                    remaining=0,
                    retry_after=retry_after,
                )

            # The count already includes this request.
            remaining = limit - count_with_current
            return RateLimitResult(
                allowed=True,
                remaining=max(remaining, 0),
                retry_after=0,
            )

        except Exception:
            logger.warning(
                "rate_limit_redis_error key=%s — failing open", key, exc_info=True
            )
            return RateLimitResult(allowed=True, remaining=limit, retry_after=0)
--- a/backend/routers/chat.py
+++ b/backend/routers/chat.py
@ -2,20 +2,25 @@
 Accepts a query and optional creator filter, returns a Server-Sent Events
 stream with sources, token, done, and error events.
 Rate limiting: per-user (authenticated), per-IP (anonymous), and per-creator.
 """
 from __future__ import annotations
 import logging
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
 from auth import get_optional_user
 from chat_service import ChatService
 from config import Settings, get_settings
 from database import get_session
 from models import User
 from rate_limiter import RateLimiter
 from redis_client import get_redis
 logger = logging.getLogger("chrysopedia.chat.router")
@ -32,23 +37,94 @@ class ChatRequest(BaseModel):
    personality_weight: float = Field(default=0.0, ge=0.0, le=1.0)
-@router.post("")
def _get_client_ip(request: Request) -> str:
    """Extract the client IP, preferring X-Forwarded-For behind a reverse proxy.

    The first X-Forwarded-For entry is used only when it parses as an
    IPv4/IPv6 address, and it is returned in normalised form so that
    equivalent spellings share one rate limit bucket. Empty or non-IP
    values fall back to the socket peer address, or ``"unknown"`` when
    the request has no client information.

    NOTE(review): X-Forwarded-For is client-controlled unless the reverse
    proxy overwrites it — confirm the deployment strips or replaces the
    header, otherwise callers can pick their own rate limit bucket.
    """
    import ipaddress  # function-scope: keeps this helper self-contained

    peer = request.client.host if request.client else "unknown"

    forwarded = request.headers.get("x-forwarded-for")
    if not forwarded:
        return peer

    candidate = forwarded.split(",")[0].strip()
    try:
        # Validation keeps arbitrary header text out of Redis keys and logs.
        return str(ipaddress.ip_address(candidate))
    except ValueError:
        return peer
@router.post("", response_model=None)
 async def chat(
    body: ChatRequest,
    request: Request,
    db: AsyncSession = Depends(get_session),
    settings: Settings = Depends(get_settings),
-) -> StreamingResponse:
+    user: User | None = Depends(get_optional_user),
 ):
    """Stream a chat response as Server-Sent Events.
    Rate limits are checked before processing:
    - Authenticated users: ``rate_limit_user_per_hour`` requests/hour
    - Anonymous (IP-based): ``rate_limit_ip_per_hour`` requests/hour
    - Per-creator (if creator filter set): ``rate_limit_creator_per_hour`` requests/hour
    SSE protocol:
    - ``event: sources`` — citation metadata array (sent first)
    - ``event: token``   — streamed text chunk (repeated)
    - ``event: done``    — completion metadata with cascade_tier, conversation_id
    - ``event: error``   — error message (on failure)
    """
-    logger.info("chat_request query=%r creator=%r cid=%r weight=%.2f", body.query, body.creator, body.conversation_id, body.personality_weight)
+    client_ip = _get_client_ip(request)
    user_id = user.id if user else None
    logger.info(
        "chat_request query=%r creator=%r cid=%r weight=%.2f user=%s ip=%s",
        body.query, body.creator, body.conversation_id,
        body.personality_weight, user_id, client_ip,
    )
    redis = await get_redis()
    # ── Rate limiting ───────────────────────────────────────────────────
    limiter = RateLimiter(redis)
    # User-based limit (authenticated) or IP-based limit (anonymous)
    if user_id:
        identity_key = RateLimiter.key("user", str(user_id))
        identity_limit = settings.rate_limit_user_per_hour
    else:
        identity_key = RateLimiter.key("ip", client_ip)
        identity_limit = settings.rate_limit_ip_per_hour
    result = await limiter.check_rate_limit(identity_key, identity_limit, window_seconds=3600)
    if not result.allowed:
        scope = "user" if user_id else "ip"
        logger.warning(
            "rate_limit_exceeded scope=%s key=%s remaining=%d retry_after=%d",
            scope, identity_key, result.remaining, result.retry_after,
        )
        return JSONResponse(
            status_code=429,
            content={
                "error": "Rate limit exceeded",
                "retry_after": result.retry_after,
            },
            headers={"Retry-After": str(result.retry_after)},
        )
    # Per-creator limit (if creator filter is provided)
    if body.creator:
        creator_key = RateLimiter.key("creator", body.creator)
        creator_result = await limiter.check_rate_limit(
            creator_key, settings.rate_limit_creator_per_hour, window_seconds=3600,
        )
        if not creator_result.allowed:
            logger.warning(
                "rate_limit_exceeded scope=creator key=%s retry_after=%d",
                creator_key, creator_result.retry_after,
            )
            return JSONResponse(
                status_code=429,
                content={
                    "error": "Creator rate limit exceeded",
                    "retry_after": creator_result.retry_after,
                },
                headers={"Retry-After": str(creator_result.retry_after)},
            )
    # ── Stream response ─────────────────────────────────────────────────
    service = ChatService(settings, redis=redis)
    return StreamingResponse(
@ -58,6 +134,8 @@ async def chat(
            creator=body.creator,
            conversation_id=body.conversation_id,
            personality_weight=body.personality_weight,
            user_id=user_id,
            client_ip=client_ip,
        ),
        media_type="text/event-stream",
        headers={