From b86366116ae6d1e2d29c621b7b94d8b71f26557b Mon Sep 17 00:00:00 2001
From: xpltd
Date: Sat, 21 Mar 2026 17:59:24 -0500
Subject: [PATCH] Fix SSE busy-loop (ping=0), keep curl in image, recover
 zombie jobs on startup

Three bugs causing 100% CPU and container crash-looping in production:

1. sse-starlette ping=0 causes await anyio.sleep(0) busy loop in _ping
   task. Each SSE connection spins a ping task at 100% CPU. Changed to
   ping=15 (built-in keepalive). Removed our manual ping yield in favor
   of continue.

2. Dockerfile purged curl after installing deno, but Docker healthcheck
   (and compose override) uses curl. Healthcheck always failed ->
   autoheal restarted the container every ~2 minutes. Keep curl in the
   image.

3. Downloads that fail during server shutdown leave zombie jobs stuck in
   queued/downloading status (event loop closes before error handler can
   update DB). Added startup recovery that marks these as failed.
---
 Dockerfile                 |  7 ++++---
 backend/app/main.py        | 14 ++++++++++++++
 backend/app/routers/sse.py |  6 ++++--
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9c85935..74d09c3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,10 +27,11 @@ FROM python:3.12-slim AS runtime
 
 # Install ffmpeg (required by yt-dlp for muxing/transcoding)
 # Install deno (required by yt-dlp for YouTube JS interpretation)
+# Keep curl for Docker healthcheck probes
 RUN apt-get update && \
     apt-get install -y --no-install-recommends ffmpeg curl unzip && \
     curl -fsSL https://deno.land/install.sh | DENO_INSTALL=/usr/local sh && \
-    apt-get purge -y curl unzip && \
+    apt-get purge -y unzip && \
     apt-get autoremove -y && \
     rm -rf /var/lib/apt/lists/*
 
@@ -62,6 +63,6 @@ ENV MEDIARIP__DOWNLOADS__OUTPUT_DIR=/downloads \
 EXPOSE 8000
 
 HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
-    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')" || exit 1
+    CMD curl -f http://localhost:8000/api/health || exit 1
 
-CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["python", "start.py"]
diff --git a/backend/app/main.py b/backend/app/main.py
index 679d0ff..57091c6 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -78,6 +78,20 @@ async def lifespan(app: FastAPI):
     # --- Download service ---
     download_service = DownloadService(config, db, broker, loop)
 
+    # --- Recover zombie jobs from unclean shutdown ---
+    # Jobs stuck in queued/downloading status from a previous crash will never
+    # complete — mark them as failed so they don't confuse the UI.
+    try:
+        recovered = await db.execute(
+            "UPDATE jobs SET status = 'failed', error_message = 'Interrupted by server restart' "
+            "WHERE status IN ('queued', 'downloading')"
+        )
+        await db.commit()
+        if recovered.rowcount > 0:
+            logger.warning("Recovered %d zombie job(s) from previous shutdown", recovered.rowcount)
+    except Exception as e:
+        logger.error("Failed to recover zombie jobs: %s", e)
+
     # --- Purge scheduler ---
     scheduler = None
     if config.purge.enabled:
diff --git a/backend/app/routers/sse.py b/backend/app/routers/sse.py
index b98bf03..c1b0235 100644
--- a/backend/app/routers/sse.py
+++ b/backend/app/routers/sse.py
@@ -70,7 +70,9 @@ async def event_generator(
                 "data": json.dumps(event.model_dump()),
             }
         except asyncio.TimeoutError:
-            yield {"event": "ping", "data": ""}
+            # No event in KEEPALIVE_TIMEOUT — loop back and wait again.
+            # sse-starlette's built-in ping handles the actual keepalive.
+            continue
     finally:
         broker.unsubscribe(session_id, queue)
         logger.info("SSE disconnected for session %s", session_id)
@@ -87,5 +89,5 @@ async def sse_events(
 
     return EventSourceResponse(
         event_generator(session_id, broker, db),
-        ping=0,  # we handle keepalive ourselves
+        ping=15,  # sse-starlette sends keepalive pings (0 = busy-loop bug)
     )