From b86366116ae6d1e2d29c621b7b94d8b71f26557b Mon Sep 17 00:00:00 2001
From: xpltd
Date: Sat, 21 Mar 2026 17:59:24 -0500
Subject: [PATCH] Fix SSE busy-loop (ping=0), keep curl in image, recover
 zombie jobs on startup

Three bugs causing 100% CPU and container crash-looping in production:

1. sse-starlette ping=0 causes await anyio.sleep(0) busy loop in _ping
   task. Each SSE connection spins a ping task at 100% CPU. Changed to
   ping=15 (built-in keepalive). Removed our manual ping yield in favor
   of continue.

2. Dockerfile purged curl after installing deno, but Docker healthcheck
   (and compose override) uses curl. Healthcheck always failed ->
   autoheal restarted the container every ~2 minutes. Keep curl in the
   image.

3. Downloads that fail during server shutdown leave zombie jobs stuck in
   queued/downloading status (event loop closes before error handler can
   update DB). Added startup recovery that marks these as failed.
---
 Dockerfile                 |  7 ++++---
 backend/app/main.py        | 14 ++++++++++++++
 backend/app/routers/sse.py |  6 ++++--
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9c85935..74d09c3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,10 +27,11 @@ FROM python:3.12-slim AS runtime
 
 # Install ffmpeg (required by yt-dlp for muxing/transcoding)
 # Install deno (required by yt-dlp for YouTube JS interpretation)
+# Keep curl for Docker healthcheck probes
 RUN apt-get update && \
     apt-get install -y --no-install-recommends ffmpeg curl unzip && \
     curl -fsSL https://deno.land/install.sh | DENO_INSTALL=/usr/local sh && \
-    apt-get purge -y curl unzip && \
+    apt-get purge -y unzip && \
     apt-get autoremove -y && \
     rm -rf /var/lib/apt/lists/*
 
@@ -62,6 +63,6 @@ ENV MEDIARIP__DOWNLOADS__OUTPUT_DIR=/downloads \
 EXPOSE 8000
 
 HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
-    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')" || exit 1
+    CMD curl -f http://localhost:8000/api/health || exit 1
 
-CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["python", "start.py"]
diff --git a/backend/app/main.py b/backend/app/main.py
index 679d0ff..57091c6 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -78,6 +78,20 @@ async def lifespan(app: FastAPI):
     # --- Download service ---
     download_service = DownloadService(config, db, broker, loop)
 
+    # --- Recover zombie jobs from unclean shutdown ---
+    # Jobs stuck in queued/downloading status from a previous crash will never
+    # complete — mark them as failed so they don't confuse the UI.
+    try:
+        recovered = await db.execute(
+            "UPDATE jobs SET status = 'failed', error_message = 'Interrupted by server restart' "
+            "WHERE status IN ('queued', 'downloading')"
+        )
+        await db.commit()
+        if recovered.rowcount > 0:
+            logger.warning("Recovered %d zombie job(s) from previous shutdown", recovered.rowcount)
+    except Exception as e:
+        logger.error("Failed to recover zombie jobs: %s", e)
+
     # --- Purge scheduler ---
     scheduler = None
     if config.purge.enabled:
diff --git a/backend/app/routers/sse.py b/backend/app/routers/sse.py
index b98bf03..c1b0235 100644
--- a/backend/app/routers/sse.py
+++ b/backend/app/routers/sse.py
@@ -70,7 +70,9 @@ async def event_generator(
                 "data": json.dumps(event.model_dump()),
             }
         except asyncio.TimeoutError:
-            yield {"event": "ping", "data": ""}
+            # No event in KEEPALIVE_TIMEOUT — loop back and wait again.
+            # sse-starlette's built-in ping handles the actual keepalive.
+            continue
     finally:
         broker.unsubscribe(session_id, queue)
         logger.info("SSE disconnected for session %s", session_id)
@@ -87,5 +89,5 @@ async def sse_events(
 
     return EventSourceResponse(
         event_generator(session_id, broker, db),
-        ping=0,  # we handle keepalive ourselves
+        ping=15,  # sse-starlette sends keepalive pings (0 = busy-loop bug)
     )