mirror of
https://github.com/xpltdco/media-rip.git
synced 2026-04-03 02:53:58 -06:00
Fix SSE busy-loop (ping=0), keep curl in image, recover zombie jobs on startup
Three bugs were causing 100% CPU usage and container crash-looping in production:
1. sse-starlette with ping=0 runs `await anyio.sleep(0)` in a busy loop inside its _ping task, so each SSE connection spins a ping task at 100% CPU. Changed to ping=15 (the built-in keepalive) and removed our manual ping yield in favor of `continue`.
2. The Dockerfile purged curl after installing deno, but the Docker healthcheck (and the compose override) uses curl. The healthcheck therefore always failed, and autoheal restarted the container every ~2 minutes. curl is now kept in the image.
3. Downloads that fail during server shutdown leave zombie jobs stuck in queued/downloading status, because the event loop closes before the error handler can update the DB. Added startup recovery that marks these jobs as failed.
This commit is contained in:
parent
182104e57f
commit
b86366116a
3 changed files with 22 additions and 5 deletions
|
|
@ -27,10 +27,11 @@ FROM python:3.12-slim AS runtime
|
||||||
|
|
||||||
# Install ffmpeg (required by yt-dlp for muxing/transcoding)
|
# Install ffmpeg (required by yt-dlp for muxing/transcoding)
|
||||||
# Install deno (required by yt-dlp for YouTube JS interpretation)
|
# Install deno (required by yt-dlp for YouTube JS interpretation)
|
||||||
|
# Keep curl for Docker healthcheck probes
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y --no-install-recommends ffmpeg curl unzip && \
|
apt-get install -y --no-install-recommends ffmpeg curl unzip && \
|
||||||
curl -fsSL https://deno.land/install.sh | DENO_INSTALL=/usr/local sh && \
|
curl -fsSL https://deno.land/install.sh | DENO_INSTALL=/usr/local sh && \
|
||||||
apt-get purge -y curl unzip && \
|
apt-get purge -y unzip && \
|
||||||
apt-get autoremove -y && \
|
apt-get autoremove -y && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
@ -62,6 +63,6 @@ ENV MEDIARIP__DOWNLOADS__OUTPUT_DIR=/downloads \
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
||||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')" || exit 1
|
CMD curl -f http://localhost:8000/api/health || exit 1
|
||||||
|
|
||||||
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
CMD ["python", "start.py"]
|
||||||
|
|
|
||||||
|
|
@ -78,6 +78,20 @@ async def lifespan(app: FastAPI):
|
||||||
# --- Download service ---
|
# --- Download service ---
|
||||||
download_service = DownloadService(config, db, broker, loop)
|
download_service = DownloadService(config, db, broker, loop)
|
||||||
|
|
||||||
|
# --- Recover zombie jobs from unclean shutdown ---
|
||||||
|
# Jobs stuck in queued/downloading status from a previous crash will never
|
||||||
|
# complete — mark them as failed so they don't confuse the UI.
|
||||||
|
try:
|
||||||
|
recovered = await db.execute(
|
||||||
|
"UPDATE jobs SET status = 'failed', error_message = 'Interrupted by server restart' "
|
||||||
|
"WHERE status IN ('queued', 'downloading')"
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
if recovered.rowcount > 0:
|
||||||
|
logger.warning("Recovered %d zombie job(s) from previous shutdown", recovered.rowcount)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to recover zombie jobs: %s", e)
|
||||||
|
|
||||||
# --- Purge scheduler ---
|
# --- Purge scheduler ---
|
||||||
scheduler = None
|
scheduler = None
|
||||||
if config.purge.enabled:
|
if config.purge.enabled:
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,9 @@ async def event_generator(
|
||||||
"data": json.dumps(event.model_dump()),
|
"data": json.dumps(event.model_dump()),
|
||||||
}
|
}
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
yield {"event": "ping", "data": ""}
|
# No event in KEEPALIVE_TIMEOUT — loop back and wait again.
|
||||||
|
# sse-starlette's built-in ping handles the actual keepalive.
|
||||||
|
continue
|
||||||
finally:
|
finally:
|
||||||
broker.unsubscribe(session_id, queue)
|
broker.unsubscribe(session_id, queue)
|
||||||
logger.info("SSE disconnected for session %s", session_id)
|
logger.info("SSE disconnected for session %s", session_id)
|
||||||
|
|
@ -87,5 +89,5 @@ async def sse_events(
|
||||||
|
|
||||||
return EventSourceResponse(
|
return EventSourceResponse(
|
||||||
event_generator(session_id, broker, db),
|
event_generator(session_id, broker, db),
|
||||||
ping=0, # we handle keepalive ourselves
|
ping=15, # sse-starlette sends keepalive pings (0 = busy-loop bug)
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue