media-rip/backend/app/services/download.py
xpltd 1da3ef37f1 M002/S04: fix filename resolution for downloads
- Use extract_info + prepare_filename to determine output filename
  before downloading (yt-dlp skips progress hooks when file exists)
- Normalize filenames to relative paths (strip output dir prefix)
- Include filename in completion SSE event so frontend displays it
- Fixes file download 404s from subdirectory source templates
2026-03-18 23:44:29 -05:00

368 lines
13 KiB
Python

"""Download service — yt-dlp wrapper with sync-to-async progress bridging.
Wraps synchronous yt-dlp operations in a :class:`~concurrent.futures.ThreadPoolExecutor`
and bridges progress events to the async world via :class:`~app.core.sse_broker.SSEBroker`.
Each download job gets a **fresh** ``YoutubeDL`` instance — they are never shared across
threads (yt-dlp has mutable internal state: cookies, temp files, logger).
"""
from __future__ import annotations
import asyncio
import logging
import os
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
import yt_dlp
from app.core.config import AppConfig
from app.core.database import (
create_job,
get_job,
update_job_progress,
update_job_status,
)
from app.core.sse_broker import SSEBroker
from app.models.job import (
FormatInfo,
Job,
JobCreate,
JobStatus,
ProgressEvent,
)
from app.services.output_template import resolve_template
logger = logging.getLogger("mediarip.download")
class DownloadService:
    """Manages yt-dlp downloads with async-compatible progress reporting.

    Synchronous yt-dlp calls run on a dedicated ``ThreadPoolExecutor``;
    worker threads push progress back to the event loop via
    ``asyncio.run_coroutine_threadsafe`` (for DB writes) and the SSE broker
    (for real-time client updates).

    Parameters
    ----------
    config:
        Application configuration (download paths, concurrency, templates).
    db:
        Async SQLite connection (aiosqlite).
    broker:
        SSE event broker for real-time progress push.
    loop:
        The asyncio event loop. Captured once at construction so worker
        threads can schedule coroutines onto it — must not be called from
        inside a worker thread.
    """

    def __init__(
        self,
        config: AppConfig,
        db,  # aiosqlite.Connection
        broker: SSEBroker,
        loop: asyncio.AbstractEventLoop,
    ) -> None:
        self._config = config
        self._db = db
        self._broker = broker
        self._loop = loop
        # Pool size bounds concurrent downloads; threads are named "ytdl-*"
        # for easier debugging in thread dumps.
        self._executor = ThreadPoolExecutor(
            max_workers=config.downloads.max_concurrent,
            thread_name_prefix="ytdl",
        )
        # Per-job throttle state for DB writes (only used inside worker threads)
        self._last_db_percent: dict[str, float] = {}

    # ------------------------------------------------------------------
    # Public async interface
    # ------------------------------------------------------------------
    async def enqueue(self, job_create: JobCreate, session_id: str) -> Job:
        """Create a job and submit it for background download.

        Returns the ``Job`` immediately with status ``queued``; the actual
        download runs later on the thread pool.
        """
        job_id = str(uuid.uuid4())
        # Resolve the per-source output template (may include subdirectories).
        template = resolve_template(
            job_create.url,
            job_create.output_template,
            self._config,
        )
        now = datetime.now(timezone.utc).isoformat()
        job = Job(
            id=job_id,
            session_id=session_id,
            url=job_create.url,
            status=JobStatus.queued,
            format_id=job_create.format_id,
            quality=job_create.quality,
            output_template=template,
            created_at=now,
        )
        await create_job(self._db, job)
        logger.info("Job %s created for URL: %s", job_id, job_create.url)
        # Build yt-dlp options
        output_dir = self._config.downloads.output_dir
        os.makedirs(output_dir, exist_ok=True)
        outtmpl = os.path.join(output_dir, template)
        opts: dict = {
            "outtmpl": outtmpl,
            "quiet": True,
            "no_warnings": True,
            "noprogress": True,
        }
        # An explicit format_id takes precedence over a quality preset.
        if job_create.format_id:
            opts["format"] = job_create.format_id
        elif job_create.quality:
            opts["format"] = job_create.quality
        # Fire-and-forget: the returned future is intentionally not awaited —
        # _run_download reports its own success/failure via DB + SSE.
        self._loop.run_in_executor(
            self._executor,
            self._run_download,
            job_id,
            job_create.url,
            opts,
            session_id,
        )
        return job

    async def get_formats(self, url: str) -> list[FormatInfo]:
        """Extract available formats for *url* without downloading.

        Runs yt-dlp ``extract_info`` in the thread pool. Returns an empty
        list if extraction fails (``_extract_info`` returns ``None``).
        """
        info = await self._loop.run_in_executor(
            self._executor,
            self._extract_info,
            url,
        )
        if not info:
            return []
        formats_raw = info.get("formats") or []
        result: list[FormatInfo] = []
        for f in formats_raw:
            result.append(
                FormatInfo(
                    format_id=f.get("format_id", "unknown"),
                    ext=f.get("ext", "unknown"),
                    resolution=f.get("resolution"),
                    codec=f.get("vcodec"),
                    filesize=f.get("filesize"),  # may be None — that's fine
                    format_note=f.get("format_note"),
                    vcodec=f.get("vcodec"),
                    acodec=f.get("acodec"),
                )
            )
        # Sort: best resolution first (descending by height, fallback 0)
        result.sort(
            key=lambda fi: _parse_resolution_height(fi.resolution),
            reverse=True,
        )
        return result

    async def cancel(self, job_id: str) -> None:
        """Mark a job as failed with a cancellation message.

        Note: yt-dlp has no reliable mid-stream abort mechanism. The
        worker thread continues but the job is marked as failed in the DB.
        """
        await update_job_status(
            self._db, job_id, JobStatus.failed.value, "Cancelled by user"
        )
        logger.info("Job %s cancelled by user", job_id)

    def shutdown(self) -> None:
        """Shut down the thread pool (non-blocking).

        ``wait=False`` means in-flight downloads are not joined; their
        threads finish (or die with the process) on their own.
        """
        self._executor.shutdown(wait=False)
        logger.info("Download executor shut down")

    # ------------------------------------------------------------------
    # Private — runs in worker threads
    # ------------------------------------------------------------------
    def _run_download(
        self,
        job_id: str,
        url: str,
        opts: dict,
        session_id: str,
    ) -> None:
        """Execute yt-dlp download in a worker thread.

        Creates a fresh ``YoutubeDL`` instance (never shared) and bridges
        progress events to the async event loop. All DB writes go through
        ``asyncio.run_coroutine_threadsafe`` and block this thread until
        the coroutine completes, keeping writes ordered.
        """
        logger.info("Job %s starting download: %s", job_id, url)
        # Sentinel so the first progress event always triggers a DB write.
        self._last_db_percent[job_id] = -1.0

        def progress_hook(d: dict) -> None:
            # Called synchronously by yt-dlp on this worker thread.
            try:
                event = ProgressEvent.from_yt_dlp(job_id, d)
                # Normalize filename to be relative to the output directory
                # so the frontend can construct download URLs correctly.
                if event.filename:
                    abs_path = Path(event.filename).resolve()
                    out_dir = Path(self._config.downloads.output_dir).resolve()
                    try:
                        event.filename = str(abs_path.relative_to(out_dir))
                    except ValueError:
                        # Not under output_dir — use basename as fallback
                        event.filename = abs_path.name
                # Always publish to SSE broker (cheap, in-memory)
                self._broker.publish(session_id, event)
                # Throttle DB writes: ≥1% change or status change
                last_pct = self._last_db_percent.get(job_id, -1.0)
                status_changed = d.get("status") in ("finished", "error")
                pct_changed = abs(event.percent - last_pct) >= 1.0
                if pct_changed or status_changed:
                    self._last_db_percent[job_id] = event.percent
                    logger.debug(
                        "Job %s DB write: percent=%.1f status=%s filename=%s",
                        job_id, event.percent, event.status, event.filename,
                    )
                    future = asyncio.run_coroutine_threadsafe(
                        update_job_progress(
                            self._db,
                            job_id,
                            event.percent,
                            event.speed,
                            event.eta,
                            event.filename,
                        ),
                        self._loop,
                    )
                    # Block worker thread until DB write completes
                    future.result(timeout=10)
            except Exception:
                # Never let a hook error propagate into yt-dlp's internals.
                logger.exception("Job %s progress hook error (status=%s)", job_id, d.get("status"))

        opts["progress_hooks"] = [progress_hook]
        try:
            # Mark as downloading and notify SSE
            asyncio.run_coroutine_threadsafe(
                update_job_status(self._db, job_id, JobStatus.downloading.value),
                self._loop,
            ).result(timeout=10)
            self._broker.publish(session_id, {
                "event": "job_update",
                "data": {"job_id": job_id, "status": "downloading", "percent": 0,
                         "speed": None, "eta": None, "filename": None},
            })
            # Fresh YoutubeDL instance — never shared
            with yt_dlp.YoutubeDL(opts) as ydl:
                # Extract info first to determine the output filename.
                # This is needed because yt-dlp may skip progress hooks
                # entirely when the file already exists.
                info = ydl.extract_info(url, download=False)
                if info:
                    raw_fn = ydl.prepare_filename(info)
                    abs_path = Path(raw_fn).resolve()
                    out_dir = Path(self._config.downloads.output_dir).resolve()
                    try:
                        relative_fn = str(abs_path.relative_to(out_dir))
                    except ValueError:
                        # Outside output_dir — fall back to the basename.
                        relative_fn = abs_path.name
                else:
                    relative_fn = None
                ydl.download([url])
            # Persist filename to DB (progress hooks may not have fired
            # if the file already existed)
            if relative_fn:
                asyncio.run_coroutine_threadsafe(
                    update_job_progress(
                        self._db, job_id, 100.0,
                        None, None, relative_fn,
                    ),
                    self._loop,
                ).result(timeout=10)
            # Mark as completed and notify SSE
            asyncio.run_coroutine_threadsafe(
                update_job_status(self._db, job_id, JobStatus.completed.value),
                self._loop,
            ).result(timeout=10)
            self._broker.publish(session_id, {
                "event": "job_update",
                "data": {"job_id": job_id, "status": "completed", "percent": 100,
                         "speed": None, "eta": None, "filename": relative_fn},
            })
            logger.info("Job %s completed", job_id)
        except Exception as e:
            logger.error("Job %s failed: %s", job_id, e, exc_info=True)
            try:
                asyncio.run_coroutine_threadsafe(
                    update_job_status(
                        self._db, job_id, JobStatus.failed.value, str(e)
                    ),
                    self._loop,
                ).result(timeout=10)
                self._broker.publish(session_id, {
                    "event": "job_update",
                    "data": {"job_id": job_id, "status": "failed", "percent": 0,
                             "speed": None, "eta": None, "filename": None,
                             "error_message": str(e)},
                })
            except Exception:
                # Loop may already be closing — log and give up.
                logger.exception("Job %s failed to update status after error", job_id)
        finally:
            # Drop throttle state so the dict does not grow unbounded.
            self._last_db_percent.pop(job_id, None)

    def _extract_info(self, url: str) -> dict | None:
        """Run yt-dlp extract_info synchronously (called from thread pool).

        Returns the raw info dict, or ``None`` on any extraction failure
        (the error is logged, not raised).
        """
        opts = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
        }
        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                return ydl.extract_info(url, download=False)
        except Exception:
            logger.exception("Format extraction failed for %s", url)
            return None
def _extract_info(self, url: str) -> dict | None:
"""Run yt-dlp extract_info synchronously (called from thread pool)."""
opts = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
}
try:
with yt_dlp.YoutubeDL(opts) as ydl:
return ydl.extract_info(url, download=False)
except Exception:
logger.exception("Format extraction failed for %s", url)
return None
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _parse_resolution_height(resolution: str | None) -> int:
"""Extract numeric height from a resolution string like '1080p' or '1920x1080'.
Returns 0 for unparseable values so they sort last.
"""
if not resolution:
return 0
resolution = resolution.lower().strip()
# Handle "1080p" style
if resolution.endswith("p"):
try:
return int(resolution[:-1])
except ValueError:
pass
# Handle "1920x1080" style
if "x" in resolution:
try:
return int(resolution.split("x")[-1])
except ValueError:
pass
# Handle bare number
try:
return int(resolution)
except ValueError:
return 0