From d5183043314eff37ab34dd36c8e868c9b1716d7a Mon Sep 17 00:00:00 2001
From: jlightner <jlightner@users.noreply.github.com>
Date: Wed, 1 Apr 2026 04:21:19 +0000
Subject: [PATCH] fix: detect video from URL extension when yt-dlp extract_flat
 strips codec info

archive.org and other direct-file hosts return metadata without vcodec
when using extract_flat mode. The UI was incorrectly labeling these as
'Audio Only'. Now we check the URL path extension and yt-dlp's reported
ext against known video containers as a fallback before marking a source
as audio-only.

Fixes incorrect audio-only detection for archive.org video URLs.
---
 backend/app/services/download.py | 48 ++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/backend/app/services/download.py b/backend/app/services/download.py
index 2f7c3b1..9fe9811 100644
--- a/backend/app/services/download.py
+++ b/backend/app/services/download.py
@@ -561,6 +561,31 @@ class DownloadService:
         url_lower = url.lower()
         return any(domain in url_lower for domain in audio_domains)
 
+    @staticmethod
+    def _url_or_ext_implies_video(url: str, ext: str | None) -> bool:
+        """Return True if the URL path or reported extension is a known video container.
+
+        This acts as a fallback when yt-dlp's extract_flat mode strips codec
+        metadata (common for archive.org, direct-file URLs, etc.), which would
+        otherwise cause the UI to wrongly label the source as "audio only".
+        """
+        video_extensions = {
+            "mp4", "mkv", "webm", "avi", "mov", "flv", "wmv", "mpg",
+            "mpeg", "m4v", "ts", "3gp", "ogv",
+        }
+        # Reported ext first: case-insensitive, bare name (".mp4" would not match)
+        if ext and ext.lower() in video_extensions:
+            return True
+        # Otherwise fall back to the URL path; .path excludes the query and fragment
+        from urllib.parse import urlparse
+        path = urlparse(url).path.lower()
+        # Drop trailing slashes only, so ".../clip.mp4/" still matches below
+        path = path.rstrip("/")
+        for vext in video_extensions:
+            if path.endswith(f".{vext}"):
+                return True
+        return False
+
     @staticmethod
     def _get_auth_hint(url: str) -> str | None:
         """Return a user-facing hint for sites that commonly need auth."""
@@ -645,13 +670,27 @@ class DownloadService:
                         "url": e.get("url") or e.get("webpage_url", ""),
                         "duration": e.get("duration"),
                     })
+            # Domain-based detection may miss video playlists on generic
+            # hosting sites (e.g. archive.org).  If any entry URL looks like
+            # a video file, override domain_audio for the whole playlist.
+            playlist_audio = domain_audio
+            if playlist_audio:
+                for e_check in entries:
+                    entry_url = e_check.get("url", "")
+                    if self._url_or_ext_implies_video(entry_url, None):
+                        playlist_audio = False
+                        break
+            if not playlist_audio and not domain_audio:
+                # Also check the top-level URL itself
+                if self._url_or_ext_implies_video(url, info.get("ext")):
+                    playlist_audio = False
             result = {
                 "type": "playlist",
                 "title": info.get("title", "Playlist"),
                 "count": len(entries),
                 "entries": entries,
-                "is_audio_only": domain_audio,
-                "default_ext": self._guess_ext_from_url(url, domain_audio),
+                "is_audio_only": playlist_audio,
+                "default_ext": self._guess_ext_from_url(url, playlist_audio),
             }
             if unavailable_count > 0:
                 result["unavailable_count"] = unavailable_count
@@ -659,6 +698,11 @@ class DownloadService:
         else:
             # Single video/track
             has_video = bool(info.get("vcodec") and info["vcodec"] != "none")
+            # extract_flat mode often strips codec info, so also check the
+            # URL extension and the reported ext — if either is a known video
+            # container we should NOT mark it as audio-only.
+            if not has_video:
+                has_video = self._url_or_ext_implies_video(url, info.get("ext"))
             is_audio_only = domain_audio or not has_video
             # Detect likely file extension
             ext = info.get("ext")