import type { PlatformSource, FetchRecentContentOptions } from './platform-source';
import type {
  Channel,
  PlatformSourceMetadata,
  PlatformContentMetadata,
  PlaylistDiscoveryResult,
} from '../types/index';
import { Platform, ContentType } from '../types/index';
import { execYtDlp, parseSingleJson, parseJsonLines } from './yt-dlp';

// ── URL Validation ──

/** URL shapes accepted by {@link isYouTubeUrl}: handle, channel, custom, legacy user, and youtu.be links. */
const YOUTUBE_URL_PATTERNS = [
  /^https?:\/\/(www\.)?youtube\.com\/@[\w.-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/channel\/[\w-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/c\/[\w.-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/user\/[\w.-]+/,
  /^https?:\/\/youtu\.be\/[\w-]+/,
];

/**
 * Report whether `url` matches one of the recognised YouTube URL shapes.
 *
 * @param url - Candidate URL string.
 * @returns `true` when any pattern in `YOUTUBE_URL_PATTERNS` matches.
 */
export function isYouTubeUrl(url: string): boolean {
  return YOUTUBE_URL_PATTERNS.some((p) => p.test(url));
}

// ── Implementation ──

/** {@link PlatformSource} implementation that shells out to yt-dlp for YouTube data. */
export class YouTubeSource implements PlatformSource {
  /**
   * Resolve channel-level metadata for a YouTube URL.
   *
   * Runs yt-dlp with `--dump-single-json --playlist-items 0 --flat-playlist`
   * (30s timeout) and maps the resulting JSON onto the source metadata shape.
   *
   * @param url - Channel URL to resolve; also used as the fallback `url` value.
   * @returns Name, platform id, avatar, canonical URL, banner, description and
   *   subscriber count. Missing fields fall back to `'Unknown Channel'`, `''`
   *   or `null` as appropriate.
   * @throws Whatever `execYtDlp` / `parseSingleJson` throw.
   */
  async resolveChannel(url: string): Promise<PlatformSourceMetadata> {
    const result = await execYtDlp(
      ['--dump-single-json', '--playlist-items', '0', '--flat-playlist', url],
      { timeout: 30_000 }
    );
    const data = parseSingleJson(result.stdout) as Record<string, unknown>;

    const channelName =
      (data.channel as string) ?? (data.uploader as string) ?? 'Unknown Channel';
    const channelId = (data.channel_id as string) ?? '';
    const channelUrl =
      (data.channel_url as string) ?? (data.uploader_url as string) ?? url;

    // Pick the best thumbnail — yt-dlp returns an array sorted by quality
    // (assumed worst-to-best, so the last entry is taken).
    const thumbnails = data.thumbnails as
      | Array<{ url?: string; width?: number }>
      | undefined;
    const imageUrl = thumbnails?.length
      ? (thumbnails[thumbnails.length - 1]?.url ?? null)
      : null;

    // Extract enrichment metadata
    const description =
      typeof data.description === 'string' ? data.description : null;
    const subscriberCount =
      typeof data.channel_follower_count === 'number'
        ? data.channel_follower_count
        : null;

    // Banner: try channel_banner_url first, then look for wide thumbnails (>=1024px)
    let bannerUrl: string | null = null;
    if (typeof data.channel_banner_url === 'string') {
      bannerUrl = data.channel_banner_url;
    } else if (thumbnails?.length) {
      const wideThumbnail = thumbnails.find((t) => (t.width ?? 0) >= 1024);
      if (wideThumbnail?.url) bannerUrl = wideThumbnail.url;
    }

    return {
      name: channelName,
      platformId: channelId,
      imageUrl,
      url: channelUrl,
      platform: Platform.YouTube,
      bannerUrl,
      description,
      subscriberCount,
    };
  }

  /**
   * Hybrid two-phase fetch for YouTube content.
   *
   * Phase 1 (Discovery): `--flat-playlist` for fast enumeration — returns IDs
   * and titles without per-video metadata fetches. Extremely fast even for
   * large channels.
   *
   * Phase 2 (Enrichment): For items NOT in `existingIds`, fetch full metadata
   * per-video to get `upload_date`. Respects `rateLimitDelay` between calls.
   * Individual enrichment failures are tolerated — the item is returned with
   * `publishedAt: null` rather than aborting the scan.
   *
   * @param channel - Channel whose `url` is enumerated.
   * @param options - Optional tuning: `limit` (default 50), `existingIds`
   *   (default empty), `rateLimitDelay` in ms (default 1000), `discoveryOnly`
   *   (default false) and an abort `signal` checked between enrichment calls.
   * @returns The discovered items in discovery order, with enriched versions
   *   substituted wherever Phase 2 succeeded.
   * @throws If the Phase 1 yt-dlp call or its parsing fails.
   */
  async fetchRecentContent(
    channel: Channel,
    options?: FetchRecentContentOptions
  ): Promise<PlatformContentMetadata[]> {
    const limit = options?.limit ?? 50;
    const existingIds = options?.existingIds ?? new Set<string>();
    const rateLimitDelay = options?.rateLimitDelay ?? 1000;
    const discoveryOnly = options?.discoveryOnly ?? false;
    const signal = options?.signal;

    // ── Phase 1: Fast discovery via --flat-playlist ──
    // Timeout scales with limit: 60s base + 30s per 500 items
    const discoveryTimeout = 60_000 + Math.ceil(limit / 500) * 30_000;
    const flatResult = await execYtDlp(
      [
        '--flat-playlist',
        '--dump-json',
        '--playlist-items',
        `1:${limit}`,
        channel.url,
      ],
      { timeout: discoveryTimeout }
    );
    const flatEntries = parseJsonLines(flatResult.stdout);
    const discoveredItems = flatEntries.map((entry) => mapEntry(entry));

    // If discovery-only, skip Phase 2 entirely — caller gets fast results
    if (discoveryOnly) {
      return discoveredItems;
    }

    // ── Phase 2: Enrich new items with upload_date ──
    const newItems = discoveredItems.filter(
      (item) => !existingIds.has(item.platformContentId)
    );
    if (newItems.length === 0) {
      // All items already known — return flat results as-is
      return discoveredItems;
    }

    console.log(
      `[youtube] Phase 2: enriching ${newItems.length} new items (${discoveredItems.length - newItems.length} already known)`
    );

    // Build a map of enriched data keyed by platformContentId
    const enrichedMap = new Map<string, PlatformContentMetadata>();
    for (let i = 0; i < newItems.length; i++) {
      // Check cancellation between enrichment calls
      if (signal?.aborted) {
        console.log(`[youtube] Phase 2 aborted after ${i} enrichments`);
        break;
      }

      const item = newItems[i];

      // Rate limit delay between enrichment calls (skip before first)
      if (i > 0 && rateLimitDelay > 0) {
        await sleep(rateLimitDelay);
      }

      try {
        const videoUrl = `https://www.youtube.com/watch?v=${item.platformContentId}`;
        const enrichResult = await execYtDlp(
          ['--dump-json', '--no-playlist', videoUrl],
          { timeout: 15_000 }
        );
        const enrichedEntry = parseSingleJson(enrichResult.stdout);
        const enrichedItem = mapEntry(enrichedEntry);
        enrichedMap.set(item.platformContentId, enrichedItem);
        console.log(
          `[youtube] Enriched ${i + 1}/${newItems.length}: ${item.platformContentId}`
        );
      } catch (err) {
        // Tolerate individual failures — item keeps flat-playlist data (publishedAt: null)
        console.warn(
          `[youtube] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : String(err)}`
        );
      }
    }

    // Merge enriched data back into the full list
    return discoveredItems.map(
      (item) => enrichedMap.get(item.platformContentId) ?? item
    );
  }

  /**
   * Fetch ALL content for a channel — no playlist-items limit.
   * Used by back-catalog import. Extended timeout (300s) to handle
   * large channels with thousands of videos.
   *
   * @param channel - Channel whose `url` is enumerated with `--flat-playlist`.
   * @returns One mapped item per JSON line emitted by yt-dlp.
   * @throws Whatever `execYtDlp` / `parseJsonLines` throw.
   */
  async fetchAllContent(channel: Channel): Promise<PlatformContentMetadata[]> {
    const result = await execYtDlp(
      ['--flat-playlist', '--dump-json', channel.url],
      { timeout: 300_000 }
    );
    const entries = parseJsonLines(result.stdout);
    return entries.map((entry) => mapEntry(entry));
  }

  /**
   * Fetch playlists for a YouTube channel, with video-to-playlist mappings.
   * Step 1: Enumerate playlists from the channel's /playlists tab.
   * Step 2: For each playlist, fetch the video IDs it contains.
   *
   * Playlists are fetched sequentially (60s timeout each). A failure while
   * fetching any single playlist propagates and aborts the whole call.
   *
   * @param channel - Channel whose `/playlists` tab is enumerated.
   * @returns One result per playlist entry that carries an id.
   * @throws Whatever `execYtDlp` / `parseJsonLines` throw.
   */
  async fetchPlaylists(channel: Channel): Promise<PlaylistDiscoveryResult[]> {
    // Strip trailing slashes so a URL like ".../@name/" does not become
    // ".../@name//playlists".
    const channelBase = channel.url.replace(/\/+$/, '');

    // Get playlist list from /playlists tab
    const listResult = await execYtDlp(
      ['--flat-playlist', '--dump-json', `${channelBase}/playlists`],
      { timeout: 60_000 }
    );
    const playlistEntries = parseJsonLines(listResult.stdout);

    // For each playlist, fetch its video IDs
    const results: PlaylistDiscoveryResult[] = [];
    for (const entry of playlistEntries) {
      const e = entry as Record<string, unknown>;
      const playlistId = e.id as string;
      const title = (e.title as string) ?? 'Untitled Playlist';
      if (!playlistId) continue;

      const playlistUrl = `https://www.youtube.com/playlist?list=${playlistId}`;
      const videoResult = await execYtDlp(
        ['--flat-playlist', '--dump-json', playlistUrl],
        { timeout: 60_000 }
      );
      const videoEntries = parseJsonLines(videoResult.stdout);

      // Keep only non-empty string ids; anything else cannot be mapped to a video.
      const videoIds = videoEntries
        .map((v) => (v as Record<string, unknown>).id)
        .filter((id): id is string => typeof id === 'string' && id !== '');

      results.push({ platformPlaylistId: playlistId, title, videoIds });
    }
    return results;
  }
}

// ── Helpers ──

/**
 * Parse yt-dlp's upload_date (YYYYMMDD) into an ISO 8601 datetime string.
 * Returns null for missing or malformed values.
 *
 * "Malformed" covers non-string input, anything that is not exactly eight
 * ASCII digits, and digit strings that do not name a real calendar date
 * (e.g. month 13 or February 30).
 *
 * @param raw - The `upload_date` field as read from a yt-dlp entry.
 * @returns `YYYY-MM-DDT00:00:00Z`, or `null` when `raw` is unusable.
 */
function parseUploadDate(raw: string | undefined): string | null {
  // Callers pass a cast of untyped JSON, so guard the runtime type as well.
  if (typeof raw !== 'string' || !/^\d{8}$/.test(raw)) return null;

  const y = raw.slice(0, 4);
  const m = raw.slice(4, 6);
  const d = raw.slice(6, 8);

  // Round-trip through Date to reject impossible dates: Date.UTC silently
  // normalises overflow (month 13 → next year), so compare the parts back.
  const year = Number(y);
  const monthIndex = Number(m) - 1;
  const day = Number(d);
  const probe = new Date(Date.UTC(year, monthIndex, day));
  if (
    probe.getUTCFullYear() !== year ||
    probe.getUTCMonth() !== monthIndex ||
    probe.getUTCDate() !== day
  ) {
    return null;
  }

  return `${y}-${m}-${d}T00:00:00Z`;
}

/**
 * Map one raw yt-dlp JSON entry (flat-playlist or full) onto
 * `PlatformContentMetadata`.
 *
 * @param entry - Parsed JSON object from yt-dlp; treated as an untyped record.
 * @returns Metadata with defaults for missing fields: `''` for id and url,
 *   `'Untitled'` for title, and `null` for duration, thumbnail and publish date.
 */
function mapEntry(entry: unknown): PlatformContentMetadata {
  const e = entry as Record<string, unknown>;

  // Prefer the watch-page URL. NOTE(review): on fully-extracted entries
  // yt-dlp's `url` can be the direct media URL rather than the page —
  // confirm against the yt-dlp info-dict documentation.
  const pageUrl =
    typeof e.webpage_url === 'string' && e.webpage_url !== ''
      ? e.webpage_url
      : undefined;

  return {
    platformContentId: (e.id as string) ?? '',
    title: (e.title as string) ?? 'Untitled',
    url: pageUrl ?? (e.url as string) ?? '',
    contentType: detectContentType(e),
    duration: typeof e.duration === 'number' ? e.duration : null,
    thumbnailUrl: extractThumbnailUrl(e),
    publishedAt: parseUploadDate(e.upload_date as string | undefined),
  };
}

/**
 * Classify an entry as a livestream or a regular video.
 *
 * @param entry - Raw yt-dlp entry.
 * @returns `ContentType.Livestream` when `live_status` is `'is_live'` or
 *   `'is_upcoming'`; `ContentType.Video` otherwise.
 */
function detectContentType(
  entry: Record<string, unknown>
): PlatformContentMetadata['contentType'] {
  const liveStatus = entry.live_status as string | undefined;
  if (liveStatus === 'is_live' || liveStatus === 'is_upcoming') {
    return ContentType.Livestream;
  }
  return ContentType.Video;
}

/**
 * Pick a thumbnail URL from an entry.
 *
 * @param entry - Raw yt-dlp entry.
 * @returns The `thumbnail` string when present; otherwise the `url` of the
 *   last element of `thumbnails`; otherwise `null`.
 */
function extractThumbnailUrl(entry: Record<string, unknown>): string | null {
  if (typeof entry.thumbnail === 'string') return entry.thumbnail;
  const thumbnails = entry.thumbnails as Array<{ url?: string }> | undefined;
  if (thumbnails?.length) {
    return thumbnails[thumbnails.length - 1]?.url ?? null;
  }
  return null;
}

/**
 * Resolve after `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}