import type { Channel, PlatformSourceMetadata, PlatformContentMetadata, ContentType } from '../types/index';
import type { PlatformSource, FetchRecentContentOptions } from './platform-source';
import { execYtDlp, parseJsonLines, parseSingleJson } from './yt-dlp';

/** Loosely-typed JSON object as printed by yt-dlp; every field must be narrowed before use. */
type YtDlpEntry = Record<string, unknown>;

/** File extensions that mark a URL as pointing directly at an audio file. */
const AUDIO_EXTENSION_PATTERN = /\.(mp3|flac|wav|ogg|opus|m4a|aac)$/i;

/**
 * Generic platform source — catch-all for any URL yt-dlp supports.
 *
 * Works with Vimeo, Twitch VODs, Bandcamp, Dailymotion, Twitter/X,
 * Instagram, TikTok, Reddit, news sites with embedded video, blogs,
 * and hundreds of other sites yt-dlp can extract from.
 *
 * Unlike YouTube/SoundCloud sources which use channel-level enumeration,
 * the Generic source treats the channel URL as a playlist/page to scrape.
 * Content discovery uses yt-dlp's built-in extractors with no platform-specific logic.
 *
 * NOTE(review): URLs are handed to yt-dlp as positional arguments. A value that
 * starts with `-` could presumably be parsed as an option — confirm whether
 * `execYtDlp` (or its callers) already guards against that.
 */
export class GenericSource implements PlatformSource {
  /**
   * Resolve a URL to channel-like metadata.
   *
   * For generic URLs, the "channel" is whatever yt-dlp identifies as the
   * playlist/page/uploader. Falls back to the URL domain as the name
   * if yt-dlp can't extract structured metadata.
   *
   * @param url - Page, playlist or uploader URL to inspect.
   * @returns Metadata with `platform: 'generic'`. This method never rejects:
   *   any failure (yt-dlp error, unparsable output, invalid URL) produces a
   *   minimal record built from the URL itself.
   */
  async resolveChannel(url: string): Promise<PlatformSourceMetadata> {
    try {
      const result = await execYtDlp(
        ['--dump-single-json', '--playlist-items', '0', '--flat-playlist', url],
        { timeout: 30_000 }
      );
      const data = parseSingleJson(result.stdout) as YtDlpEntry;

      // yt-dlp returns various shapes depending on the site, so each field is
      // taken from the first key that is present.
      const name =
        data.channel ??
        data.uploader ??
        data.playlist_title ??
        data.title ??
        new URL(url).hostname;
      const platformId =
        data.channel_id ?? data.uploader_id ?? data.playlist_id ?? data.id ?? url;
      const channelUrl = data.channel_url ?? data.uploader_url ?? data.webpage_url ?? url;

      return {
        name: String(name),
        platformId: String(platformId),
        imageUrl: lastThumbnailUrl(data.thumbnails),
        url: String(channelUrl),
        platform: 'generic' as const,
        description: data.description ? String(data.description) : null,
      };
    } catch {
      // Fallback: use URL domain as name, URL as identifier.
      let hostname = 'Unknown';
      try {
        hostname = new URL(url).hostname;
      } catch {
        // Not a parseable URL; keep the placeholder name.
      }
      return {
        name: hostname,
        platformId: url,
        imageUrl: null,
        url,
        platform: 'generic' as const,
      };
    }
  }

  /**
   * Fetch content from a generic URL.
   *
   * Treats the channel URL as a page/playlist and enumerates items via
   * `--flat-playlist` (discovery). Unless `discoveryOnly` is set, every item
   * whose id is not in `existingIds` is then looked up individually
   * (enrichment), with `rateLimitDelay` milliseconds between lookups.
   *
   * @param channel - Channel whose `url` is enumerated.
   * @param options - Optional tuning. Defaults: `limit` 50, `discoveryOnly`
   *   false, `existingIds` empty, `rateLimitDelay` 2000 ms. `signal` stops
   *   enrichment early; items not yet enriched keep their discovery data.
   * @returns One entry per discovered item, in discovery order, replaced by its
   *   enriched version where enrichment succeeded.
   * @throws Whatever the discovery `execYtDlp` call rejects with; it is not
   *   caught here. Per-item enrichment failures are logged and skipped.
   */
  async fetchRecentContent(
    channel: Channel,
    options?: FetchRecentContentOptions
  ): Promise<PlatformContentMetadata[]> {
    const limit = options?.limit ?? 50;
    const discoveryOnly = options?.discoveryOnly ?? false;
    const existingIds = options?.existingIds ?? new Set<string>();
    const rateLimitDelay = options?.rateLimitDelay ?? 2000;
    const signal = options?.signal;

    // Discovery: enumerate items from the URL.
    // Base timeout of 60 s plus 30 s for every started block of 500 items.
    const discoveryTimeout = 60_000 + Math.ceil(limit / 500) * 30_000;
    const flatResult = await execYtDlp(
      ['--flat-playlist', '--dump-json', '--playlist-items', `1:${limit}`, channel.url],
      { timeout: discoveryTimeout }
    );
    const flatEntries = parseJsonLines(flatResult.stdout) as YtDlpEntry[];
    const discoveredItems = flatEntries.map((entry) => mapEntry(entry));

    if (discoveryOnly) {
      return discoveredItems;
    }

    // Enrichment: fetch full metadata for new items only.
    const newItems = discoveredItems.filter(
      (item) => !existingIds.has(item.platformContentId)
    );
    if (newItems.length === 0) return discoveredItems;

    console.log(
      `[generic] Enriching ${newItems.length} new items (${discoveredItems.length - newItems.length} already known)`
    );

    const enrichedById = new Map<string, PlatformContentMetadata>();
    for (const [index, item] of newItems.entries()) {
      // Space out requests, but let an abort cut the wait short.
      if (index > 0 && rateLimitDelay > 0 && !signal?.aborted) {
        await sleep(rateLimitDelay, signal);
      }

      // Checked after the delay as well as before it, so that an abort raised
      // while waiting does not start one more yt-dlp process.
      if (signal?.aborted) {
        console.log(`[generic] Enrichment aborted after ${index} items`);
        break;
      }

      try {
        const enrichResult = await execYtDlp(
          ['--dump-json', '--no-playlist', item.url],
          { timeout: 30_000 }
        );
        const enrichedEntry = parseSingleJson(enrichResult.stdout) as YtDlpEntry;
        enrichedById.set(item.platformContentId, mapEntry(enrichedEntry));
      } catch (err) {
        // Best effort: the item keeps its flat-playlist metadata.
        console.warn(
          `[generic] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : err}`
        );
      }
    }

    return discoveredItems.map((item) => enrichedById.get(item.platformContentId) ?? item);
  }
}

// ── Helpers ──

/**
 * Convert one yt-dlp JSON entry (flat or full) into content metadata.
 *
 * - `contentType` is `'livestream'` when `live_status` is `is_live` or
 *   `is_upcoming`; otherwise `'audio'` when the entry is typed as audio, has no
 *   video codec but an audio codec, or its URL ends in a known audio extension;
 *   otherwise `'video'`.
 * - `duration` is rounded to a whole number, or `null` when not numeric.
 * - `publishedAt` is derived from an 8-digit `upload_date` as midnight UTC,
 *   or `null` when that field is missing or malformed.
 */
function mapEntry(entry: YtDlpEntry): PlatformContentMetadata {
  const id = String(entry.id ?? entry.url ?? '');
  const title = String(entry.title ?? entry.fulltitle ?? 'Untitled');
  const url = String(entry.webpage_url ?? entry.url ?? entry.original_url ?? '');

  // Content type detection
  const liveStatus = entry.live_status;
  const isLive = liveStatus === 'is_live' || liveStatus === 'is_upcoming';
  const isAudio =
    entry._type === 'audio' ||
    (entry.vcodec === 'none' && entry.acodec !== 'none') ||
    hasAudioExtension(url);

  let contentType: ContentType = 'video';
  if (isLive) contentType = 'livestream';
  else if (isAudio) contentType = 'audio';

  // Duration
  const rawDuration = entry.duration;
  const duration = typeof rawDuration === 'number' ? Math.round(rawDuration) : null;

  // Thumbnail: prefer the thumbnails list, then the single `thumbnail` field.
  const thumbnailUrl =
    lastThumbnailUrl(entry.thumbnails) ??
    (typeof entry.thumbnail === 'string' ? entry.thumbnail : null);

  // Published date: YYYYMMDD -> ISO timestamp at midnight UTC.
  const uploadDate = entry.upload_date;
  const publishedAt =
    typeof uploadDate === 'string' && /^\d{8}$/.test(uploadDate)
      ? `${uploadDate.slice(0, 4)}-${uploadDate.slice(4, 6)}-${uploadDate.slice(6, 8)}T00:00:00Z`
      : null;

  return { platformContentId: id, title, url, contentType, duration, thumbnailUrl, publishedAt };
}

/**
 * Return the `url` of the last element of a yt-dlp `thumbnails` value.
 *
 * The last element is the one this module has always picked as the "best"
 * thumbnail. Returns `null` when the value is not a non-empty array or the
 * last element carries no usable string `url`.
 */
function lastThumbnailUrl(thumbnails: unknown): string | null {
  if (!Array.isArray(thumbnails) || thumbnails.length === 0) return null;
  const last: unknown = thumbnails[thumbnails.length - 1];
  if (typeof last !== 'object' || last === null) return null;
  const candidate = (last as { url?: unknown }).url;
  return typeof candidate === 'string' && candidate !== '' ? candidate : null;
}

/**
 * Tell whether a URL points at a file with a known audio extension.
 *
 * The full string is tested first, then the path component alone, so a
 * trailing query string or fragment (`/track.mp3?token=…`) does not hide the
 * extension.
 */
function hasAudioExtension(url: string): boolean {
  if (AUDIO_EXTENSION_PATTERN.test(url)) return true;
  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    // Not an absolute URL: strip query/fragment by hand.
    path = url.split(/[?#]/, 1)[0] ?? url;
  }
  return AUDIO_EXTENSION_PATTERN.test(path);
}

/**
 * Resolve after `ms` milliseconds, or as soon as `signal` is aborted.
 *
 * The promise never rejects; callers that care about the reason for waking up
 * should inspect `signal.aborted` afterwards.
 */
function sleep(ms: number, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve) => {
    if (signal?.aborted) {
      resolve();
      return;
    }
    const onAbort = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Remove the listener so a long-lived signal does not accumulate handlers.
      signal?.removeEventListener('abort', onAbort);
      resolve();
    }, ms);
    signal?.addEventListener('abort', onAbort, { once: true });
  });
}