tubearr/src/sources/youtube.ts
jlightner f494d31e60 fix: raise default scan limit from 100 to 500, use 999 for initial scans
- Default scanLimit increased to 500 (was 100, missing most channel content)
- First scan (lastCheckedAt === null) uses max(scanLimit, 999) for full catalog
- Discovery timeout scales with limit: 60s base + 30s per 500 items
- Updated platform-settings-repository defaults to match
2026-04-03 22:07:24 +00:00

294 lines
9.6 KiB
TypeScript

import type { PlatformSource, FetchRecentContentOptions } from './platform-source';
import type {
Channel,
PlatformSourceMetadata,
PlatformContentMetadata,
PlaylistDiscoveryResult,
} from '../types/index';
import { Platform, ContentType } from '../types/index';
import { execYtDlp, parseSingleJson, parseJsonLines } from './yt-dlp';
// ── URL Validation ──
/**
 * URL shapes recognised as YouTube: `@handle`, `/channel/<id>`, `/c/<name>` and
 * `/user/<name>` pages on youtube.com (with or without `www.`), plus
 * `youtu.be/<id>` short links. Patterns are anchored at the start only, so any
 * trailing path or query string is accepted.
 */
const YOUTUBE_URL_PATTERNS = [
  /^https?:\/\/(www\.)?youtube\.com\/@[\w.-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/channel\/[\w-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/c\/[\w.-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/user\/[\w.-]+/,
  /^https?:\/\/youtu\.be\/[\w-]+/,
];

/**
 * Report whether `url` matches at least one of the known YouTube URL shapes.
 *
 * @param url - Candidate URL string.
 * @returns `true` on the first matching pattern, otherwise `false`.
 */
export function isYouTubeUrl(url: string): boolean {
  for (const pattern of YOUTUBE_URL_PATTERNS) {
    if (pattern.test(url)) {
      return true;
    }
  }
  return false;
}
// ── Implementation ──
/**
 * YouTube implementation of `PlatformSource`.
 *
 * Every method runs yt-dlp through `execYtDlp` and maps the JSON it prints
 * into the project's metadata types. Unless stated otherwise, errors raised by
 * `execYtDlp` or the JSON parsers are not caught and propagate to the caller.
 */
export class YouTubeSource implements PlatformSource {
  /**
   * Resolve channel-level metadata for a YouTube URL.
   *
   * Runs yt-dlp with `--dump-single-json --flat-playlist --playlist-items 0`
   * (30s timeout) and reads the channel fields from the resulting object.
   *
   * Fallbacks:
   * - name: `channel`, then `uploader`, then `'Unknown Channel'`
   * - platformId: `channel_id`, else an empty string
   * - url: `channel_url`, then `uploader_url`, then the input `url`
   *
   * @param url - Channel URL to resolve.
   * @returns Metadata tagged with `Platform.YouTube`.
   */
  async resolveChannel(url: string): Promise<PlatformSourceMetadata> {
    const result = await execYtDlp(
      ['--dump-single-json', '--playlist-items', '0', '--flat-playlist', url],
      { timeout: 30_000 }
    );
    const data = parseSingleJson(result.stdout) as Record<string, unknown>;

    const channelName =
      (data.channel as string) ??
      (data.uploader as string) ??
      'Unknown Channel';
    const channelId = (data.channel_id as string) ?? '';
    const channelUrl =
      (data.channel_url as string) ??
      (data.uploader_url as string) ??
      url;

    // Pick the best thumbnail — yt-dlp returns an array sorted by quality,
    // so the last entry is used.
    const thumbnails = data.thumbnails as
      | Array<{ url?: string; width?: number }>
      | undefined;
    const imageUrl = thumbnails?.length
      ? (thumbnails[thumbnails.length - 1]?.url ?? null)
      : null;

    // Enrichment metadata: only accepted when the JSON value has the expected type.
    const description =
      typeof data.description === 'string' ? data.description : null;
    const subscriberCount =
      typeof data.channel_follower_count === 'number'
        ? data.channel_follower_count
        : null;

    // Banner: prefer channel_banner_url, otherwise the first thumbnail that is
    // at least 1024px wide.
    let bannerUrl: string | null = null;
    if (typeof data.channel_banner_url === 'string') {
      bannerUrl = data.channel_banner_url;
    } else if (thumbnails?.length) {
      const wideThumbnail = thumbnails.find((t) => (t.width ?? 0) >= 1024);
      if (wideThumbnail?.url) bannerUrl = wideThumbnail.url;
    }

    return {
      name: channelName,
      platformId: channelId,
      imageUrl,
      url: channelUrl,
      platform: Platform.YouTube,
      bannerUrl,
      description,
      subscriberCount,
    };
  }

  /**
   * Hybrid two-phase fetch for YouTube content.
   *
   * Phase 1 (Discovery): `--flat-playlist` enumeration of the first `limit`
   * playlist items. No per-video metadata fetch happens in this phase.
   *
   * Phase 2 (Enrichment): for items NOT in `existingIds`, fetch full metadata
   * per video to obtain `upload_date`. `rateLimitDelay` is waited between
   * calls. Individual enrichment failures are tolerated — the item keeps its
   * flat-playlist data rather than aborting the scan. Entries without a video
   * ID are never enriched because no watch URL can be built for them.
   *
   * Option defaults: `limit` 50, `existingIds` empty, `rateLimitDelay` 1000 ms,
   * `discoveryOnly` false.
   *
   * @param channel - Channel whose `url` is enumerated.
   * @param options - Scan options; `signal` is checked before every enrichment
   *   call (including right after the rate-limit wait) and stops Phase 2 early.
   * @returns All discovered items in discovery order, with enriched versions
   *   substituted where enrichment succeeded.
   */
  async fetchRecentContent(
    channel: Channel,
    options?: FetchRecentContentOptions
  ): Promise<PlatformContentMetadata[]> {
    const limit = options?.limit ?? 50;
    const existingIds = options?.existingIds ?? new Set<string>();
    const rateLimitDelay = options?.rateLimitDelay ?? 1000;
    const discoveryOnly = options?.discoveryOnly ?? false;
    const signal = options?.signal;

    // ── Phase 1: Fast discovery via --flat-playlist ──
    // Timeout scales with limit: 60s base + 30s per 500 items
    const discoveryTimeout = 60_000 + Math.ceil(limit / 500) * 30_000;
    const flatResult = await execYtDlp(
      [
        '--flat-playlist',
        '--dump-json',
        '--playlist-items',
        `1:${limit}`,
        channel.url,
      ],
      { timeout: discoveryTimeout }
    );
    const flatEntries = parseJsonLines(flatResult.stdout);
    const discoveredItems = flatEntries.map((entry) => mapEntry(entry));

    // If discovery-only, skip Phase 2 entirely — caller gets fast results
    if (discoveryOnly) {
      return discoveredItems;
    }

    // ── Phase 2: Enrich new items with upload_date ──
    const unknownItems = discoveredItems.filter(
      (item) => !existingIds.has(item.platformContentId)
    );
    // An empty ID would produce the URL `watch?v=`, which can never resolve;
    // skip those instead of spending a yt-dlp call and a rate-limit delay.
    const newItems = unknownItems.filter(
      (item) => item.platformContentId !== ''
    );
    if (newItems.length === 0) {
      // Nothing enrichable — return flat results as-is
      return discoveredItems;
    }

    console.log(
      `[youtube] Phase 2: enriching ${newItems.length} new items (${discoveredItems.length - unknownItems.length} already known)`
    );

    // Enriched data keyed by platformContentId
    const enrichedMap = new Map<string, PlatformContentMetadata>();
    for (const [i, item] of newItems.entries()) {
      // Rate limit delay between enrichment calls (skip before the first, and
      // skip when cancellation was already requested).
      if (i > 0 && rateLimitDelay > 0 && !signal?.aborted) {
        await sleep(rateLimitDelay);
      }

      // Check cancellation after the wait so an abort that arrived during the
      // delay does not start another yt-dlp process.
      if (signal?.aborted) {
        console.log(`[youtube] Phase 2 aborted after ${i} enrichments`);
        break;
      }

      try {
        const videoUrl = `https://www.youtube.com/watch?v=${item.platformContentId}`;
        const enrichResult = await execYtDlp(
          ['--dump-json', '--no-playlist', videoUrl],
          { timeout: 15_000 }
        );
        const enrichedEntry = parseSingleJson(enrichResult.stdout);
        const enrichedItem = mapEntry(enrichedEntry);
        enrichedMap.set(item.platformContentId, enrichedItem);
        console.log(
          `[youtube] Enriched ${i + 1}/${newItems.length}: ${item.platformContentId}`
        );
      } catch (err) {
        // Tolerate individual failures — item keeps flat-playlist data (publishedAt: null)
        console.warn(
          `[youtube] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : err}`
        );
      }
    }

    // Merge enriched data back into the full list, preserving discovery order
    return discoveredItems.map(
      (item) => enrichedMap.get(item.platformContentId) ?? item
    );
  }

  /**
   * Fetch ALL content for a channel — no `--playlist-items` limit.
   * Used by back-catalog import. Uses an extended timeout (300s) to handle
   * large channels with thousands of videos. Results come from
   * `--flat-playlist` only; no per-video enrichment is performed.
   *
   * @param channel - Channel whose `url` is enumerated.
   * @returns One mapped item per JSON line printed by yt-dlp.
   */
  async fetchAllContent(
    channel: Channel
  ): Promise<PlatformContentMetadata[]> {
    const result = await execYtDlp(
      ['--flat-playlist', '--dump-json', channel.url],
      { timeout: 300_000 }
    );
    const entries = parseJsonLines(result.stdout);
    return entries.map((entry) => mapEntry(entry));
  }

  /**
   * Fetch playlists for a YouTube channel, with video-to-playlist mappings.
   * Step 1: Enumerate playlists from the channel's /playlists tab.
   * Step 2: For each playlist, fetch the video IDs it contains.
   *
   * Playlists are processed sequentially, each with a 60s timeout. Entries
   * without an `id` are skipped. A failure while fetching any single playlist
   * is not caught and rejects the whole call.
   *
   * @param channel - Channel whose playlists are enumerated.
   * @returns One result per playlist with its ID, title (or
   *   `'Untitled Playlist'`) and the IDs of its videos.
   */
  async fetchPlaylists(
    channel: Channel
  ): Promise<PlaylistDiscoveryResult[]> {
    // Strip trailing slashes so a stored URL like `.../@name/` does not turn
    // into `.../@name//playlists`.
    const playlistsTabUrl = `${channel.url.replace(/\/+$/, '')}/playlists`;

    // Get playlist list from /playlists tab
    const listResult = await execYtDlp(
      ['--flat-playlist', '--dump-json', playlistsTabUrl],
      { timeout: 60_000 }
    );
    const playlistEntries = parseJsonLines(listResult.stdout);

    // For each playlist, fetch its video IDs
    const results: PlaylistDiscoveryResult[] = [];
    for (const entry of playlistEntries) {
      const e = entry as Record<string, unknown>;
      const playlistId = e.id as string;
      const title = (e.title as string) ?? 'Untitled Playlist';
      if (!playlistId) continue;

      const playlistUrl = `https://www.youtube.com/playlist?list=${playlistId}`;
      const videoResult = await execYtDlp(
        ['--flat-playlist', '--dump-json', playlistUrl],
        { timeout: 60_000 }
      );
      const videoEntries = parseJsonLines(videoResult.stdout);
      const videoIds = videoEntries
        .map((v) => (v as Record<string, unknown>).id as string)
        .filter(Boolean);

      results.push({ platformPlaylistId: playlistId, title, videoIds });
    }
    return results;
  }
}
// ── Helpers ──
/**
 * Parse yt-dlp's upload_date (YYYYMMDD) into an ISO 8601 datetime string
 * at midnight UTC.
 *
 * @param raw - The raw `upload_date` value, if present.
 * @returns `YYYY-MM-DDT00:00:00Z`, or `null` when the value is missing, is not
 *   exactly eight ASCII digits, or does not name a real calendar day
 *   (e.g. month 13 or February 30).
 */
function parseUploadDate(raw: string | undefined): string | null {
  // The value comes from untyped JSON, so guard against non-strings at runtime.
  if (typeof raw !== 'string') return null;

  const match = /^(\d{4})(\d{2})(\d{2})$/.exec(raw);
  if (!match) return null;
  const [, y, m, d] = match;

  // Round-trip through Date.UTC: out-of-range components roll over to a
  // different day, so any mismatch means the input was not a real date.
  const year = Number(y);
  const monthIndex = Number(m) - 1;
  const day = Number(d);
  const probe = new Date(Date.UTC(year, monthIndex, day));
  if (
    probe.getUTCFullYear() !== year ||
    probe.getUTCMonth() !== monthIndex ||
    probe.getUTCDate() !== day
  ) {
    return null;
  }

  return `${y}-${m}-${d}T00:00:00Z`;
}
/**
 * Convert one raw yt-dlp JSON entry into `PlatformContentMetadata`.
 *
 * Missing fields fall back to: empty string for the ID and URL, `'Untitled'`
 * for the title, and `null` for a non-numeric duration. The URL is taken from
 * `url` first and `webpage_url` second.
 *
 * NOTE(review): for full (non-flat) yt-dlp output, `url` may not be the watch
 * page URL — confirm that preferring it over `webpage_url` is intended.
 */
function mapEntry(entry: unknown): PlatformContentMetadata {
  const raw = entry as Record<string, unknown>;

  const platformContentId = (raw.id as string) ?? '';
  const title = (raw.title as string) ?? 'Untitled';
  const url = (raw.url as string) ?? (raw.webpage_url as string) ?? '';
  const contentType = detectContentType(raw);
  const duration = typeof raw.duration === 'number' ? raw.duration : null;
  const thumbnailUrl = extractThumbnailUrl(raw);
  const publishedAt = parseUploadDate(raw.upload_date as string | undefined);

  return {
    platformContentId,
    title,
    url,
    contentType,
    duration,
    thumbnailUrl,
    publishedAt,
  };
}
/**
 * Classify an entry by its `live_status` field: `'is_live'` and
 * `'is_upcoming'` map to `ContentType.Livestream`; every other value
 * (including a missing field) maps to `ContentType.Video`.
 */
function detectContentType(
  entry: Record<string, unknown>
): PlatformContentMetadata['contentType'] {
  switch (entry.live_status) {
    case 'is_live':
    case 'is_upcoming':
      return ContentType.Livestream;
    default:
      return ContentType.Video;
  }
}
/**
 * Pick a thumbnail URL from a yt-dlp entry.
 *
 * A string `thumbnail` field wins outright. Otherwise the `url` of the last
 * element of a non-empty `thumbnails` array is used. Returns `null` when
 * neither source yields a URL.
 */
function extractThumbnailUrl(entry: Record<string, unknown>): string | null {
  const single = entry.thumbnail;
  if (typeof single === 'string') {
    return single;
  }

  const candidates = entry.thumbnails as Array<{ url?: string }> | undefined;
  if (!candidates?.length) {
    return null;
  }

  const last = candidates[candidates.length - 1];
  return last?.url ?? null;
}
/**
 * Return a promise that resolves after `ms` milliseconds, scheduled with
 * `setTimeout`. The promise never rejects.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}