- Default scanLimit increased to 500 (was 100, which missed most channel content)
- First scan (lastCheckedAt === null) uses max(scanLimit, 999) for the full catalog
- Discovery timeout scales with limit: 60s base + 30s per 500 items
- Updated platform-settings-repository defaults to match
294 lines
9.6 KiB
TypeScript
294 lines
9.6 KiB
TypeScript
import type { PlatformSource, FetchRecentContentOptions } from './platform-source';
|
|
import type {
|
|
Channel,
|
|
PlatformSourceMetadata,
|
|
PlatformContentMetadata,
|
|
PlaylistDiscoveryResult,
|
|
} from '../types/index';
|
|
import { Platform, ContentType } from '../types/index';
|
|
import { execYtDlp, parseSingleJson, parseJsonLines } from './yt-dlp';
|
|
|
|
// ── URL Validation ──
|
|
|
|
/**
 * URL prefixes accepted as YouTube sources: `/@handle`, `/channel/<id>`,
 * `/c/<name>` and `/user/<name>` on youtube.com (with or without `www.`),
 * plus `youtu.be/<id>` short links.
 *
 * Each pattern is anchored at the start only, so anything may follow the
 * matched prefix (extra path segments, query strings, ...).
 */
const YOUTUBE_URL_PATTERNS: readonly RegExp[] = [
  /^https?:\/\/(www\.)?youtube\.com\/@[\w.-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/channel\/[\w-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/c\/[\w.-]+/,
  /^https?:\/\/(www\.)?youtube\.com\/user\/[\w.-]+/,
  /^https?:\/\/youtu\.be\/[\w-]+/,
];

/**
 * Report whether `url` starts with one of the recognised YouTube URL shapes.
 *
 * The input is tested verbatim: no trimming or case folding is applied, so
 * leading whitespace or an upper-cased host makes the check fail.
 *
 * @param url - Candidate URL.
 * @returns `true` when at least one pattern in `YOUTUBE_URL_PATTERNS` matches.
 */
export function isYouTubeUrl(url: string): boolean {
  return YOUTUBE_URL_PATTERNS.some((pattern) => pattern.test(url));
}
|
|
|
|
// ── Implementation ──
|
|
|
|
/**
 * `PlatformSource` implementation for YouTube.
 *
 * Every method shells out through `execYtDlp` and parses the JSON it prints
 * on stdout; nothing is cached on the instance.
 */
export class YouTubeSource implements PlatformSource {
  /**
   * Resolve a channel URL into channel-level metadata.
   *
   * Runs a single yt-dlp call (`--dump-single-json --flat-playlist
   * --playlist-items 0`, 30s timeout) and maps the resulting object.
   *
   * @param url - Channel URL handed to yt-dlp as-is.
   * @returns Name, platform id, avatar/banner URLs, description and
   *   subscriber count. Missing fields fall back to `'Unknown Channel'`,
   *   `''`, the input `url`, or `null` as shown below.
   * @throws Whatever `execYtDlp` / `parseSingleJson` throw; nothing is caught.
   */
  async resolveChannel(url: string): Promise<PlatformSourceMetadata> {
    // NOTE(review): `--playlist-items 0` presumably keeps yt-dlp from
    // enumerating any videos so only channel metadata is returned — confirm.
    const result = await execYtDlp(
      ['--dump-single-json', '--playlist-items', '0', '--flat-playlist', url],
      { timeout: 30_000 }
    );

    const data = parseSingleJson(result.stdout) as Record<string, unknown>;

    const channelName =
      (data.channel as string) ??
      (data.uploader as string) ??
      'Unknown Channel';
    const channelId = (data.channel_id as string) ?? '';
    const channelUrl =
      (data.channel_url as string) ??
      (data.uploader_url as string) ??
      url;

    // Pick the best thumbnail — yt-dlp returns an array sorted by quality
    const thumbnails = data.thumbnails as Array<{ url?: string; width?: number }> | undefined;
    const imageUrl = thumbnails?.length
      ? (thumbnails[thumbnails.length - 1]?.url ?? null)
      : null;

    // Extract enrichment metadata
    const description = typeof data.description === 'string' ? data.description : null;
    const subscriberCount = typeof data.channel_follower_count === 'number'
      ? data.channel_follower_count
      : null;

    // Banner: try channel_banner_url first, then look for wide thumbnails (>=1024px)
    let bannerUrl: string | null = null;
    if (typeof data.channel_banner_url === 'string') {
      bannerUrl = data.channel_banner_url;
    } else if (thumbnails?.length) {
      const wideThumbnail = thumbnails.find((t) => (t.width ?? 0) >= 1024);
      if (wideThumbnail?.url) bannerUrl = wideThumbnail.url;
    }

    return {
      name: channelName,
      platformId: channelId,
      imageUrl,
      url: channelUrl,
      platform: Platform.YouTube,
      bannerUrl,
      description,
      subscriberCount,
    };
  }

  /**
   * Hybrid two-phase fetch for YouTube content.
   *
   * Phase 1 (Discovery): `--flat-playlist` for fast enumeration — returns IDs
   * and titles without per-video metadata fetches. Extremely fast even for
   * large channels.
   *
   * Phase 2 (Enrichment): For items NOT in `existingIds`, fetch full metadata
   * per-video to get `upload_date`. Respects `rateLimitDelay` between calls.
   * Individual enrichment failures are tolerated — the item is returned with
   * `publishedAt: null` rather than aborting the scan.
   *
   * @param channel - Channel whose `url` is enumerated.
   * @param options - Optional tuning:
   *   `limit` (default 50) caps the number of playlist items requested;
   *   `existingIds` (default empty) lists content ids that need no enrichment;
   *   `rateLimitDelay` (default 1000 ms) is the pause between enrichment calls;
   *   `discoveryOnly` (default false) returns right after Phase 1;
   *   `signal` stops Phase 2 early when aborted.
   * @returns All discovered items in discovery order, with enriched versions
   *   substituted where enrichment succeeded. On abort the partially enriched
   *   list is returned rather than throwing.
   * @throws Whatever the Phase 1 `execYtDlp` / `parseJsonLines` call throws.
   */
  async fetchRecentContent(
    channel: Channel,
    options?: FetchRecentContentOptions
  ): Promise<PlatformContentMetadata[]> {
    const limit = options?.limit ?? 50;
    const existingIds = options?.existingIds ?? new Set<string>();
    const rateLimitDelay = options?.rateLimitDelay ?? 1000;
    const discoveryOnly = options?.discoveryOnly ?? false;
    const signal = options?.signal;

    // Read the flag through a function so each call observes the live value
    // (a signal can flip to aborted while we are awaiting).
    const isAborted = (): boolean => signal?.aborted ?? false;

    // ── Phase 1: Fast discovery via --flat-playlist ──
    // Timeout scales with limit: 60s base + 30s per 500 items
    const discoveryTimeout = 60_000 + Math.ceil(limit / 500) * 30_000;
    const flatResult = await execYtDlp(
      [
        '--flat-playlist',
        '--dump-json',
        '--playlist-items',
        `1:${limit}`,
        channel.url,
      ],
      { timeout: discoveryTimeout }
    );

    const flatEntries = parseJsonLines(flatResult.stdout);
    const discoveredItems = flatEntries.map((entry) => mapEntry(entry));

    // If discovery-only, skip Phase 2 entirely — caller gets fast results
    if (discoveryOnly) {
      return discoveredItems;
    }

    // ── Phase 2: Enrich new items with upload_date ──
    const newItems = discoveredItems.filter(
      (item) => !existingIds.has(item.platformContentId)
    );

    if (newItems.length === 0) {
      // All items already known — return flat results as-is
      return discoveredItems;
    }

    console.log(
      `[youtube] Phase 2: enriching ${newItems.length} new items (${discoveredItems.length - newItems.length} already known)`
    );

    // Build a map of enriched data keyed by platformContentId
    const enrichedMap = new Map<string, PlatformContentMetadata>();

    for (const [i, item] of newItems.entries()) {
      // Check cancellation between enrichment calls
      if (isAborted()) {
        console.log(`[youtube] Phase 2 aborted after ${i} enrichments`);
        break;
      }

      // Rate limit delay between enrichment calls (skip before first)
      if (i > 0 && rateLimitDelay > 0) {
        await sleep(rateLimitDelay);

        // The signal may have fired during the pause; re-check so a cancelled
        // scan does not launch one more yt-dlp process.
        if (isAborted()) {
          console.log(`[youtube] Phase 2 aborted after ${i} enrichments`);
          break;
        }
      }

      try {
        const videoUrl = `https://www.youtube.com/watch?v=${item.platformContentId}`;
        const enrichResult = await execYtDlp(
          ['--dump-json', '--no-playlist', videoUrl],
          { timeout: 15_000 }
        );

        const enrichedEntry = parseSingleJson(enrichResult.stdout);
        const enrichedItem = mapEntry(enrichedEntry);
        enrichedMap.set(item.platformContentId, enrichedItem);

        console.log(
          `[youtube] Enriched ${i + 1}/${newItems.length}: ${item.platformContentId}`
        );
      } catch (err) {
        // Tolerate individual failures — item keeps flat-playlist data (publishedAt: null)
        console.warn(
          `[youtube] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : err}`
        );
      }
    }

    // Merge enriched data back into the full list
    return discoveredItems.map((item) => {
      const enriched = enrichedMap.get(item.platformContentId);
      return enriched ?? item;
    });
  }

  /**
   * Fetch ALL content for a channel — no playlist-items limit.
   * Used by back-catalog import. Extended timeout (300s) to handle
   * large channels with thousands of videos.
   *
   * @param channel - Channel whose `url` is enumerated with `--flat-playlist`.
   * @returns One mapped item per JSON line printed by yt-dlp (flat data only;
   *   no per-video enrichment is performed here).
   * @throws Whatever `execYtDlp` / `parseJsonLines` throw; nothing is caught.
   */
  async fetchAllContent(
    channel: Channel
  ): Promise<PlatformContentMetadata[]> {
    const result = await execYtDlp(
      [
        '--flat-playlist',
        '--dump-json',
        channel.url,
      ],
      { timeout: 300_000 }
    );

    const entries = parseJsonLines(result.stdout);

    return entries.map((entry) => mapEntry(entry));
  }

  /**
   * Fetch playlists for a YouTube channel, with video-to-playlist mappings.
   * Step 1: Enumerate playlists from the channel's /playlists tab.
   * Step 2: For each playlist, fetch the video IDs it contains.
   *
   * Playlists are processed sequentially, one yt-dlp call each (60s timeout).
   * Entries without an `id` are skipped.
   *
   * @param channel - Channel whose `url` is suffixed with `/playlists`.
   * @returns One result per playlist: its id, title (or
   *   `'Untitled Playlist'`) and the ids of the videos it lists.
   * @throws Whatever `execYtDlp` / `parseJsonLines` throw; a failure on any
   *   single playlist is not caught and aborts the whole call.
   */
  async fetchPlaylists(
    channel: Channel
  ): Promise<PlaylistDiscoveryResult[]> {
    // Drop trailing slashes so a stored URL like ".../@name/" does not turn
    // into ".../@name//playlists".
    const channelBase = channel.url.replace(/\/+$/, '');

    // Get playlist list from /playlists tab
    const listResult = await execYtDlp(
      ['--flat-playlist', '--dump-json', `${channelBase}/playlists`],
      { timeout: 60_000 }
    );
    const playlistEntries = parseJsonLines(listResult.stdout);

    // For each playlist, fetch its video IDs
    const results: PlaylistDiscoveryResult[] = [];
    for (const entry of playlistEntries) {
      const e = entry as Record<string, unknown>;
      const playlistId = e.id as string;
      const title = (e.title as string) ?? 'Untitled Playlist';

      if (!playlistId) continue;

      const playlistUrl = `https://www.youtube.com/playlist?list=${playlistId}`;
      const videoResult = await execYtDlp(
        ['--flat-playlist', '--dump-json', playlistUrl],
        { timeout: 60_000 }
      );
      const videoEntries = parseJsonLines(videoResult.stdout);
      const videoIds = videoEntries
        .map((v) => (v as Record<string, unknown>).id as string)
        .filter(Boolean);

      results.push({ platformPlaylistId: playlistId, title, videoIds });
    }

    return results;
  }
}
|
|
|
|
// ── Helpers ──
|
|
|
|
/**
 * Parse yt-dlp's upload_date (YYYYMMDD) into an ISO 8601 datetime string.
 * Returns null for missing or malformed values.
 *
 * "Malformed" covers anything that is not exactly eight ASCII digits as well
 * as digit strings that do not name a real calendar day (month 13, Feb 30,
 * ...), so the result is always either null or a valid midnight-UTC
 * timestamp.
 *
 * @param raw - Value of the entry's `upload_date` field, if any.
 * @returns `YYYY-MM-DDT00:00:00Z`, or `null` when `raw` is unusable.
 */
function parseUploadDate(raw: string | undefined): string | null {
  // The typeof guard also rejects non-string JSON values that reach here
  // through an `as string` cast at the call site.
  if (typeof raw !== 'string' || !/^\d{8}$/.test(raw)) return null;

  const y = raw.slice(0, 4);
  const m = raw.slice(4, 6);
  const d = raw.slice(6, 8);

  // Round-trip through Date: out-of-range components roll over (and years
  // below 100 are remapped), so any mismatch means the date is not real.
  const year = Number(y);
  const monthIndex = Number(m) - 1;
  const day = Number(d);
  const probe = new Date(Date.UTC(year, monthIndex, day));
  const isRealDate =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === monthIndex &&
    probe.getUTCDate() === day;

  return isRealDate ? `${y}-${m}-${d}T00:00:00Z` : null;
}
|
|
|
|
/**
 * Convert one raw yt-dlp JSON entry into a `PlatformContentMetadata` record.
 *
 * String fields are taken only when the JSON value really is a string;
 * otherwise the documented fallback is used (`''`, `'Untitled'`, `null`).
 *
 * @param entry - A parsed JSON object from yt-dlp output. It is treated as a
 *   plain key/value record; no schema validation is performed.
 * @returns The mapped metadata. `publishedAt` is `null` when the entry has
 *   no usable `upload_date`.
 */
function mapEntry(entry: unknown): PlatformContentMetadata {
  const e = entry as Record<string, unknown>;
  const asString = (value: unknown): string | undefined =>
    typeof value === 'string' ? value : undefined;

  return {
    platformContentId: asString(e.id) ?? '',
    title: asString(e.title) ?? 'Untitled',
    // Prefer the page URL. NOTE(review): in full (non-flat) dumps yt-dlp may
    // put the selected format's media URL in `url`, which is not a stable
    // link to the video page — confirm against real enrichment output.
    url: asString(e.webpage_url) ?? asString(e.url) ?? '',
    contentType: detectContentType(e),
    duration: typeof e.duration === 'number' ? e.duration : null,
    thumbnailUrl: extractThumbnailUrl(e),
    publishedAt: parseUploadDate(asString(e.upload_date)),
  };
}
|
|
|
|
/**
 * Classify an entry as a livestream or a regular video from its
 * `live_status` field.
 *
 * @param entry - Raw yt-dlp entry.
 * @returns `ContentType.Livestream` when `live_status` is `'is_live'` or
 *   `'is_upcoming'`; `ContentType.Video` for every other value, including a
 *   missing field.
 */
function detectContentType(
  entry: Record<string, unknown>
): PlatformContentMetadata['contentType'] {
  const liveStatus = entry.live_status as string | undefined;
  if (liveStatus === 'is_live' || liveStatus === 'is_upcoming') {
    return ContentType.Livestream;
  }
  return ContentType.Video;
}
|
|
|
|
/**
 * Pick a thumbnail URL for an entry.
 *
 * A string `thumbnail` field wins outright. Otherwise the `thumbnails` array
 * is scanned from the end and the first element carrying a string `url` is
 * used, so a trailing element without a URL no longer hides usable earlier
 * ones.
 * NOTE(review): scanning from the end assumes yt-dlp orders thumbnails from
 * worst to best — confirm.
 *
 * @param entry - Raw yt-dlp entry.
 * @returns The chosen URL, or `null` when none is available.
 */
function extractThumbnailUrl(entry: Record<string, unknown>): string | null {
  if (typeof entry.thumbnail === 'string') return entry.thumbnail;

  const thumbnails: unknown = entry.thumbnails;
  if (!Array.isArray(thumbnails)) return null;

  for (let i = thumbnails.length - 1; i >= 0; i--) {
    const candidate: unknown = thumbnails[i];
    if (typeof candidate === 'object' && candidate !== null) {
      const url = (candidate as { url?: unknown }).url;
      if (typeof url === 'string') return url;
    }
  }
  return null;
}
|
|
|
|
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * Thin promise wrapper around `setTimeout`; the returned promise never
 * rejects and the timer cannot be cancelled.
 *
 * @param ms - Delay passed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|