tubearr/src/sources/generic.ts
jlightner aa09bc089c feat: Generic platform + YouTube enhancements (chapters, SponsorBlock, thumbnails)
Generic Platform:
- New 'generic' platform type — catch-all for any URL yt-dlp supports
- GenericSource resolves channel metadata from any URL via yt-dlp extractors
- Content type auto-detection (video/audio/livestream) from yt-dlp metadata
- Works with Vimeo, Twitch, Bandcamp, Dailymotion, and 1000+ other sites
- Registered in both scheduler registry and channel route registry
- Frontend: indigo badge, URL detection fallback, AddChannelModal support

YouTube Enhancements:
- embedChapters: --embed-chapters flag on FormatProfile
- embedThumbnail: --embed-thumbnail flag on FormatProfile
- sponsorBlockRemove: --sponsorblock-remove with configurable categories
  (sponsor, selfpromo, interaction, intro, outro, preview, music_offtopic, filler)
- Migration 0011: adds columns to format_profiles table
- All three configurable per format profile via API and (future) Settings UI
2026-04-04 02:45:02 +00:00

205 lines
6.7 KiB
TypeScript

import type { Channel, PlatformSourceMetadata, PlatformContentMetadata, ContentType } from '../types/index';
import type { PlatformSource, FetchRecentContentOptions } from './platform-source';
import { execYtDlp, parseJsonLines, parseSingleJson } from './yt-dlp';
/**
 * Generic platform source — catch-all for any URL yt-dlp supports.
 *
 * Works with Vimeo, Twitch VODs, Bandcamp, Dailymotion, Twitter/X,
 * Instagram, TikTok, Reddit, news sites with embedded video, blogs,
 * and hundreds of other sites yt-dlp can extract from.
 *
 * Unlike YouTube/SoundCloud sources which use channel-level enumeration,
 * the Generic source treats the channel URL as a playlist/page to scrape.
 * Content discovery uses yt-dlp's built-in extractors with no
 * platform-specific logic.
 */
export class GenericSource implements PlatformSource {
  /**
   * Resolve a URL to channel-like metadata.
   *
   * For generic URLs, the "channel" is whatever yt-dlp identifies as the
   * channel, uploader or playlist. Each field falls back through several
   * yt-dlp keys and finally to something derived from the URL itself.
   *
   * This method is best-effort and does not reject: if yt-dlp fails, its
   * output cannot be parsed, or `url` is not a valid URL, the failure is
   * logged and a minimal record built from the URL is returned instead.
   *
   * @param url - Page, playlist or channel URL supplied by the user.
   * @returns Metadata with `platform` set to `'generic'`.
   */
  async resolveChannel(url: string): Promise<PlatformSourceMetadata> {
    try {
      const result = await execYtDlp(
        [
          '--dump-single-json',
          // NOTE(review): item index 0 is presumably used to get container
          // metadata without enumerating entries — confirm against yt-dlp.
          '--playlist-items', '0',
          '--flat-playlist',
          url,
        ],
        { timeout: 30_000 }
      );
      const data = parseSingleJson(result.stdout) as Record<string, unknown>;

      // yt-dlp returns various shapes depending on the site, so every field
      // walks a chain of candidate keys from most to least specific.
      const name = data.channel
        ?? data.uploader
        ?? data.playlist_title
        ?? data.title
        ?? new URL(url).hostname;
      const platformId = data.channel_id
        ?? data.uploader_id
        ?? data.playlist_id
        ?? data.id
        ?? url;
      const channelUrl = data.channel_url
        ?? data.uploader_url
        ?? data.webpage_url
        ?? url;

      // The last thumbnail in the list is taken as the best one.
      const thumbnails = data.thumbnails as Array<{ url: string; width?: number }> | undefined;
      const imageUrl = thumbnails?.length
        ? thumbnails[thumbnails.length - 1].url
        : null;

      return {
        name: String(name),
        platformId: String(platformId),
        imageUrl,
        url: String(channelUrl),
        platform: 'generic' as const,
        description: data.description ? String(data.description) : null,
      };
    } catch (err) {
      // Deliberately best-effort: keep going with a URL-derived record, but
      // leave a trace so extractor failures and bad URLs can be diagnosed.
      console.warn(
        `[generic] Could not resolve channel metadata for ${url}: ${err instanceof Error ? err.message : String(err)}`
      );

      // Fallback: use URL domain as name, URL as identifier.
      let hostname = 'Unknown';
      try {
        hostname = new URL(url).hostname;
      } catch {
        // `url` is not parseable; keep the placeholder name.
      }
      return {
        name: hostname,
        platformId: url,
        imageUrl: null,
        url,
        platform: 'generic' as const,
      };
    }
  }

  /**
   * Fetch content from a generic URL.
   *
   * Runs in two phases:
   *  1. Discovery — enumerate up to `limit` items from `channel.url` with
   *     `--flat-playlist`. A failure here propagates to the caller.
   *  2. Enrichment — for every discovered item whose id is not in
   *     `existingIds`, fetch full metadata one item at a time, waiting
   *     `rateLimitDelay` ms between requests. A failure for a single item is
   *     logged and that item keeps its discovery-phase metadata.
   *
   * Enrichment stops early when `options.signal` is aborted; items not yet
   * enriched are returned with their discovery-phase metadata.
   *
   * @param channel - Channel whose `url` is enumerated.
   * @param options - Optional `limit` (default 50), `discoveryOnly`
   *   (default false), `existingIds`, `rateLimitDelay` in ms (default 2000)
   *   and an abort `signal`.
   * @returns One entry per discovered item, in discovery order.
   */
  async fetchRecentContent(
    channel: Channel,
    options?: FetchRecentContentOptions
  ): Promise<PlatformContentMetadata[]> {
    const limit = options?.limit ?? 50;
    const discoveryOnly = options?.discoveryOnly ?? false;
    const existingIds = options?.existingIds ?? new Set<string>();
    const rateLimitDelay = options?.rateLimitDelay ?? 2000;
    const signal = options?.signal;
    // Read through a closure so each call observes the signal's current state.
    const isAborted = (): boolean => signal?.aborted ?? false;

    // Discovery: enumerate items from the URL. The timeout grows by 30 s for
    // every started block of 500 requested items on top of a 60 s base.
    const discoveryTimeout = 60_000 + Math.ceil(limit / 500) * 30_000;
    const flatResult = await execYtDlp(
      [
        '--flat-playlist',
        '--dump-json',
        '--playlist-items', `1:${limit}`,
        channel.url,
      ],
      { timeout: discoveryTimeout }
    );
    const flatEntries = parseJsonLines(flatResult.stdout) as Record<string, unknown>[];
    const discoveredItems = flatEntries.map((entry) => mapEntry(entry));
    if (discoveryOnly) {
      return discoveredItems;
    }

    // Enrichment: fetch full metadata for new items only.
    const newItems = discoveredItems.filter(
      (item) => !existingIds.has(item.platformContentId)
    );
    if (newItems.length === 0) return discoveredItems;
    console.log(
      `[generic] Enriching ${newItems.length} new items (${discoveredItems.length - newItems.length} already known)`
    );

    const enrichedMap = new Map<string, PlatformContentMetadata>();
    for (let i = 0; i < newItems.length; i++) {
      if (isAborted()) {
        console.log(`[generic] Enrichment aborted after ${i} items`);
        break;
      }
      const item = newItems[i];
      if (i > 0 && rateLimitDelay > 0) {
        await sleep(rateLimitDelay);
        // The caller may have cancelled while we were waiting; check again
        // so a cancelled scan does not start another yt-dlp call.
        if (isAborted()) {
          console.log(`[generic] Enrichment aborted after ${i} items`);
          break;
        }
      }
      try {
        const enrichResult = await execYtDlp(
          ['--dump-json', '--no-playlist', item.url],
          { timeout: 30_000 }
        );
        const enrichedEntry = parseSingleJson(enrichResult.stdout) as Record<string, unknown>;
        // Keyed by the discovery-phase id so the merge below lines up even
        // if the enriched entry reports a different id.
        enrichedMap.set(item.platformContentId, mapEntry(enrichedEntry));
      } catch (err) {
        console.warn(
          `[generic] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : err}`
        );
      }
    }

    // Merge: prefer enriched metadata, keep discovery metadata otherwise.
    return discoveredItems.map((item) => {
      const enriched = enrichedMap.get(item.platformContentId);
      return enriched ?? item;
    });
  }
}
// ── Helpers ──
/**
 * Report whether a URL points at a file with a known audio extension.
 *
 * The URL is checked both as given and with its query string / fragment
 * removed, so links such as `https://cdn.example/track.mp3?token=abc`
 * are recognised as audio too.
 */
function hasAudioExtension(url: string): boolean {
  const audioExtension = /\.(mp3|flac|wav|ogg|opus|m4a|aac)$/i;
  const withoutQueryOrFragment = url.split(/[?#]/, 1)[0] ?? '';
  return audioExtension.test(url) || audioExtension.test(withoutQueryOrFragment);
}

/**
 * Convert one yt-dlp JSON entry into the platform-neutral content shape.
 *
 * Missing fields degrade to neutral values: an empty string for id/url,
 * `'Untitled'` for the title, and `null` for duration, thumbnail and
 * publish date.
 *
 * @param entry - Parsed yt-dlp JSON object (flat-playlist or full dump).
 * @returns Content metadata; `contentType` is `'livestream'` when
 *   `live_status` is `is_live` or `is_upcoming`, `'audio'` when the entry
 *   has no video codec or its URL has an audio file extension, and
 *   `'video'` otherwise.
 */
function mapEntry(entry: Record<string, unknown>): PlatformContentMetadata {
  const id = String(entry.id ?? entry.url ?? '');
  const title = String(entry.title ?? entry.fulltitle ?? 'Untitled');
  const url = String(entry.webpage_url ?? entry.url ?? entry.original_url ?? '');

  // Content type detection — live status takes precedence over audio.
  const isLive = entry.live_status === 'is_live' || entry.live_status === 'is_upcoming';
  const isAudio = entry._type === 'audio'
    || (entry.vcodec === 'none' && entry.acodec !== 'none')
    || hasAudioExtension(url);
  let contentType: ContentType = 'video';
  if (isLive) contentType = 'livestream';
  else if (isAudio) contentType = 'audio';

  // Duration in whole seconds; non-numeric or non-finite values become null.
  const rawDuration = entry.duration;
  const duration = typeof rawDuration === 'number' && Number.isFinite(rawDuration)
    ? Math.round(rawDuration)
    : null;

  // Thumbnail — the last list entry is taken as the best quality. If the
  // list is missing, malformed, or its last entry has no usable URL, fall
  // back to the single `thumbnail` field so the result is a string or null.
  const thumbnailList: Array<{ url?: unknown } | null> = Array.isArray(entry.thumbnails)
    ? entry.thumbnails
    : [];
  const lastThumbnail = thumbnailList.length > 0
    ? thumbnailList[thumbnailList.length - 1]
    : undefined;
  const listedUrl = lastThumbnail?.url;
  let thumbnailUrl: string | null = null;
  if (typeof listedUrl === 'string') {
    thumbnailUrl = listedUrl;
  } else if (typeof entry.thumbnail === 'string') {
    thumbnailUrl = entry.thumbnail;
  }

  // Published date — `upload_date` is accepted only as eight digits
  // (YYYYMMDD) and is expanded to midnight UTC of that day.
  let publishedAt: string | null = null;
  const uploadDate = entry.upload_date;
  if (typeof uploadDate === 'string' && /^\d{8}$/.test(uploadDate)) {
    publishedAt = `${uploadDate.slice(0, 4)}-${uploadDate.slice(4, 6)}-${uploadDate.slice(6, 8)}T00:00:00Z`;
  }

  return { platformContentId: id, title, url, contentType, duration, thumbnailUrl, publishedAt };
}
/**
 * Pause for roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}