All checks were successful
CI / test (push) Successful in 19s
- "src/services/scheduler.ts" - "src/services/missing-file-scanner.ts" GSD-Task: S08/T02
624 lines
21 KiB
TypeScript
624 lines
21 KiB
TypeScript
import { Cron } from 'croner';
|
|
import type { LibSQLDatabase } from 'drizzle-orm/libsql';
|
|
import type * as schema from '../db/schema/index';
|
|
import type { Channel, Platform, PlatformContentMetadata } from '../types/index';
|
|
import type { PlatformRegistry, PlatformSource, FetchRecentContentOptions } from '../sources/platform-source';
|
|
import type { RateLimiter } from './rate-limiter';
|
|
import { YtDlpError, execYtDlp, parseSingleJson } from '../sources/yt-dlp';
|
|
import type { EventBus } from './event-bus';
|
|
import { matchesKeywordFilter } from './keyword-filter';
|
|
import {
|
|
getEnabledChannels,
|
|
updateChannel,
|
|
} from '../db/repositories/channel-repository';
|
|
import {
|
|
createContentItem,
|
|
getRecentContentIds,
|
|
getContentByPlatformContentId,
|
|
updateContentItem,
|
|
} from '../db/repositories/content-repository';
|
|
import { getPlatformSettings } from '../db/repositories/platform-settings-repository';
|
|
import { upsertPlaylists } from '../db/repositories/playlist-repository';
|
|
|
|
// ── Types ──
|
|
|
|
export interface ChannelJobState {
|
|
channelId: number;
|
|
channelName: string;
|
|
platform: Platform;
|
|
isRunning: boolean;
|
|
nextRun: Date | null;
|
|
lastCheckedAt: string | null;
|
|
lastCheckStatus: string | null;
|
|
}
|
|
|
|
export interface SchedulerState {
|
|
running: boolean;
|
|
channelCount: number;
|
|
channels: ChannelJobState[];
|
|
}
|
|
|
|
export interface CheckChannelResult {
|
|
channelId: number;
|
|
channelName: string;
|
|
newItems: number;
|
|
totalFetched: number;
|
|
status: 'success' | 'error' | 'rate_limited' | 'already_running';
|
|
}
|
|
|
|
/** Optional configuration for the scheduler service. */
|
|
export interface SchedulerOptions {
|
|
/** Called when a new content item is inserted — used to auto-enqueue for download. */
|
|
onNewContent?: (contentItemId: number) => void;
|
|
/** Event bus for broadcasting scan lifecycle events to WebSocket clients. */
|
|
eventBus?: EventBus;
|
|
}
|
|
|
|
// ── Scheduler Service ──
|
|
|
|
/**
|
|
* Manages per-channel cron jobs for content monitoring.
|
|
*
|
|
* Loads enabled channels from the database, creates Cron jobs that periodically
|
|
* check for new content via platform sources, deduplicates against existing
|
|
* records, and inserts new items with `monitored` status.
|
|
*/
|
|
export class SchedulerService {
|
|
private readonly db: LibSQLDatabase<typeof schema>;
|
|
private readonly platformRegistry: PlatformRegistry;
|
|
private readonly rateLimiter: RateLimiter;
|
|
private readonly onNewContent?: (contentItemId: number) => void;
|
|
private readonly eventBus?: EventBus;
|
|
private readonly jobs = new Map<number, Cron>();
|
|
private readonly channelCache = new Map<number, Channel>();
|
|
private readonly activeChecks = new Set<number>();
|
|
private readonly activeAbortControllers = new Map<number, AbortController>();
|
|
private running = false;
|
|
|
|
constructor(
|
|
db: LibSQLDatabase<typeof schema>,
|
|
platformRegistry: PlatformRegistry,
|
|
rateLimiter: RateLimiter,
|
|
options?: SchedulerOptions
|
|
) {
|
|
this.db = db;
|
|
this.platformRegistry = platformRegistry;
|
|
this.rateLimiter = rateLimiter;
|
|
this.onNewContent = options?.onNewContent;
|
|
this.eventBus = options?.eventBus;
|
|
}
|
|
|
|
/**
|
|
* Load all enabled channels and create cron jobs for each.
|
|
* Returns the number of channels loaded.
|
|
*/
|
|
async start(): Promise<number> {
|
|
const channels = await getEnabledChannels(this.db);
|
|
|
|
for (const channel of channels) {
|
|
this.createJob(channel);
|
|
}
|
|
|
|
this.running = true;
|
|
console.log(`[scheduler] Started with ${channels.length} channels`);
|
|
return channels.length;
|
|
}
|
|
|
|
/** Stop all active cron jobs. */
|
|
stop(): void {
|
|
for (const [channelId, job] of this.jobs) {
|
|
job.stop();
|
|
console.log(`[scheduler] Stopped job for channel ${channelId}`);
|
|
}
|
|
this.jobs.clear();
|
|
this.channelCache.clear();
|
|
this.running = false;
|
|
console.log('[scheduler] Stopped');
|
|
}
|
|
|
|
/** Create a cron job for a newly added channel. */
|
|
addChannel(channel: Channel): void {
|
|
if (!channel.monitoringEnabled) return;
|
|
this.createJob(channel);
|
|
console.log(
|
|
`[scheduler] Added job for channel ${channel.id} ("${channel.name}") — interval: ${channel.checkInterval}m`
|
|
);
|
|
}
|
|
|
|
/** Stop and remove the cron job for a deleted channel. */
|
|
removeChannel(channelId: number): void {
|
|
const job = this.jobs.get(channelId);
|
|
if (job) {
|
|
job.stop();
|
|
this.jobs.delete(channelId);
|
|
this.channelCache.delete(channelId);
|
|
console.log(`[scheduler] Removed job for channel ${channelId}`);
|
|
}
|
|
}
|
|
|
|
/** Update a channel's cron job (remove old, create new with updated interval). */
|
|
updateChannel(channel: Channel): void {
|
|
this.removeChannel(channel.id);
|
|
if (channel.monitoringEnabled) {
|
|
this.createJob(channel);
|
|
console.log(
|
|
`[scheduler] Updated job for channel ${channel.id} ("${channel.name}") — interval: ${channel.checkInterval}m`
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Check a channel for new content.
|
|
*
|
|
* 1. Check per-channel lock (reject overlap)
|
|
* 2. Acquire rate limiter slot for the platform
|
|
* 3. Fetch recent content via platform source
|
|
* 4. Deduplicate against existing content
|
|
* 5. Insert new items with `monitored` status
|
|
* 6. Update channel's lastCheckedAt and lastCheckStatus
|
|
*
|
|
* Returns a structured result with item counts and status.
|
|
*/
|
|
async checkChannel(channel: Channel, signal?: AbortSignal): Promise<CheckChannelResult> {
|
|
// Per-channel lock — reject overlap before any async work
|
|
if (this.activeChecks.has(channel.id)) {
|
|
console.log(
|
|
`[scheduler] Skipping channel ${channel.id} ("${channel.name}") — already running`
|
|
);
|
|
return {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: 0,
|
|
totalFetched: 0,
|
|
status: 'already_running',
|
|
};
|
|
}
|
|
|
|
// Create AbortController for this scan if no external signal provided
|
|
const abortController = new AbortController();
|
|
const effectiveSignal = signal ?? abortController.signal;
|
|
// Link external signal to our controller if provided
|
|
if (signal) {
|
|
signal.addEventListener('abort', () => abortController.abort(signal.reason), { once: true });
|
|
}
|
|
|
|
this.activeChecks.add(channel.id);
|
|
this.activeAbortControllers.set(channel.id, abortController);
|
|
|
|
// Emit scan:started before any async work
|
|
this.eventBus?.emitScan('scan:started', {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
});
|
|
|
|
console.log(
|
|
`[scheduler] Checking channel ${channel.id} ("${channel.name}") on ${channel.platform}`
|
|
);
|
|
|
|
try {
|
|
// 1. Rate limit
|
|
await this.rateLimiter.acquire(channel.platform);
|
|
|
|
// 2. Get platform source
|
|
const source = this.platformRegistry.get(channel.platform);
|
|
if (!source) {
|
|
throw new Error(
|
|
`No platform source registered for "${channel.platform}"`
|
|
);
|
|
}
|
|
|
|
// 3. Load platform settings for scan limit and rate limit delay
|
|
const platformSettingsRow = await getPlatformSettings(this.db, channel.platform);
|
|
const baseScanLimit = platformSettingsRow?.scanLimit ?? 500;
|
|
const rateLimitDelay = platformSettingsRow?.rateLimitDelay ?? 1000;
|
|
|
|
// First scan (lastCheckedAt === null) → grab full catalog up to 999
|
|
const scanLimit = channel.lastCheckedAt === null
|
|
? Math.max(baseScanLimit, 999)
|
|
: baseScanLimit;
|
|
|
|
// 4. Load existing content IDs for dedup gating
|
|
const existingIds = new Set(
|
|
await getRecentContentIds(this.db, channel.id)
|
|
);
|
|
|
|
// 5. Fetch content — discovery-only (fast Phase 1, skip slow enrichment)
|
|
const fetchOptions: FetchRecentContentOptions = {
|
|
limit: scanLimit,
|
|
existingIds,
|
|
rateLimitDelay,
|
|
discoveryOnly: true,
|
|
signal: effectiveSignal,
|
|
};
|
|
const items: PlatformContentMetadata[] =
|
|
await source.fetchRecentContent(channel, fetchOptions);
|
|
|
|
// 6. Deduplicate — filter out items already known
|
|
const newItems = items.filter(
|
|
(item) => !existingIds.has(item.platformContentId)
|
|
);
|
|
|
|
// 6b. Apply keyword filter — exclude/include patterns from channel settings
|
|
const filteredItems = newItems.filter((item) =>
|
|
matchesKeywordFilter(item.title, channel.includeKeywords, channel.excludeKeywords)
|
|
);
|
|
if (filteredItems.length < newItems.length) {
|
|
console.log(
|
|
`[scheduler] Keyword filter: ${newItems.length - filteredItems.length} of ${newItems.length} new items filtered out for channel ${channel.id}`
|
|
);
|
|
}
|
|
|
|
// 7. Insert new items (check abort between each)
|
|
let insertedCount = 0;
|
|
for (const item of filteredItems) {
|
|
// Check if scan was cancelled
|
|
if (effectiveSignal.aborted) {
|
|
console.log(
|
|
`[scheduler] Scan cancelled for channel ${channel.id} ("${channel.name}") after ${insertedCount} items`
|
|
);
|
|
this.eventBus?.emitScan('scan:complete', {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: insertedCount,
|
|
totalFetched: items.length,
|
|
});
|
|
return {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: insertedCount,
|
|
totalFetched: items.length,
|
|
status: 'success',
|
|
};
|
|
}
|
|
// Scheduler discovers *new* content (future), so 'all' and 'future' → monitored
|
|
const monitored = channel.monitoringMode === 'all' || channel.monitoringMode === 'future';
|
|
const created = await createContentItem(this.db, {
|
|
channelId: channel.id,
|
|
title: item.title,
|
|
platformContentId: item.platformContentId,
|
|
url: item.url,
|
|
contentType: item.contentType,
|
|
duration: item.duration,
|
|
thumbnailUrl: item.thumbnailUrl,
|
|
publishedAt: item.publishedAt ?? null,
|
|
status: 'monitored',
|
|
monitored,
|
|
});
|
|
if (created) {
|
|
insertedCount++;
|
|
// Broadcast the new item to WebSocket clients
|
|
this.eventBus?.emitScan('scan:item-discovered', {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
item: created,
|
|
});
|
|
// Only auto-enqueue monitored items
|
|
if (this.onNewContent && created.monitored) {
|
|
this.onNewContent(created.id);
|
|
}
|
|
}
|
|
}
|
|
|
|
// 8. Update channel status
|
|
await updateChannel(this.db, channel.id, {
|
|
lastCheckedAt: new Date().toISOString(),
|
|
lastCheckStatus: 'success',
|
|
});
|
|
|
|
this.rateLimiter.reportSuccess(channel.platform);
|
|
|
|
console.log(
|
|
`[scheduler] Check complete for channel ${channel.id}: ${insertedCount} new items (${items.length} fetched, ${existingIds.size} existing)`
|
|
);
|
|
|
|
this.eventBus?.emitScan('scan:complete', {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: insertedCount,
|
|
totalFetched: items.length,
|
|
});
|
|
|
|
// 9. Background Phase 2: enrich newly inserted items with full metadata
|
|
// This runs after the scan result is returned — enrichment updates DB records
|
|
// and triggers a final cache invalidation when done.
|
|
if (insertedCount > 0 && !effectiveSignal.aborted) {
|
|
this.enrichNewItems(channel, filteredItems, existingIds, rateLimitDelay, source, effectiveSignal)
|
|
.catch((err) => {
|
|
console.error(
|
|
`[scheduler] Background enrichment failed for channel ${channel.id}:`,
|
|
err instanceof Error ? err.message : err
|
|
);
|
|
});
|
|
}
|
|
|
|
// 10. Best-effort playlist sync (K011 sidecar pattern)
|
|
// Runs after content scan succeeds — failure never affects the scan result.
|
|
if (source.fetchPlaylists && !effectiveSignal.aborted) {
|
|
this.syncPlaylists(channel, source)
|
|
.catch((err) => {
|
|
console.error(
|
|
`[scheduler] Playlist sync failed for channel ${channel.id}:`,
|
|
err instanceof Error ? err.message : err
|
|
);
|
|
});
|
|
}
|
|
|
|
return {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: insertedCount,
|
|
totalFetched: items.length,
|
|
status: 'success',
|
|
};
|
|
} catch (err) {
|
|
// Determine status based on error type
|
|
const isRateLimit =
|
|
err instanceof YtDlpError && err.isRateLimit;
|
|
const status = isRateLimit ? 'rate_limited' : 'error';
|
|
|
|
// Update channel status
|
|
try {
|
|
await updateChannel(this.db, channel.id, {
|
|
lastCheckedAt: new Date().toISOString(),
|
|
lastCheckStatus: status,
|
|
});
|
|
} catch (updateErr) {
|
|
console.error(
|
|
`[scheduler] Failed to update status for channel ${channel.id}:`,
|
|
updateErr
|
|
);
|
|
}
|
|
|
|
this.rateLimiter.reportError(channel.platform);
|
|
|
|
this.eventBus?.emitScan('scan:error', {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
error: err instanceof Error ? err.message : String(err),
|
|
});
|
|
|
|
console.error(
|
|
`[scheduler] Check failed for channel ${channel.id} ("${channel.name}"): ${status}`,
|
|
err instanceof Error ? err.message : err
|
|
);
|
|
|
|
return {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: 0,
|
|
totalFetched: 0,
|
|
status,
|
|
};
|
|
} finally {
|
|
this.activeChecks.delete(channel.id);
|
|
this.activeAbortControllers.delete(channel.id);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Check whether a channel scan is currently in progress.
|
|
*/
|
|
isScanning(channelId: number): boolean {
|
|
return this.activeChecks.has(channelId);
|
|
}
|
|
|
|
/**
|
|
* Cancel an in-progress scan for a channel.
|
|
* Returns true if a scan was running and was cancelled.
|
|
*/
|
|
cancelScan(channelId: number): boolean {
|
|
const controller = this.activeAbortControllers.get(channelId);
|
|
if (!controller) return false;
|
|
controller.abort('User cancelled');
|
|
console.log(`[scheduler] Scan cancel requested for channel ${channelId}`);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Get the current state of the scheduler for diagnostic inspection.
|
|
*/
|
|
getState(): SchedulerState {
|
|
const channels: ChannelJobState[] = [];
|
|
|
|
for (const [channelId, job] of this.jobs) {
|
|
const channel = this.channelCache.get(channelId);
|
|
channels.push({
|
|
channelId,
|
|
channelName: channel?.name ?? 'unknown',
|
|
platform: (channel?.platform ?? 'unknown') as Platform,
|
|
isRunning: job.isBusy(),
|
|
nextRun: job.nextRun() ?? null,
|
|
lastCheckedAt: channel?.lastCheckedAt ?? null,
|
|
lastCheckStatus: channel?.lastCheckStatus ?? null,
|
|
});
|
|
}
|
|
|
|
return {
|
|
running: this.running,
|
|
channelCount: this.jobs.size,
|
|
channels,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Background Phase 2: directly enrich newly discovered items by fetching
|
|
* full metadata per-video. Updates DB records with publishedAt, duration,
|
|
* and correct contentType. Emits scan:enrichment-complete when done.
|
|
*/
|
|
private async enrichNewItems(
|
|
channel: Channel,
|
|
discoveredItems: PlatformContentMetadata[],
|
|
existingIds: Set<string>,
|
|
rateLimitDelay: number,
|
|
_source: PlatformSource,
|
|
signal: AbortSignal,
|
|
): Promise<void> {
|
|
const newItems = discoveredItems
|
|
.filter((item) => !existingIds.has(item.platformContentId));
|
|
|
|
if (newItems.length === 0) return;
|
|
|
|
console.log(
|
|
`[scheduler] Phase 2: enriching ${newItems.length} items for channel ${channel.id}`
|
|
);
|
|
|
|
let enrichedCount = 0;
|
|
|
|
for (let i = 0; i < newItems.length; i++) {
|
|
if (signal.aborted) {
|
|
console.log(
|
|
`[scheduler] Phase 2 aborted for channel ${channel.id} after ${enrichedCount}/${newItems.length} items`
|
|
);
|
|
break;
|
|
}
|
|
|
|
const item = newItems[i];
|
|
|
|
// Rate limit delay between enrichment calls (skip before first)
|
|
if (i > 0 && rateLimitDelay > 0) {
|
|
await sleep(rateLimitDelay);
|
|
}
|
|
|
|
try {
|
|
// Use the item's original URL (platform-agnostic) instead of
|
|
// hardcoding YouTube — SoundCloud / generic items have their own URLs.
|
|
const contentUrl = item.url || `https://www.youtube.com/watch?v=${item.platformContentId}`;
|
|
const enrichResult = await execYtDlp(
|
|
['--dump-json', '--no-playlist', contentUrl],
|
|
{ timeout: 15_000 }
|
|
);
|
|
|
|
const enrichData = parseSingleJson(enrichResult.stdout) as Record<string, unknown>;
|
|
|
|
// Parse enriched fields
|
|
const publishedAt = parseUploadDate(enrichData.upload_date as string | undefined);
|
|
const duration = typeof enrichData.duration === 'number' ? enrichData.duration : null;
|
|
|
|
// Look up the DB record
|
|
const dbItem = await getContentByPlatformContentId(this.db, channel.id, item.platformContentId);
|
|
if (!dbItem) continue;
|
|
|
|
// Build update payload — only update fields that enrichment provides
|
|
const updates: Record<string, unknown> = {};
|
|
if (publishedAt && !dbItem.publishedAt) {
|
|
updates.publishedAt = publishedAt;
|
|
}
|
|
if (duration != null && dbItem.duration == null) {
|
|
updates.duration = duration;
|
|
}
|
|
|
|
if (Object.keys(updates).length > 0) {
|
|
await updateContentItem(this.db, dbItem.id, updates);
|
|
enrichedCount++;
|
|
}
|
|
} catch (err) {
|
|
// Tolerate individual enrichment failures
|
|
console.warn(
|
|
`[scheduler] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : err}`
|
|
);
|
|
}
|
|
}
|
|
|
|
console.log(
|
|
`[scheduler] Phase 2 complete for channel ${channel.id}: ${enrichedCount}/${newItems.length} items enriched`
|
|
);
|
|
|
|
// Notify clients that enrichment is done — they should refetch content
|
|
this.eventBus?.emitScan('scan:complete', {
|
|
channelId: channel.id,
|
|
channelName: channel.name,
|
|
newItems: enrichedCount,
|
|
totalFetched: newItems.length,
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Best-effort playlist sync for a channel.
|
|
* Fetches playlists from the platform source and upserts them into the DB.
|
|
* Failure is logged but never propagated (K011 sidecar pattern).
|
|
*/
|
|
private async syncPlaylists(channel: Channel, source: PlatformSource): Promise<void> {
|
|
if (!source.fetchPlaylists) return;
|
|
|
|
try {
|
|
const discoveryResults = await source.fetchPlaylists(channel);
|
|
if (discoveryResults.length === 0) {
|
|
console.log(`[scheduler] Playlist sync: no playlists found for channel ${channel.id}`);
|
|
return;
|
|
}
|
|
|
|
const upserted = await upsertPlaylists(this.db, channel.id, discoveryResults);
|
|
console.log(
|
|
`[scheduler] Playlist sync complete for channel ${channel.id}: ${upserted.length} playlists synced`
|
|
);
|
|
} catch (err) {
|
|
console.error(
|
|
`[scheduler] Playlist sync error for channel ${channel.id}:`,
|
|
err instanceof Error ? err.message : err
|
|
);
|
|
}
|
|
}
|
|
|
|
// ── Internal ──
|
|
|
|
/**
|
|
* Create a Cron job for a channel.
|
|
* Uses the interval option for arbitrary check intervals.
|
|
*/
|
|
private createJob(channel: Channel): void {
|
|
const intervalSeconds = channel.checkInterval * 60;
|
|
const cronPattern = minutesToCronPattern(channel.checkInterval);
|
|
|
|
const job = new Cron(
|
|
cronPattern,
|
|
{
|
|
protect: true, // Prevent overlapping runs
|
|
interval: intervalSeconds, // Minimum seconds between runs
|
|
},
|
|
async () => {
|
|
// Refresh channel from cache (it may have been updated)
|
|
const current = this.channelCache.get(channel.id) ?? channel;
|
|
await this.checkChannel(current);
|
|
}
|
|
);
|
|
|
|
this.jobs.set(channel.id, job);
|
|
this.channelCache.set(channel.id, channel);
|
|
}
|
|
}
|
|
|
|
// ── Helpers ──
|
|
|
|
/**
|
|
* Convert a check interval in minutes to a cron pattern.
|
|
* For intervals that divide evenly into 60, use `* /{n} * * * *`.
|
|
* For other intervals, use the closest reasonable pattern and rely on
|
|
* croner's `interval` option for exact timing.
|
|
*/
|
|
function minutesToCronPattern(minutes: number): string {
|
|
if (minutes <= 0) return '*/5 * * * *'; // Fallback: every 5 minutes
|
|
if (minutes < 60 && 60 % minutes === 0) {
|
|
return `*/${minutes} * * * *`;
|
|
}
|
|
if (minutes === 60) {
|
|
return '0 * * * *'; // Every hour
|
|
}
|
|
if (minutes < 60) {
|
|
// Arbitrary sub-hour interval — run every minute, use `interval` option
|
|
return '* * * * *';
|
|
}
|
|
// For intervals >= 60 minutes, run every hour and use `interval` option
|
|
return '0 * * * *';
|
|
}
|
|
|
|
function parseUploadDate(raw: string | undefined): string | null {
|
|
if (!raw || raw.length !== 8) return null;
|
|
const y = raw.slice(0, 4);
|
|
const m = raw.slice(4, 6);
|
|
const d = raw.slice(6, 8);
|
|
return `${y}-${m}-${d}T00:00:00Z`;
|
|
}
|
|
|
|
function sleep(ms: number): Promise<void> {
|
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
}
|