fix: direct per-video enrichment and migration statement breakpoints
All checks were successful
CI / test (push) Successful in 20s
All checks were successful
CI / test (push) Successful in 20s
- Rewrote scheduler enrichNewItems to fetch metadata per-video directly instead of redundantly re-running Phase 1 discovery. Background enrichment now calls yt-dlp --dump-json per video ID, updating publishedAt and duration on DB records. Emits scan:complete when enrichment finishes so clients know to refetch. - Added missing --> statement-breakpoint markers to migration 0011. Without them, Drizzle concatenated the three ALTER TABLE statements and SQLite only executed the first one, leaving embed_thumbnail and sponsor_block_remove columns missing from format_profiles.
This commit is contained in:
parent
8d77ae248a
commit
ad16cc6141
2 changed files with 81 additions and 49 deletions
|
|
@ -1,4 +1,4 @@
|
||||||
-- Add YouTube enhancement columns to format_profiles.
-- `--> statement-breakpoint` tells Drizzle where one statement ends; without
-- it the three ALTER TABLEs are concatenated and SQLite executes only the
-- first, leaving the later columns missing.
ALTER TABLE format_profiles ADD COLUMN embed_chapters INTEGER NOT NULL DEFAULT 0;--> statement-breakpoint
ALTER TABLE format_profiles ADD COLUMN embed_thumbnail INTEGER NOT NULL DEFAULT 0;--> statement-breakpoint
-- Comma-separated SponsorBlock categories:
-- 'sponsor,selfpromo,interaction,intro,outro,preview,music_offtopic,filler'
ALTER TABLE format_profiles ADD COLUMN sponsor_block_remove TEXT;
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import type * as schema from '../db/schema/index';
|
||||||
import type { Channel, Platform, PlatformContentMetadata } from '../types/index';
|
import type { Channel, Platform, PlatformContentMetadata } from '../types/index';
|
||||||
import type { PlatformRegistry, PlatformSource, FetchRecentContentOptions } from '../sources/platform-source';
|
import type { PlatformRegistry, PlatformSource, FetchRecentContentOptions } from '../sources/platform-source';
|
||||||
import type { RateLimiter } from './rate-limiter';
|
import type { RateLimiter } from './rate-limiter';
|
||||||
import { YtDlpError } from '../sources/yt-dlp';
|
import { YtDlpError, execYtDlp, parseSingleJson } from '../sources/yt-dlp';
|
||||||
import type { EventBus } from './event-bus';
|
import type { EventBus } from './event-bus';
|
||||||
import { matchesKeywordFilter } from './keyword-filter';
|
import { matchesKeywordFilter } from './keyword-filter';
|
||||||
import {
|
import {
|
||||||
|
|
@ -428,73 +428,93 @@ export class SchedulerService {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Background Phase 2: re-fetch full metadata for newly discovered items
|
* Background Phase 2: directly enrich newly discovered items by fetching
|
||||||
* and update their DB records with enriched data (publishedAt, duration, etc).
|
* full metadata per-video. Updates DB records with publishedAt, duration,
|
||||||
* Runs after the main scan completes — items are already visible to the user.
|
* and correct contentType. Emits scan:enrichment-complete when done.
|
||||||
*/
|
*/
|
||||||
private async enrichNewItems(
|
private async enrichNewItems(
|
||||||
channel: Channel,
|
channel: Channel,
|
||||||
discoveredItems: PlatformContentMetadata[],
|
discoveredItems: PlatformContentMetadata[],
|
||||||
existingIds: Set<string>,
|
existingIds: Set<string>,
|
||||||
rateLimitDelay: number,
|
rateLimitDelay: number,
|
||||||
source: PlatformSource,
|
_source: PlatformSource,
|
||||||
signal: AbortSignal,
|
signal: AbortSignal,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const newPlatformIds = discoveredItems
|
const newItems = discoveredItems
|
||||||
.filter((item) => !existingIds.has(item.platformContentId))
|
.filter((item) => !existingIds.has(item.platformContentId));
|
||||||
.map((item) => item.platformContentId);
|
|
||||||
|
|
||||||
if (newPlatformIds.length === 0) return;
|
if (newItems.length === 0) return;
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`[scheduler] Phase 2: enriching ${newPlatformIds.length} items for channel ${channel.id}`
|
`[scheduler] Phase 2: enriching ${newItems.length} items for channel ${channel.id}`
|
||||||
);
|
);
|
||||||
|
|
||||||
// Re-fetch with enrichment (discoveryOnly: false)
|
|
||||||
const enrichedItems = await source.fetchRecentContent(channel, {
|
|
||||||
limit: newPlatformIds.length + existingIds.size,
|
|
||||||
existingIds,
|
|
||||||
rateLimitDelay,
|
|
||||||
discoveryOnly: false,
|
|
||||||
signal,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Build lookup by platformContentId
|
|
||||||
const enrichedMap = new Map<string, PlatformContentMetadata>();
|
|
||||||
for (const item of enrichedItems) {
|
|
||||||
enrichedMap.set(item.platformContentId, item);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update DB records with enriched data
|
|
||||||
let enrichedCount = 0;
|
let enrichedCount = 0;
|
||||||
for (const platformContentId of newPlatformIds) {
|
|
||||||
if (signal.aborted) break;
|
|
||||||
|
|
||||||
const enriched = enrichedMap.get(platformContentId);
|
for (let i = 0; i < newItems.length; i++) {
|
||||||
if (!enriched) continue;
|
if (signal.aborted) {
|
||||||
|
console.log(
|
||||||
// Look up the DB record by platformContentId
|
`[scheduler] Phase 2 aborted for channel ${channel.id} after ${enrichedCount}/${newItems.length} items`
|
||||||
const dbItem = await getContentByPlatformContentId(this.db, channel.id, platformContentId);
|
);
|
||||||
if (!dbItem) continue;
|
break;
|
||||||
|
|
||||||
// Only update if enrichment provides new data
|
|
||||||
const updates: Record<string, unknown> = {};
|
|
||||||
if (enriched.publishedAt && !dbItem.publishedAt) {
|
|
||||||
updates.publishedAt = enriched.publishedAt;
|
|
||||||
}
|
|
||||||
if (enriched.duration != null && dbItem.duration == null) {
|
|
||||||
updates.duration = enriched.duration;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Object.keys(updates).length > 0) {
|
const item = newItems[i];
|
||||||
await updateContentItem(this.db, dbItem.id, updates);
|
|
||||||
enrichedCount++;
|
// Rate limit delay between enrichment calls (skip before first)
|
||||||
|
if (i > 0 && rateLimitDelay > 0) {
|
||||||
|
await sleep(rateLimitDelay);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const videoUrl = `https://www.youtube.com/watch?v=${item.platformContentId}`;
|
||||||
|
const enrichResult = await execYtDlp(
|
||||||
|
['--dump-json', '--no-playlist', videoUrl],
|
||||||
|
{ timeout: 15_000 }
|
||||||
|
);
|
||||||
|
|
||||||
|
const enrichData = parseSingleJson(enrichResult.stdout) as Record<string, unknown>;
|
||||||
|
|
||||||
|
// Parse enriched fields
|
||||||
|
const publishedAt = parseUploadDate(enrichData.upload_date as string | undefined);
|
||||||
|
const duration = typeof enrichData.duration === 'number' ? enrichData.duration : null;
|
||||||
|
|
||||||
|
// Look up the DB record
|
||||||
|
const dbItem = await getContentByPlatformContentId(this.db, channel.id, item.platformContentId);
|
||||||
|
if (!dbItem) continue;
|
||||||
|
|
||||||
|
// Build update payload — only update fields that enrichment provides
|
||||||
|
const updates: Record<string, unknown> = {};
|
||||||
|
if (publishedAt && !dbItem.publishedAt) {
|
||||||
|
updates.publishedAt = publishedAt;
|
||||||
|
}
|
||||||
|
if (duration != null && dbItem.duration == null) {
|
||||||
|
updates.duration = duration;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Object.keys(updates).length > 0) {
|
||||||
|
await updateContentItem(this.db, dbItem.id, updates);
|
||||||
|
enrichedCount++;
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
// Tolerate individual enrichment failures
|
||||||
|
console.warn(
|
||||||
|
`[scheduler] Enrichment failed for ${item.platformContentId}: ${err instanceof Error ? err.message : err}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`[scheduler] Phase 2 complete for channel ${channel.id}: ${enrichedCount} items enriched`
|
`[scheduler] Phase 2 complete for channel ${channel.id}: ${enrichedCount}/${newItems.length} items enriched`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Notify clients that enrichment is done — they should refetch content
|
||||||
|
this.eventBus?.emitScan('scan:complete', {
|
||||||
|
channelId: channel.id,
|
||||||
|
channelName: channel.name,
|
||||||
|
newItems: enrichedCount,
|
||||||
|
totalFetched: newItems.length,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Internal ──
|
// ── Internal ──
|
||||||
|
|
@ -548,3 +568,15 @@ function minutesToCronPattern(minutes: number): string {
|
||||||
// For intervals >= 60 minutes, run every hour and use `interval` option
|
// For intervals >= 60 minutes, run every hour and use `interval` option
|
||||||
return '0 * * * *';
|
return '0 * * * *';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function parseUploadDate(raw: string | undefined): string | null {
|
||||||
|
if (!raw || raw.length !== 8) return null;
|
||||||
|
const y = raw.slice(0, 4);
|
||||||
|
const m = raw.slice(4, 6);
|
||||||
|
const d = raw.slice(6, 8);
|
||||||
|
return `${y}-${m}-${d}T00:00:00Z`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function sleep(ms: number): Promise<void> {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue