feat: Wire keyword filter into scheduler scan flow — exclude/include patterns…

- "src/services/scheduler.ts"
- "src/__tests__/scheduler.test.ts"
- "src/db/repositories/channel-repository.ts"

GSD-Task: S03/T03
This commit is contained in:
jlightner 2026-04-04 05:41:55 +00:00
parent cc031a78a9
commit 05045828d8
3 changed files with 189 additions and 2 deletions

View file

@ -659,6 +659,180 @@ describe('SchedulerService', () => {
scheduler.stop(); scheduler.stop();
}); });
// ── Keyword filter tests ──
it('excludes items matching excludeKeywords pattern', async () => {
  // Channel carries only an exclude pattern; no include pattern is set here.
  const channel = await insertTestChannel({ excludeKeywords: 'shorts|#shorts' });
  const service = new SchedulerService(db, registry, rateLimiter);

  // Every fixture id shares this prefix so this test's rows can be picked out later.
  const idPrefix = `kf_exc_${channel.id}`;
  const buildItem = (
    suffix: number,
    title: string,
    duration: number
  ): PlatformContentMetadata => ({
    platformContentId: `${idPrefix}_${suffix}`,
    title,
    url: `https://www.youtube.com/watch?v=${suffix}`,
    contentType: 'video',
    duration,
    thumbnailUrl: null,
    publishedAt: null,
  });

  // One title without the keyword, two titles that contain "shorts" / "#shorts".
  const fetched: PlatformContentMetadata[] = [
    buildItem(1, 'Great Video About Coding', 600),
    buildItem(2, 'Quick shorts compilation', 30),
    buildItem(3, 'My Day #shorts vlog', 15),
  ];
  mockFetchRecentContent.mockResolvedValueOnce(fetched);

  const { newItems, totalFetched } = await service.checkChannel(channel);

  // Only the first item should pass the filter, while all three count as fetched.
  expect(newItems).toBe(1);
  expect(totalFetched).toBe(3);

  // Confirm against the stored rows, not just the returned counters.
  const rows = await getContentByChannelId(db, channel.id);
  const persisted = rows.filter((row) => row.platformContentId.startsWith(idPrefix));
  expect(persisted.length).toBe(1);
  expect(persisted[0].title).toBe('Great Video About Coding');

  service.stop();
});
it('includes only items matching includeKeywords pattern', async () => {
  // Channel carries only an include pattern; no exclude pattern is set here.
  const channel = await insertTestChannel({ includeKeywords: 'tutorial|guide' });
  const service = new SchedulerService(db, registry, rateLimiter);

  // Every fixture id shares this prefix so this test's rows can be picked out later.
  const idPrefix = `kf_inc_${channel.id}`;
  const buildItem = (
    suffix: number,
    title: string,
    duration: number
  ): PlatformContentMetadata => ({
    platformContentId: `${idPrefix}_${suffix}`,
    title,
    url: `https://www.youtube.com/watch?v=${suffix}`,
    contentType: 'video',
    duration,
    thumbnailUrl: null,
    publishedAt: null,
  });

  // Items 1 and 3 contain "Tutorial" / "Guide"; item 2 contains neither word.
  const fetched: PlatformContentMetadata[] = [
    buildItem(1, 'Python Tutorial for Beginners', 1800),
    buildItem(2, 'Random Vlog Day 5', 300),
    buildItem(3, 'Ultimate Guide to Docker', 2400),
  ];
  mockFetchRecentContent.mockResolvedValueOnce(fetched);

  const { newItems } = await service.checkChannel(channel);
  expect(newItems).toBe(2);

  // Confirm against the stored rows, not just the returned counter.
  const rows = await getContentByChannelId(db, channel.id);
  const persisted = rows.filter((row) => row.platformContentId.startsWith(idPrefix));
  expect(persisted.length).toBe(2);

  const storedTitles = persisted.map((row) => row.title);
  expect(storedTitles).toContain('Python Tutorial for Beginners');
  expect(storedTitles).toContain('Ultimate Guide to Docker');

  service.stop();
});
it('applies both include and exclude patterns together', async () => {
  // Channel carries both patterns so their interaction can be observed.
  const channel = await insertTestChannel({
    includeKeywords: 'tutorial',
    excludeKeywords: 'shorts',
  });
  const service = new SchedulerService(db, registry, rateLimiter);

  // Every fixture id shares this prefix so this test's rows can be picked out later.
  const idPrefix = `kf_both_${channel.id}`;
  const buildItem = (
    suffix: number,
    title: string,
    duration: number
  ): PlatformContentMetadata => ({
    platformContentId: `${idPrefix}_${suffix}`,
    title,
    url: `https://www.youtube.com/watch?v=${suffix}`,
    contentType: 'video',
    duration,
    thumbnailUrl: null,
    publishedAt: null,
  });

  const fetched: PlatformContentMetadata[] = [
    // Matches include, no exclude match → expected to pass.
    buildItem(1, 'Tutorial: Getting Started', 1800),
    // Matches include AND exclude → expected to be dropped (exclude wins).
    buildItem(2, 'Tutorial shorts recap', 30),
    // Doesn't match include → expected to be dropped.
    buildItem(3, 'Random Gaming Stream', 7200),
  ];
  mockFetchRecentContent.mockResolvedValueOnce(fetched);

  const { newItems } = await service.checkChannel(channel);
  expect(newItems).toBe(1);

  // Confirm against the stored rows, not just the returned counter.
  const rows = await getContentByChannelId(db, channel.id);
  const persisted = rows.filter((row) => row.platformContentId.startsWith(idPrefix));
  expect(persisted.length).toBe(1);
  expect(persisted[0].title).toBe('Tutorial: Getting Started');

  service.stop();
});
it('does not filter when no keywords are set', async () => {
  // Both patterns are explicitly null, so every fetched item is expected to be inserted.
  const channel = await insertTestChannel({ includeKeywords: null, excludeKeywords: null });
  const service = new SchedulerService(db, registry, rateLimiter);

  // Four canned items whose ids are prefixed with this channel's id.
  const canned = makeCannedContent(4, `kf_none_${channel.id}`);
  mockFetchRecentContent.mockResolvedValueOnce(canned);

  const { newItems } = await service.checkChannel(channel);
  expect(newItems).toBe(4);

  service.stop();
});
// ── monitoringMode-aware item creation tests ── // ── monitoringMode-aware item creation tests ──
it("creates items with monitored=false when channel monitoringMode is 'none'", async () => { it("creates items with monitored=false when channel monitoringMode is 'none'", async () => {

View file

@ -49,6 +49,8 @@ export async function createChannel(
bannerUrl: data.bannerUrl ?? null, bannerUrl: data.bannerUrl ?? null,
description: data.description ?? null, description: data.description ?? null,
subscriberCount: data.subscriberCount ?? null, subscriberCount: data.subscriberCount ?? null,
includeKeywords: data.includeKeywords ?? null,
excludeKeywords: data.excludeKeywords ?? null,
}) })
.returning(); .returning();

View file

@ -6,6 +6,7 @@ import type { PlatformRegistry, PlatformSource, FetchRecentContentOptions } from
import type { RateLimiter } from './rate-limiter'; import type { RateLimiter } from './rate-limiter';
import { YtDlpError } from '../sources/yt-dlp'; import { YtDlpError } from '../sources/yt-dlp';
import type { EventBus } from './event-bus'; import type { EventBus } from './event-bus';
import { matchesKeywordFilter } from './keyword-filter';
import { import {
getEnabledChannels, getEnabledChannels,
updateChannel, updateChannel,
@ -236,9 +237,19 @@ export class SchedulerService {
(item) => !existingIds.has(item.platformContentId) (item) => !existingIds.has(item.platformContentId)
); );
// 6b. Apply keyword filter — exclude/include patterns from channel settings
const filteredItems = newItems.filter((item) =>
matchesKeywordFilter(item.title, channel.includeKeywords, channel.excludeKeywords)
);
if (filteredItems.length < newItems.length) {
console.log(
`[scheduler] Keyword filter: ${newItems.length - filteredItems.length} of ${newItems.length} new items filtered out for channel ${channel.id}`
);
}
// 7. Insert new items (check abort between each) // 7. Insert new items (check abort between each)
let insertedCount = 0; let insertedCount = 0;
for (const item of newItems) { for (const item of filteredItems) {
// Check if scan was cancelled // Check if scan was cancelled
if (effectiveSignal.aborted) { if (effectiveSignal.aborted) {
console.log( console.log(
@ -310,7 +321,7 @@ export class SchedulerService {
// This runs after the scan result is returned — enrichment updates DB records // This runs after the scan result is returned — enrichment updates DB records
// and triggers a final cache invalidation when done. // and triggers a final cache invalidation when done.
if (insertedCount > 0 && !effectiveSignal.aborted) { if (insertedCount > 0 && !effectiveSignal.aborted) {
this.enrichNewItems(channel, newItems, existingIds, rateLimitDelay, source, effectiveSignal) this.enrichNewItems(channel, filteredItems, existingIds, rateLimitDelay, source, effectiveSignal)
.catch((err) => { .catch((err) => {
console.error( console.error(
`[scheduler] Background enrichment failed for channel ${channel.id}:`, `[scheduler] Background enrichment failed for channel ${channel.id}:`,