// src/__tests__/keyword-filter.test.ts (added as a new file by this diff)
import { describe, it, expect } from 'vitest';
import {
  matchesKeywordFilter,
  parsePatterns,
  patternMatches,
} from '../services/keyword-filter';

// ── parsePatterns ──

describe('parsePatterns', () => {
  // Each row: [test name, raw input, expected parsed patterns].
  const cases: Array<[string, string | null | undefined, string[]]> = [
    ['returns empty array for null', null, []],
    ['returns empty array for undefined', undefined, []],
    ['returns empty array for empty string', '', []],
    ['splits pipe-separated patterns', 'shorts|live', ['shorts', 'live']],
    ['trims whitespace from patterns', ' shorts | live ', ['shorts', 'live']],
    ['drops empty segments', 'shorts||live|', ['shorts', 'live']],
    ['handles single pattern', 'tutorial', ['tutorial']],
  ];

  for (const [name, raw, expected] of cases) {
    it(name, () => {
      expect(parsePatterns(raw)).toEqual(expected);
    });
  }
});

// ── patternMatches ──

describe('patternMatches', () => {
  // Each row: [test name, pattern, title, expected result].
  type Case = [string, string, string, boolean];

  // Registers one `it` per row, in row order.
  const runCases = (cases: Case[]): void => {
    for (const [name, pattern, title, expected] of cases) {
      it(name, () => {
        expect(patternMatches(pattern, title)).toBe(expected);
      });
    }
  };

  describe('plain text (case-insensitive substring)', () => {
    runCases([
      ['matches substring', 'shorts', 'My Shorts Video', true],
      ['matches case-insensitively', 'SHORTS', 'my shorts video', true],
      ['does not match when absent', 'podcast', 'My Shorts Video', false],
      ['matches exact title', 'hello world', 'Hello World', true],
    ]);
  });

  describe('glob patterns (with *)', () => {
    runCases([
      ['matches * as wildcard at start', '*shorts', 'My #shorts', true],
      ['matches * as wildcard at end', 'Episode*', 'Episode 42: The Return', true],
      ['matches * in the middle', 'EP*Review', 'EP42 Review', true],
      ['matches double wildcards', '*shorts*', 'My #shorts Video', true],
      ['rejects non-matching glob', 'Episode*', 'My Shorts Video', false],
      ['glob is case-insensitive', '*SHORTS*', 'my shorts video', true],
    ]);

    it('glob anchors to full title (no partial)', () => {
      // Without wildcards around it, glob requires full match
      const title = 'My Shorts Video';
      expect(patternMatches('shorts', title)).toBe(true); // plain text, not glob
      expect(patternMatches('short*', title)).toBe(false); // anchored: must start with "short"
    });
  });

  describe('regex patterns (/regex/)', () => {
    runCases([
      ['matches regex pattern', '/^EP\\d+/', 'EP42 Review', true],
      ['regex is case-insensitive', '/episode/', 'Episode 5', true],
      ['rejects non-matching regex', '/^EP\\d+/', 'My Shorts Video', false],
      ['handles complex regex', '/shorts|#shorts/', 'Watch #shorts now', true],
    ]);

    it('falls back to plain text on invalid regex', () => {
      // Invalid regex with unbalanced bracket — should fall back to substring match.
      // The full pattern "/[invalid/" is matched as plain text (including slashes).
      const invalid = '/[invalid/';
      expect(patternMatches(invalid, 'contains /[invalid/ text')).toBe(true);
      expect(patternMatches(invalid, 'no match here')).toBe(false);
    });

    it('rejects single slashes as plain text, not regex', () => {
      // "//" is length 2, not > 2, so treated as plain text
      expect(patternMatches('//', '// comment')).toBe(true);
    });
  });
});

// ── matchesKeywordFilter ──

describe('keyword filter matching engine', () => {
  type Filter = string | null | undefined;
  // Each row: [test name, title, include patterns, exclude patterns, expected result].
  type Case = [string, string, Filter, Filter, boolean];

  // Registers one `it` per row, in row order.
  const runCases = (cases: Case[]): void => {
    for (const [name, title, include, exclude, expected] of cases) {
      it(name, () => {
        expect(matchesKeywordFilter(title, include, exclude)).toBe(expected);
      });
    }
  };

  describe('no filters', () => {
    runCases([
      ['passes when both null', 'Any Title', null, null, true],
      ['passes when both undefined', 'Any Title', undefined, undefined, true],
      ['passes when both empty string', 'Any Title', '', '', true],
    ]);
  });

  describe('exclude only', () => {
    runCases([
      ['excludes matching title', 'My #shorts Video', null, '#shorts', false],
      ['passes non-matching title', 'Full Episode 1', null, '#shorts', true],
      ['excludes on any matching pattern', 'Live Stream Now', null, 'shorts|live', false],
      ['passes when no exclude patterns match', 'Full Episode 1', null, 'shorts|live', true],
    ]);
  });

  describe('include only', () => {
    runCases([
      ['passes when title matches include', 'Episode 42', 'episode', null, true],
      ['rejects when title matches none of includes', 'Random Video', 'episode|tutorial', null, false],
      ['passes when title matches at least one include', 'Tutorial: React', 'episode|tutorial', null, true],
    ]);
  });

  describe('include + exclude combined', () => {
    runCases([
      // Title matches include "episode" but also matches exclude "shorts"
      ['exclude takes priority over include', 'Episode 1 #shorts', 'episode', '#shorts', false],
      ['passes when matches include and not exclude', 'Episode 42: Deep Dive', 'episode', '#shorts|live', true],
      ['rejects when matches neither include nor exclude', 'Random Video', 'episode|tutorial', '#shorts', false],
    ]);
  });

  describe('mixed pattern types', () => {
    runCases([
      ['works with regex exclude and plain include', 'EP42 Shorts Compilation', 'EP*', '/shorts/', false],
      ['works with glob include', 'Episode 42: The Return', 'Episode*', null, true],
      ['works with regex include', 'EP42 Review', '/^EP\\d+/', null, true],
    ]);
  });

  describe('edge cases', () => {
    runCases([
      ['handles empty title', '', 'episode', null, false],
      ['handles empty title with no filters', '', null, null, true],
      // The "." in plain text should match literally as substring
      ['handles special regex chars in plain text pattern', 'version 2.0 release', '2.0', null, true],
      ['handles pipe char in regex pattern', 'Watch shorts now', '/shorts|clips/', null, true],
      ['whitespace-only patterns are dropped', 'Any Title', ' | ', null, true],
    ]);
  });
});

// src/services/keyword-filter.ts (added as a new file by this diff)

// ── Keyword Filter Matching Engine ──
//
// Evaluates video/content titles against per-channel include/exclude keyword
// patterns. Patterns are pipe-separated strings stored in the DB; each
// individual pattern can be:
//   • plain text   → case-insensitive substring match
//   • glob with *  → converted to regex (e.g.
"*shorts*" matches "My Shorts Video") +// • /regex/ → evaluated as a JS RegExp (case-insensitive) + +/** + * Parse a pipe-separated pattern string into individual trimmed patterns. + * Blank entries are silently dropped. + * + * Regex-aware: pipes inside `/regex/` delimiters are preserved as part of the + * pattern (e.g. `/shorts|clips/` stays as one pattern, not split into two). + */ +export function parsePatterns(raw: string | null | undefined): string[] { + if (!raw) return []; + + const patterns: string[] = []; + let current = ''; + let inRegex = false; + + for (let i = 0; i < raw.length; i++) { + const ch = raw[i]; + + if (ch === '/' && !inRegex && current.trim() === '') { + // Entering regex mode — slash at the start of a new pattern segment + inRegex = true; + current += ch; + } else if (ch === '/' && inRegex) { + // Closing regex delimiter + current += ch; + inRegex = false; + } else if (ch === '|' && !inRegex) { + // Pipe separator outside regex — flush current pattern + const trimmed = current.trim(); + if (trimmed.length > 0) patterns.push(trimmed); + current = ''; + } else { + current += ch; + } + } + + // Flush remaining + const trimmed = current.trim(); + if (trimmed.length > 0) patterns.push(trimmed); + + return patterns; +} + +/** + * Test whether a single pattern matches a title. 
+ * + * Pattern types: + * - /regex/ → JS regular expression (case-insensitive) + * - *glob* → wildcard matching (case-insensitive) + * - plain → case-insensitive substring contains + */ +export function patternMatches(pattern: string, title: string): boolean { + // Regex pattern: /something/ + if (pattern.startsWith('/') && pattern.endsWith('/') && pattern.length > 2) { + try { + const regex = new RegExp(pattern.slice(1, -1), 'i'); + return regex.test(title); + } catch { + // Invalid regex — treat as a plain-text match + return title.toLowerCase().includes(pattern.toLowerCase()); + } + } + + // Glob pattern: contains at least one * + if (pattern.includes('*')) { + const regexSource = pattern + .split('*') + .map(escapeRegex) + .join('.*'); + const regex = new RegExp(`^${regexSource}$`, 'i'); + return regex.test(title); + } + + // Plain text: case-insensitive substring match + return title.toLowerCase().includes(pattern.toLowerCase()); +} + +/** + * Evaluate a title against include and exclude keyword patterns. + * + * Logic: + * 1. If excludePatterns is set and title matches ANY exclude pattern → false + * 2. If includePatterns is set, title must match AT LEAST ONE include → true/false + * 3. 
If neither is set → true (all titles pass) + * + * @param title The content title to evaluate + * @param includePatterns Pipe-separated include patterns (null = no filter) + * @param excludePatterns Pipe-separated exclude patterns (null = no filter) + * @returns true if the title should be enqueued, false if filtered out + */ +export function matchesKeywordFilter( + title: string, + includePatterns: string | null | undefined, + excludePatterns: string | null | undefined, +): boolean { + const excludes = parsePatterns(excludePatterns); + const includes = parsePatterns(includePatterns); + + // Exclude check first — any match rejects + if (excludes.length > 0) { + for (const pattern of excludes) { + if (patternMatches(pattern, title)) { + return false; + } + } + } + + // Include check — at least one must match (if set) + if (includes.length > 0) { + return includes.some((pattern) => patternMatches(pattern, title)); + } + + // No filters → pass + return true; +} + +/** Escape special regex characters in a string. */ +function escapeRegex(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +}