test: Implement matchesKeywordFilter engine with pipe-separated pattern…

- "src/services/keyword-filter.ts"
- "src/__tests__/keyword-filter.test.ts"

GSD-Task: S03/T02
This commit is contained in:
jlightner 2026-04-04 05:38:37 +00:00
parent 8d133024a5
commit cc031a78a9
2 changed files with 380 additions and 0 deletions

View file

@ -0,0 +1,252 @@
import { describe, it, expect } from 'vitest';
import {
matchesKeywordFilter,
parsePatterns,
patternMatches,
} from '../services/keyword-filter';
// ── parsePatterns ──
describe('parsePatterns', () => {
  // Table of scenarios: [test name, raw input, expected parsed patterns].
  // Every row performs the same check: parsePatterns(raw) deep-equals expected.
  const scenarios: Array<[string, string | null | undefined, string[]]> = [
    ['returns empty array for null', null, []],
    ['returns empty array for undefined', undefined, []],
    ['returns empty array for empty string', '', []],
    ['splits pipe-separated patterns', 'shorts|live', ['shorts', 'live']],
    ['trims whitespace from patterns', ' shorts | live ', ['shorts', 'live']],
    ['drops empty segments', 'shorts||live|', ['shorts', 'live']],
    ['handles single pattern', 'tutorial', ['tutorial']],
  ];

  for (const [name, raw, expected] of scenarios) {
    it(name, () => {
      expect(parsePatterns(raw)).toEqual(expected);
    });
  }
});
// ── patternMatches ──
describe('patternMatches', () => {
  // One row per single-assertion test: [test name, pattern, title, expected result].
  type MatchCase = [string, string, string, boolean];

  // Registers one `it` per row, asserting patternMatches(pattern, title) === expected.
  const runCases = (cases: MatchCase[]): void => {
    for (const [name, pattern, title, expected] of cases) {
      it(name, () => {
        expect(patternMatches(pattern, title)).toBe(expected);
      });
    }
  };

  describe('plain text (case-insensitive substring)', () => {
    runCases([
      ['matches substring', 'shorts', 'My Shorts Video', true],
      ['matches case-insensitively', 'SHORTS', 'my shorts video', true],
      ['does not match when absent', 'podcast', 'My Shorts Video', false],
      ['matches exact title', 'hello world', 'Hello World', true],
    ]);
  });

  describe('glob patterns (with *)', () => {
    runCases([
      ['matches * as wildcard at start', '*shorts', 'My #shorts', true],
      ['matches * as wildcard at end', 'Episode*', 'Episode 42: The Return', true],
      ['matches * in the middle', 'EP*Review', 'EP42 Review', true],
      ['matches double wildcards', '*shorts*', 'My #shorts Video', true],
      ['rejects non-matching glob', 'Episode*', 'My Shorts Video', false],
      ['glob is case-insensitive', '*SHORTS*', 'my shorts video', true],
    ]);

    it('glob anchors to full title (no partial)', () => {
      // No "*" in the pattern → plain-text substring match, so this one hits.
      expect(patternMatches('shorts', 'My Shorts Video')).toBe(true);
      // With "*" the pattern is anchored: the title would have to start with "short".
      expect(patternMatches('short*', 'My Shorts Video')).toBe(false);
    });
  });

  describe('regex patterns (/regex/)', () => {
    runCases([
      ['matches regex pattern', '/^EP\\d+/', 'EP42 Review', true],
      ['regex is case-insensitive', '/episode/', 'Episode 5', true],
      ['rejects non-matching regex', '/^EP\\d+/', 'My Shorts Video', false],
      ['handles complex regex', '/shorts|#shorts/', 'Watch #shorts now', true],
    ]);

    it('falls back to plain text on invalid regex', () => {
      // The unbalanced "[" makes the regex invalid, so the whole pattern
      // (slashes included) is compared as a plain-text substring instead.
      expect(patternMatches('/[invalid/', 'contains /[invalid/ text')).toBe(true);
      expect(patternMatches('/[invalid/', 'no match here')).toBe(false);
    });

    runCases([
      // "//" has length 2, which is not > 2, so it is handled as plain text.
      ['rejects single slashes as plain text, not regex', '//', '// comment', true],
    ]);
  });
});
// ── matchesKeywordFilter ──
describe('keyword filter matching engine', () => {
  // One row per single-assertion test:
  // [test name, title, includePatterns, excludePatterns, expected result].
  type FilterCase = [
    string,
    string,
    string | null | undefined,
    string | null | undefined,
    boolean,
  ];

  // Registers one `it` per row, asserting
  // matchesKeywordFilter(title, include, exclude) === expected.
  const runCases = (cases: FilterCase[]): void => {
    for (const [name, title, include, exclude, expected] of cases) {
      it(name, () => {
        expect(matchesKeywordFilter(title, include, exclude)).toBe(expected);
      });
    }
  };

  describe('no filters', () => {
    runCases([
      ['passes when both null', 'Any Title', null, null, true],
      ['passes when both undefined', 'Any Title', undefined, undefined, true],
      ['passes when both empty string', 'Any Title', '', '', true],
    ]);
  });

  describe('exclude only', () => {
    runCases([
      ['excludes matching title', 'My #shorts Video', null, '#shorts', false],
      ['passes non-matching title', 'Full Episode 1', null, '#shorts', true],
      ['excludes on any matching pattern', 'Live Stream Now', null, 'shorts|live', false],
      ['passes when no exclude patterns match', 'Full Episode 1', null, 'shorts|live', true],
    ]);
  });

  describe('include only', () => {
    runCases([
      ['passes when title matches include', 'Episode 42', 'episode', null, true],
      ['rejects when title matches none of includes', 'Random Video', 'episode|tutorial', null, false],
      ['passes when title matches at least one include', 'Tutorial: React', 'episode|tutorial', null, true],
    ]);
  });

  describe('include + exclude combined', () => {
    runCases([
      // Title matches include "episode" but also matches exclude "#shorts".
      ['exclude takes priority over include', 'Episode 1 #shorts', 'episode', '#shorts', false],
      ['passes when matches include and not exclude', 'Episode 42: Deep Dive', 'episode', '#shorts|live', true],
      ['rejects when matches neither include nor exclude', 'Random Video', 'episode|tutorial', '#shorts', false],
    ]);
  });

  describe('mixed pattern types', () => {
    runCases([
      // Include "EP*" is a glob (it contains "*"); exclude "/shorts/" is a regex.
      ['works with regex exclude and glob include', 'EP42 Shorts Compilation', 'EP*', '/shorts/', false],
      ['works with glob include', 'Episode 42: The Return', 'Episode*', null, true],
      ['works with regex include', 'EP42 Review', '/^EP\\d+/', null, true],
    ]);
  });

  describe('edge cases', () => {
    runCases([
      ['handles empty title', '', 'episode', null, false],
      ['handles empty title with no filters', '', null, null, true],
    ]);

    it('handles special regex chars in plain text pattern', () => {
      // The "." in plain text should match literally as substring…
      expect(matchesKeywordFilter('version 2.0 release', '2.0', null)).toBe(true);
      // …and must NOT behave like a regex wildcard. Without this negative
      // case the test would also pass if "." matched any character.
      expect(matchesKeywordFilter('version 2x0 release', '2.0', null)).toBe(false);
    });

    runCases([
      ['handles pipe char in regex pattern', 'Watch shorts now', '/shorts|clips/', null, true],
      // The include string parses to zero patterns, so no include filter applies.
      ['whitespace-only patterns are dropped', 'Any Title', ' | ', null, true],
    ]);
  });
});

View file

@ -0,0 +1,128 @@
// ── Keyword Filter Matching Engine ──
//
// Evaluates video/content titles against per-channel include/exclude keyword
// patterns. Patterns are pipe-separated strings stored in the DB; each
// individual pattern can be:
// • plain text → case-insensitive substring match
// • glob with * → converted to a case-insensitive regex anchored to the whole title (e.g. "*shorts*" matches "My Shorts Video", but "short*" does not)
// • /regex/ → evaluated as a JS RegExp (case-insensitive)
/**
 * Parse a pipe-separated pattern string into individual trimmed patterns.
 * Blank entries are silently dropped.
 *
 * Regex-aware: a `/` at the start of a segment opens a regex, and pipes up to
 * the closing `/` are preserved as part of the pattern (e.g. `/shorts|clips/`
 * stays as one pattern, not split into two).
 *
 * Inside a regex:
 *  - a backslash-escaped character (e.g. `\/`) is copied verbatim and never
 *    closes the regex, so `/a\/b|c/` remains a single pattern;
 *  - if the regex is never closed (e.g. `/foo|bar`), the leading slash is
 *    treated as ordinary text and the unterminated tail is split on pipes
 *    like any plain segment, rather than swallowing every later pattern.
 *
 * @param raw Pipe-separated pattern string; null, undefined, or '' yields [].
 * @returns The non-empty, trimmed patterns in their original order.
 */
export function parsePatterns(raw: string | null | undefined): string[] {
  if (!raw) return [];

  const patterns: string[] = [];
  // Trim a finished segment and keep it only if something is left.
  const flush = (segment: string): void => {
    const trimmed = segment.trim();
    if (trimmed.length > 0) patterns.push(trimmed);
  };

  let current = '';
  let inRegex = false;

  for (let i = 0; i < raw.length; i++) {
    const ch = raw[i];

    if (inRegex) {
      current += ch;
      if (ch === '\\' && i + 1 < raw.length) {
        // Escape sequence: take the next character as-is so that an escaped
        // slash is not mistaken for the closing delimiter.
        current += raw[i + 1];
        i++;
      } else if (ch === '/') {
        // Unescaped slash closes the regex.
        inRegex = false;
      }
    } else if (ch === '/' && current.trim() === '') {
      // Slash at the start of a new segment opens a regex.
      inRegex = true;
      current += ch;
    } else if (ch === '|') {
      // Pipe outside a regex separates patterns.
      flush(current);
      current = '';
    } else {
      current += ch;
    }
  }

  if (inRegex) {
    // The regex was opened but never closed: it cannot be a /regex/ pattern,
    // so fall back to plain pipe splitting for the remaining text.
    current.split('|').forEach(flush);
  } else {
    flush(current);
  }

  return patterns;
}
/**
 * Decide whether one pattern matches a title.
 *
 * The pattern's shape selects the strategy, checked in this order:
 *  1. `/regex/` — starts and ends with `/` and is longer than two characters:
 *     the text between the slashes is compiled as a case-insensitive RegExp.
 *     If compilation throws, the whole pattern (slashes included) is matched
 *     as plain text instead.
 *  2. glob — contains at least one `*`: each `*` matches any run of
 *     characters, every other character is literal, and the match is
 *     anchored to the entire title (case-insensitive).
 *  3. plain — case-insensitive substring test.
 *
 * @param pattern A single, already-split pattern.
 * @param title   The title to test.
 * @returns true when the pattern matches the title.
 */
export function patternMatches(pattern: string, title: string): boolean {
  // Shared fallback: case-insensitive "title contains pattern".
  const containsLiteral = (): boolean =>
    title.toLowerCase().includes(pattern.toLowerCase());

  const looksLikeRegex =
    pattern.length > 2 && pattern.startsWith('/') && pattern.endsWith('/');

  if (looksLikeRegex) {
    let compiled: RegExp;
    try {
      compiled = new RegExp(pattern.slice(1, -1), 'i');
    } catch {
      // Not a valid regex — degrade to a plain-text comparison.
      return containsLiteral();
    }
    return compiled.test(title);
  }

  if (!pattern.includes('*')) {
    return containsLiteral();
  }

  // Glob: escape the literal pieces, then stitch them together with ".*".
  const globSource = pattern
    .split('*')
    .map((piece) => escapeRegex(piece))
    .join('.*');
  return new RegExp(`^${globSource}$`, 'i').test(title);
}
/**
 * Decide whether a title passes the include/exclude keyword filters.
 *
 * Rules, applied in order:
 *  1. If the title matches ANY exclude pattern → false.
 *  2. Otherwise, if include patterns exist, the title must match AT LEAST ONE
 *     of them → true/false.
 *  3. If no include patterns exist → true (the title passes).
 *
 * A filter string that parses to zero patterns (null, undefined, empty, or
 * only blank segments) counts as "not set".
 *
 * @param title           The content title to evaluate
 * @param includePatterns Pipe-separated include patterns (null = no filter)
 * @param excludePatterns Pipe-separated exclude patterns (null = no filter)
 * @returns true if the title should be enqueued, false if filtered out
 */
export function matchesKeywordFilter(
  title: string,
  includePatterns: string | null | undefined,
  excludePatterns: string | null | undefined,
): boolean {
  const hitsTitle = (candidate: string): boolean => patternMatches(candidate, title);

  const blocked = parsePatterns(excludePatterns);
  const allowed = parsePatterns(includePatterns);

  // Exclusions win: a single exclude hit rejects the title outright.
  if (blocked.some((candidate) => hitsTitle(candidate))) {
    return false;
  }

  // With no include patterns everything that survived exclusion passes;
  // otherwise at least one include pattern has to hit.
  if (allowed.length === 0) {
    return true;
  }
  return allowed.some((candidate) => hitsTitle(candidate));
}
/**
 * Backslash-escape every character of `s` that carries special meaning in a
 * RegExp source, so the result matches `s` literally when compiled.
 */
function escapeRegex(s: string): string {
  const specialChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(specialChars, (ch) => `\\${ch}`);
}