test: Implement matchesKeywordFilter engine with pipe-separated pattern…
- "src/services/keyword-filter.ts" - "src/__tests__/keyword-filter.test.ts" GSD-Task: S03/T02
This commit is contained in:
parent
8d133024a5
commit
cc031a78a9
2 changed files with 380 additions and 0 deletions
252
src/__tests__/keyword-filter.test.ts
Normal file
252
src/__tests__/keyword-filter.test.ts
Normal file
|
|
@ -0,0 +1,252 @@
|
|||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
matchesKeywordFilter,
|
||||
parsePatterns,
|
||||
patternMatches,
|
||||
} from '../services/keyword-filter';
|
||||
|
||||
// ── parsePatterns ──
|
||||
|
||||
describe('parsePatterns', () => {
  // Each row: test title, raw input handed to parsePatterns, expected result.
  const cases: ReadonlyArray<readonly [string, string | null | undefined, string[]]> = [
    ['returns empty array for null', null, []],
    ['returns empty array for undefined', undefined, []],
    ['returns empty array for empty string', '', []],
    ['splits pipe-separated patterns', 'shorts|live', ['shorts', 'live']],
    ['trims whitespace from patterns', ' shorts | live ', ['shorts', 'live']],
    ['drops empty segments', 'shorts||live|', ['shorts', 'live']],
    ['handles single pattern', 'tutorial', ['tutorial']],
  ];

  // Register one test per row so titles and ordering stay stable in reports.
  for (const [name, raw, expected] of cases) {
    it(name, () => {
      expect(parsePatterns(raw)).toEqual(expected);
    });
  }
});
|
||||
|
||||
// ── patternMatches ──
|
||||
|
||||
describe('patternMatches', () => {
  // Row layout: test title, pattern, title under test, expected match result.
  type MatchCase = readonly [string, string, string, boolean];

  // Registers one single-assertion test per row, in the order given.
  const registerCases = (cases: readonly MatchCase[]): void => {
    for (const [name, pattern, title, expected] of cases) {
      it(name, () => {
        expect(patternMatches(pattern, title)).toBe(expected);
      });
    }
  };

  describe('plain text (case-insensitive substring)', () => {
    registerCases([
      ['matches substring', 'shorts', 'My Shorts Video', true],
      ['matches case-insensitively', 'SHORTS', 'my shorts video', true],
      ['does not match when absent', 'podcast', 'My Shorts Video', false],
      ['matches exact title', 'hello world', 'Hello World', true],
    ]);
  });

  describe('glob patterns (with *)', () => {
    registerCases([
      ['matches * as wildcard at start', '*shorts', 'My #shorts', true],
      ['matches * as wildcard at end', 'Episode*', 'Episode 42: The Return', true],
      ['matches * in the middle', 'EP*Review', 'EP42 Review', true],
      ['matches double wildcards', '*shorts*', 'My #shorts Video', true],
      ['rejects non-matching glob', 'Episode*', 'My Shorts Video', false],
      ['glob is case-insensitive', '*SHORTS*', 'my shorts video', true],
    ]);

    it('glob anchors to full title (no partial)', () => {
      // No "*" at all → handled as plain text, so a substring hit is enough.
      const plainResult = patternMatches('shorts', 'My Shorts Video');
      expect(plainResult).toBe(true);

      // With a "*" the pattern is a glob and is anchored: the title would
      // have to start with "short" for this to match.
      const globResult = patternMatches('short*', 'My Shorts Video');
      expect(globResult).toBe(false);
    });
  });

  describe('regex patterns (/regex/)', () => {
    registerCases([
      ['matches regex pattern', '/^EP\\d+/', 'EP42 Review', true],
      ['regex is case-insensitive', '/episode/', 'Episode 5', true],
      ['rejects non-matching regex', '/^EP\\d+/', 'My Shorts Video', false],
      ['handles complex regex', '/shorts|#shorts/', 'Watch #shorts now', true],
    ]);

    it('falls back to plain text on invalid regex', () => {
      // The unbalanced "[" makes this an invalid RegExp source, so the whole
      // pattern (slashes included) is used for a plain substring match.
      const brokenPattern = '/[invalid/';
      expect(patternMatches(brokenPattern, 'contains /[invalid/ text')).toBe(true);
      expect(patternMatches(brokenPattern, 'no match here')).toBe(false);
    });

    it('rejects single slashes as plain text, not regex', () => {
      // "//" has length 2 (not > 2), so it never enters the regex branch.
      const matched = patternMatches('//', '// comment');
      expect(matched).toBe(true);
    });
  });
});
|
||||
|
||||
// ── matchesKeywordFilter ──
|
||||
|
||||
describe('keyword filter matching engine', () => {
  type Patterns = string | null | undefined;
  // Row layout: test title, content title, include patterns, exclude patterns,
  // expected result of matchesKeywordFilter.
  type FilterCase = readonly [string, string, Patterns, Patterns, boolean];

  const groups: ReadonlyArray<readonly [string, readonly FilterCase[]]> = [
    [
      'no filters',
      [
        ['passes when both null', 'Any Title', null, null, true],
        ['passes when both undefined', 'Any Title', undefined, undefined, true],
        ['passes when both empty string', 'Any Title', '', '', true],
      ],
    ],
    [
      'exclude only',
      [
        ['excludes matching title', 'My #shorts Video', null, '#shorts', false],
        ['passes non-matching title', 'Full Episode 1', null, '#shorts', true],
        ['excludes on any matching pattern', 'Live Stream Now', null, 'shorts|live', false],
        ['passes when no exclude patterns match', 'Full Episode 1', null, 'shorts|live', true],
      ],
    ],
    [
      'include only',
      [
        ['passes when title matches include', 'Episode 42', 'episode', null, true],
        ['rejects when title matches none of includes', 'Random Video', 'episode|tutorial', null, false],
        ['passes when title matches at least one include', 'Tutorial: React', 'episode|tutorial', null, true],
      ],
    ],
    [
      'include + exclude combined',
      [
        // Title matches include "episode" but also matches exclude "#shorts".
        ['exclude takes priority over include', 'Episode 1 #shorts', 'episode', '#shorts', false],
        ['passes when matches include and not exclude', 'Episode 42: Deep Dive', 'episode', '#shorts|live', true],
        ['rejects when matches neither include nor exclude', 'Random Video', 'episode|tutorial', '#shorts', false],
      ],
    ],
    [
      'mixed pattern types',
      [
        ['works with regex exclude and plain include', 'EP42 Shorts Compilation', 'EP*', '/shorts/', false],
        ['works with glob include', 'Episode 42: The Return', 'Episode*', null, true],
        ['works with regex include', 'EP42 Review', '/^EP\\d+/', null, true],
      ],
    ],
    [
      'edge cases',
      [
        ['handles empty title', '', 'episode', null, false],
        ['handles empty title with no filters', '', null, null, true],
        // The "." in a plain-text pattern is matched literally as a substring.
        ['handles special regex chars in plain text pattern', 'version 2.0 release', '2.0', null, true],
        ['handles pipe char in regex pattern', 'Watch shorts now', '/shorts|clips/', null, true],
        ['whitespace-only patterns are dropped', 'Any Title', ' | ', null, true],
      ],
    ],
  ];

  // One describe per group and one test per row, registered in table order.
  for (const [groupName, cases] of groups) {
    describe(groupName, () => {
      for (const [name, title, include, exclude, expected] of cases) {
        it(name, () => {
          expect(matchesKeywordFilter(title, include, exclude)).toBe(expected);
        });
      }
    });
  }
});
|
||||
128
src/services/keyword-filter.ts
Normal file
128
src/services/keyword-filter.ts
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
// ── Keyword Filter Matching Engine ──
//
// Evaluates video/content titles against per-channel include/exclude keyword
// patterns. Patterns are pipe-separated strings stored in the DB; each
// individual pattern can be:
//   • plain text  → case-insensitive substring match
//   • glob with * → converted to a case-insensitive regex anchored to the
//                   whole title (e.g. "*shorts*" matches "My Shorts Video",
//                   but "short*" does not because the title must start with it)
//   • /regex/     → evaluated as a JS RegExp (case-insensitive)
|
||||
|
||||
/**
 * Parse a pipe-separated pattern string into individual trimmed patterns.
 * Blank (empty or whitespace-only) entries are silently dropped.
 *
 * Regex-aware: a segment that starts with `/` and has a closing unescaped `/`
 * is kept intact up to that closing slash, so pipes inside `/regex/`
 * delimiters are preserved (e.g. `/shorts|clips/` stays one pattern).
 * Backslash-escaped characters inside the regex — including `\/` — do not
 * terminate it (e.g. `/a\/b|c/` stays one pattern).
 *
 * A segment that starts with `/` but has no closing slash is NOT treated as a
 * regex; it is split on `|` like plain text, so a stray leading slash cannot
 * swallow every following separator.
 *
 * @param raw Pipe-separated pattern string; `null`, `undefined` and `''` yield `[]`.
 * @returns The trimmed, non-empty patterns in their original order.
 */
export function parsePatterns(raw: string | null | undefined): string[] {
  if (!raw) return [];

  const patterns: string[] = [];
  let current = '';

  // Push the accumulated segment (if non-blank) and start a fresh one.
  const flush = (): void => {
    const trimmed = current.trim();
    if (trimmed.length > 0) patterns.push(trimmed);
    current = '';
  };

  let i = 0;
  while (i < raw.length) {
    const ch = raw.charAt(i);

    // A slash that opens a segment (only whitespace seen so far) may start a
    // /regex/. Only commit to regex mode when a closing delimiter exists.
    if (ch === '/' && current.trim() === '') {
      const close = findRegexClose(raw, i + 1);
      if (close !== -1) {
        // Copy the whole delimited regex verbatim, inner pipes included.
        current += raw.slice(i, close + 1);
        i = close + 1;
        continue;
      }
    }

    if (ch === '|') {
      flush();
    } else {
      current += ch;
    }
    i += 1;
  }

  flush();
  return patterns;
}

/**
 * Find the index of the first unescaped `/` at or after `from`.
 * A backslash escapes the character that follows it.
 *
 * @returns The index of the closing slash, or -1 if there is none.
 */
function findRegexClose(raw: string, from: number): number {
  for (let i = from; i < raw.length; i++) {
    const ch = raw.charAt(i);
    if (ch === '\\') {
      // Skip the escaped character so `\/` is not taken as the delimiter.
      i++;
      continue;
    }
    if (ch === '/') return i;
  }
  return -1;
}
|
||||
|
||||
/**
 * Decide whether one pattern matches a title. All three pattern flavours are
 * case-insensitive.
 *
 * Flavours, checked in this order:
 *   1. `/regex/` — starts and ends with `/` and is longer than two characters;
 *      the text between the slashes is compiled as a JS RegExp. If it fails
 *      to compile, the whole pattern (slashes included) is matched as a
 *      plain substring instead.
 *   2. glob — contains at least one `*`; each `*` matches any run of
 *      characters and the pattern must cover the entire title.
 *   3. plain — substring containment.
 *
 * @param pattern A single, already-split pattern.
 * @param title   The title to test.
 * @returns true when the pattern matches the title.
 */
export function patternMatches(pattern: string, title: string): boolean {
  // Shared plain-text strategy: case-insensitive "contains".
  const containsPattern = (): boolean =>
    title.toLowerCase().includes(pattern.toLowerCase());

  const isDelimitedRegex =
    pattern.length > 2 && pattern.startsWith('/') && pattern.endsWith('/');

  if (isDelimitedRegex) {
    let compiled: RegExp;
    try {
      compiled = new RegExp(pattern.slice(1, -1), 'i');
    } catch {
      // Not a valid regex source — degrade to a substring match.
      return containsPattern();
    }
    return compiled.test(title);
  }

  if (pattern.includes('*')) {
    // Escape the literal pieces between wildcards, then stitch them back
    // together with ".*" and anchor both ends.
    const literalParts = pattern.split('*').map((part) => escapeRegex(part));
    const anchored = new RegExp(`^${literalParts.join('.*')}$`, 'i');
    return anchored.test(title);
  }

  return containsPattern();
}
|
||||
|
||||
/**
 * Apply a channel's include/exclude keyword filters to a title.
 *
 * Decision order:
 *   1. Any exclude pattern matches            → false (exclude wins).
 *   2. Include patterns exist                 → true only if at least one matches.
 *   3. No include patterns (after parsing)    → true.
 *
 * Both pattern strings are split with `parsePatterns`, so `null`,
 * `undefined`, empty and whitespace-only inputs all count as "no filter".
 *
 * @param title           The content title to evaluate
 * @param includePatterns Pipe-separated include patterns (null = no filter)
 * @param excludePatterns Pipe-separated exclude patterns (null = no filter)
 * @returns true if the title passes the filters, false if it is filtered out
 */
export function matchesKeywordFilter(
  title: string,
  includePatterns: string | null | undefined,
  excludePatterns: string | null | undefined,
): boolean {
  const hits = (candidate: string): boolean => patternMatches(candidate, title);

  // Exclusions are evaluated first; a single hit rejects the title.
  const isExcluded = parsePatterns(excludePatterns).some(hits);
  if (isExcluded) {
    return false;
  }

  // With no include list everything that survived exclusion passes;
  // otherwise at least one include pattern has to hit.
  const includeList = parsePatterns(includePatterns);
  return includeList.length === 0 || includeList.some(hits);
}
|
||||
|
||||
/**
 * Backslash-escape every regex metacharacter in `s` so the result can be
 * embedded in a RegExp source and match the original text literally.
 */
function escapeRegex(s: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metacharacters, (found) => `\\${found}`);
}
|
||||
Loading…
Add table
Reference in a new issue