test: Implement matchesKeywordFilter engine with pipe-separated pattern…

- "src/services/keyword-filter.ts" - "src/__tests__/keyword-filter.test.ts" GSD-Task: S03/T02
2026-04-04 05:38:37 +00:00 · 2026-04-04 05:38:37 +00:00 · cc031a78a9
commit cc031a78a9
parent 8d133024a5
2 changed files with 380 additions and 0 deletions
--- a/src/__tests__/keyword-filter.test.ts
+++ b/src/__tests__/keyword-filter.test.ts
@ -0,0 +1,252 @@
+import { describe, it, expect } from 'vitest';
+import {
+  matchesKeywordFilter,
+  parsePatterns,
+  patternMatches,
+} from '../services/keyword-filter';
+
+// ── parsePatterns ──
+
+describe('parsePatterns', () => {
+  // Absent input: every "no value" form yields an empty pattern list.
+  it('returns empty array for null', () => {
+    expect(parsePatterns(null)).toEqual([]);
+  });
+
+  it('returns empty array for undefined', () => {
+    expect(parsePatterns(undefined)).toEqual([]);
+  });
+
+  it('returns empty array for empty string', () => {
+    expect(parsePatterns('')).toEqual([]);
+  });
+
+  // Plain splitting, trimming and blank-segment handling.
+  it('splits pipe-separated patterns', () => {
+    expect(parsePatterns('shorts|live')).toEqual(['shorts', 'live']);
+  });
+
+  it('trims whitespace from patterns', () => {
+    expect(parsePatterns(' shorts | live ')).toEqual(['shorts', 'live']);
+  });
+
+  it('drops empty segments', () => {
+    expect(parsePatterns('shorts||live|')).toEqual(['shorts', 'live']);
+  });
+
+  it('drops whitespace-only segments', () => {
+    expect(parsePatterns('  |  ')).toEqual([]);
+  });
+
+  it('handles single pattern', () => {
+    expect(parsePatterns('tutorial')).toEqual(['tutorial']);
+  });
+
+  // Regex-aware splitting: a pipe between /regex/ delimiters is alternation,
+  // not a pattern separator.
+  it('keeps pipes inside /regex/ delimiters as part of one pattern', () => {
+    expect(parsePatterns('/shorts|clips/')).toEqual(['/shorts|clips/']);
+  });
+
+  it('splits on pipes that follow a closed /regex/ pattern', () => {
+    expect(parsePatterns('/shorts|clips/|live')).toEqual(['/shorts|clips/', 'live']);
+  });
+
+  it('treats a slash in the middle of a pattern as plain text', () => {
+    // Only a slash that OPENS a segment starts a regex.
+    expect(parsePatterns('a/b|c')).toEqual(['a/b', 'c']);
+  });
+});
+
+// ── patternMatches ──
+
+describe('patternMatches', () => {
+  describe('plain text (case-insensitive substring)', () => {
+    it('matches substring', () => {
+      expect(patternMatches('shorts', 'My Shorts Video')).toBe(true);
+    });
+
+    it('matches case-insensitively', () => {
+      expect(patternMatches('SHORTS', 'my shorts video')).toBe(true);
+    });
+
+    it('does not match when absent', () => {
+      expect(patternMatches('podcast', 'My Shorts Video')).toBe(false);
+    });
+
+    it('matches exact title', () => {
+      expect(patternMatches('hello world', 'Hello World')).toBe(true);
+    });
+  });
+
+  describe('glob patterns (with *)', () => {
+    it('matches * as wildcard at start', () => {
+      expect(patternMatches('*shorts', 'My #shorts')).toBe(true);
+    });
+
+    it('matches * as wildcard at end', () => {
+      expect(patternMatches('Episode*', 'Episode 42: The Return')).toBe(true);
+    });
+
+    it('matches * in the middle', () => {
+      expect(patternMatches('EP*Review', 'EP42 Review')).toBe(true);
+    });
+
+    it('matches double wildcards', () => {
+      expect(patternMatches('*shorts*', 'My #shorts Video')).toBe(true);
+    });
+
+    it('rejects non-matching glob', () => {
+      expect(patternMatches('Episode*', 'My Shorts Video')).toBe(false);
+    });
+
+    it('glob is case-insensitive', () => {
+      expect(patternMatches('*SHORTS*', 'my shorts video')).toBe(true);
+    });
+
+    it('glob anchors to full title (no partial)', () => {
+      // Contrast: without a wildcard the pattern is plain text, so a substring is enough
+      expect(patternMatches('shorts', 'My Shorts Video')).toBe(true); // plain text, not glob
+      // With a wildcard the pattern is anchored at both ends of the title
+      expect(patternMatches('short*', 'My Shorts Video')).toBe(false); // must start with "short"
+      expect(patternMatches('*shorts', 'My Shorts Video')).toBe(false); // must end with "shorts"
+    });
+
+    it('treats regex metacharacters in a glob literally', () => {
+      // Only * is a wildcard; the "." must match a real dot, not any character
+      expect(patternMatches('2.0*', '2.0 release')).toBe(true);
+      expect(patternMatches('2.0*', '2x0 release')).toBe(false);
+    });
+  });
+
+  describe('regex patterns (/regex/)', () => {
+    it('matches regex pattern', () => {
+      expect(patternMatches('/^EP\\d+/', 'EP42 Review')).toBe(true);
+    });
+
+    it('regex is case-insensitive', () => {
+      expect(patternMatches('/episode/', 'Episode 5')).toBe(true);
+    });
+
+    it('rejects non-matching regex', () => {
+      expect(patternMatches('/^EP\\d+/', 'My Shorts Video')).toBe(false);
+    });
+
+    it('handles complex regex', () => {
+      expect(patternMatches('/shorts|#shorts/', 'Watch #shorts now')).toBe(true);
+    });
+
+    it('falls back to plain text on invalid regex', () => {
+      // Invalid regex with unbalanced bracket — should fall back to substring match
+      // The full pattern "/[invalid/" is matched as plain text (including slashes)
+      expect(patternMatches('/[invalid/', 'contains /[invalid/ text')).toBe(true);
+      expect(patternMatches('/[invalid/', 'no match here')).toBe(false);
+    });
+
+    it('treats bare "//" as plain text, not an empty regex', () => {
+      // "//" is length 2, not > 2, so it is matched as a literal substring
+      expect(patternMatches('//', '// comment')).toBe(true);
+      expect(patternMatches('//', 'no slashes here')).toBe(false);
+    });
+  });
+});
+
+// ── matchesKeywordFilter ──
+
+describe('keyword filter matching engine', () => {
+  type Filter = string | null | undefined;
+
+  /** One expectation: matchesKeywordFilter(title, include, exclude) must equal `passes`. */
+  interface FilterCase {
+    name: string;
+    title: string;
+    include: Filter;
+    exclude: Filter;
+    passes: boolean;
+  }
+
+  // Each entry becomes a nested describe(); each case becomes one it().
+  const groups: Array<[string, FilterCase[]]> = [
+    ['no filters', [
+      { name: 'passes when both null', title: 'Any Title', include: null, exclude: null, passes: true },
+      { name: 'passes when both undefined', title: 'Any Title', include: undefined, exclude: undefined, passes: true },
+      { name: 'passes when both empty string', title: 'Any Title', include: '', exclude: '', passes: true },
+    ]],
+    ['exclude only', [
+      { name: 'excludes matching title', title: 'My #shorts Video', include: null, exclude: '#shorts', passes: false },
+      { name: 'passes non-matching title', title: 'Full Episode 1', include: null, exclude: '#shorts', passes: true },
+      { name: 'excludes on any matching pattern', title: 'Live Stream Now', include: null, exclude: 'shorts|live', passes: false },
+      { name: 'passes when no exclude patterns match', title: 'Full Episode 1', include: null, exclude: 'shorts|live', passes: true },
+    ]],
+    ['include only', [
+      { name: 'passes when title matches include', title: 'Episode 42', include: 'episode', exclude: null, passes: true },
+      { name: 'rejects when title matches none of includes', title: 'Random Video', include: 'episode|tutorial', exclude: null, passes: false },
+      { name: 'passes when title matches at least one include', title: 'Tutorial: React', include: 'episode|tutorial', exclude: null, passes: true },
+    ]],
+    ['include + exclude combined', [
+      // Title matches include "episode" but also matches exclude "shorts"
+      { name: 'exclude takes priority over include', title: 'Episode 1 #shorts', include: 'episode', exclude: '#shorts', passes: false },
+      { name: 'passes when matches include and not exclude', title: 'Episode 42: Deep Dive', include: 'episode', exclude: '#shorts|live', passes: true },
+      { name: 'rejects when matches neither include nor exclude', title: 'Random Video', include: 'episode|tutorial', exclude: '#shorts', passes: false },
+    ]],
+    ['mixed pattern types', [
+      { name: 'works with regex exclude and plain include', title: 'EP42 Shorts Compilation', include: 'EP*', exclude: '/shorts/', passes: false },
+      { name: 'works with glob include', title: 'Episode 42: The Return', include: 'Episode*', exclude: null, passes: true },
+      { name: 'works with regex include', title: 'EP42 Review', include: '/^EP\\d+/', exclude: null, passes: true },
+    ]],
+    ['edge cases', [
+      { name: 'handles empty title', title: '', include: 'episode', exclude: null, passes: false },
+      { name: 'handles empty title with no filters', title: '', include: null, exclude: null, passes: true },
+      // The "." in plain text should match literally as substring
+      { name: 'handles special regex chars in plain text pattern', title: 'version 2.0 release', include: '2.0', exclude: null, passes: true },
+      { name: 'handles pipe char in regex pattern', title: 'Watch shorts now', include: '/shorts|clips/', exclude: null, passes: true },
+      { name: 'whitespace-only patterns are dropped', title: 'Any Title', include: '  |  ', exclude: null, passes: true },
+    ]],
+  ];
+
+  for (const [groupName, cases] of groups) {
+    describe(groupName, () => {
+      for (const { name, title, include, exclude, passes } of cases) {
+        it(name, () => {
+          expect(matchesKeywordFilter(title, include, exclude)).toBe(passes);
+        });
+      }
+    });
+  }
+});
--- a/src/services/keyword-filter.ts
+++ b/src/services/keyword-filter.ts
@ -0,0 +1,128 @@
+// ── Keyword Filter Matching Engine ──
+//
+// Evaluates video/content titles against per-channel include/exclude keyword
+// patterns. Patterns are pipe-separated strings stored in the DB; each
+// individual pattern can be:
+//   • plain text  → case-insensitive substring match
+//   • glob with * → case-insensitive wildcard match against the WHOLE title
+//                   (e.g. "*shorts*" matches "My Shorts Video"; "short*" does not)
+//   • /regex/     → evaluated as a JS RegExp (case-insensitive)
+
+/**
+ * Parse a pipe-separated pattern string into individual trimmed patterns.
+ * Blank (empty or whitespace-only) entries are silently dropped.
+ *
+ * Regex-aware: a `/` that opens a segment starts a `/regex/` pattern, and every
+ * pipe up to its closing `/` stays part of that pattern (e.g. `/shorts|clips/`
+ * is one pattern, not two). Inside a regex a backslash escapes the next
+ * character, so `\/` does not close it. If the closing `/` never arrives, the
+ * text was not a regex after all and is split on pipes like plain text.
+ *
+ * @param raw  Pipe-separated patterns; null, undefined and '' all mean "none"
+ * @returns The non-blank patterns, trimmed, in their original order
+ */
+export function parsePatterns(raw: string | null | undefined): string[] {
+  if (!raw) return [];
+
+  const patterns: string[] = [];
+  // Trim a finished segment and keep it only when something is left.
+  const flush = (segment: string): void => {
+    const trimmed = segment.trim();
+    if (trimmed.length > 0) patterns.push(trimmed);
+  };
+
+  let current = '';
+  let inRegex = false;
+
+  for (let i = 0; i < raw.length; i++) {
+    const ch = raw[i];
+
+    if (inRegex) {
+      current += ch;
+      if (ch === '\\' && i + 1 < raw.length) {
+        // Escaped character (e.g. `\/`): copy it verbatim so it cannot close the regex
+        current += raw[++i];
+      } else if (ch === '/') {
+        // Closing regex delimiter
+        inRegex = false;
+      }
+    } else if (ch === '|') {
+      // Pipe separator outside regex — flush current pattern
+      flush(current);
+      current = '';
+    } else {
+      // A slash only opens a regex when nothing but whitespace precedes it in the segment
+      if (ch === '/' && current.trim() === '') inRegex = true;
+      current += ch;
+    }
+  }
+
+  if (inRegex) {
+    // Unterminated regex: the pipes collected so far were separators, not alternation
+    current.split('|').forEach(flush);
+  } else {
+    flush(current);
+  }
+
+  return patterns;
+}
+
+/**
+ * Test whether a single pattern matches a title.
+ *
+ * Pattern types, checked in this order:
+ *  - /regex/  → JS regular expression, case-insensitive and unanchored. Needs
+ *               at least one character between the slashes; an expression that
+ *               fails to compile falls back to a plain-text match on the whole
+ *               pattern, slashes included.
+ *  - *glob*   → case-insensitive wildcard match that must cover the WHOLE
+ *               title; `*` stands for any run of characters (line breaks
+ *               included) and every other character is literal.
+ *  - plain    → case-insensitive substring contains
+ *
+ * @param pattern  One pattern, used as-is (it is not trimmed here)
+ * @param title    The title to test
+ * @returns true when the pattern matches the title
+ */
+export function patternMatches(pattern: string, title: string): boolean {
+  // Plain text: case-insensitive substring match
+  const containsPattern = (): boolean =>
+    title.toLowerCase().includes(pattern.toLowerCase());
+
+  // Regex pattern: /something/
+  if (pattern.length > 2 && pattern.startsWith('/') && pattern.endsWith('/')) {
+    try {
+      return new RegExp(pattern.slice(1, -1), 'i').test(title);
+    } catch {
+      // Invalid regex — treat as a plain-text match (never as a glob)
+      return containsPattern();
+    }
+  }
+
+  // Glob pattern: contains at least one *
+  if (pattern.includes('*')) {
+    const regexSource = pattern
+      .split(/\*+/) // a run of stars means the same as one; avoids stacked wildcards
+      .map(escapeRegex)
+      .join('[\\s\\S]*'); // unlike ".", this also spans line breaks
+    return new RegExp(`^${regexSource}$`, 'i').test(title);
+  }
+
+  return containsPattern();
+}
+
+/**
+ * Decide whether a title survives the include/exclude keyword filters.
+ *
+ * Both filter strings are split with `parsePatterns` and each resulting
+ * pattern is tested with `patternMatches`:
+ *  - Exclude wins: a title matching ANY exclude pattern is rejected, even when
+ *    it also matches an include pattern.
+ *  - When include patterns exist, the title must match AT LEAST ONE of them.
+ *  - A filter that yields no patterns imposes no restriction, so with neither
+ *    filter present every title passes.
+ *
+ * @param title            The content title to evaluate
+ * @param includePatterns  Pipe-separated include patterns (null = no filter)
+ * @param excludePatterns  Pipe-separated exclude patterns (null = no filter)
+ * @returns true if the title should be enqueued, false if filtered out
+ */
+export function matchesKeywordFilter(
+  title: string,
+  includePatterns: string | null | undefined,
+  excludePatterns: string | null | undefined,
+): boolean {
+  const hitsTitle = (candidate: string): boolean => patternMatches(candidate, title);
+
+  const blocked = parsePatterns(excludePatterns);
+  const allowed = parsePatterns(includePatterns);
+
+  // Exclusion is decided first and is final.
+  if (blocked.some(hitsTitle)) return false;
+
+  // Without include patterns nothing further is required of the title.
+  if (allowed.length === 0) return true;
+
+  return allowed.some(hitsTitle);
+}
+
+/**
+ * Backslash-escape every regex metacharacter in `s`, so that the result
+ * matches the original text literally when used as RegExp source.
+ */
+function escapeRegex(s: string): string {
+  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
+  // `$&` re-inserts the matched character right after the added backslash.
+  return s.replace(metaCharacters, '\\$&');
+}