From 2d7b812c6a885faa4858707483b2a31bbb106da0 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sat, 4 Apr 2026 05:33:04 +0000 Subject: [PATCH] =?UTF-8?q?test:=20Implemented=20pure-function=20scoring?= =?UTF-8?q?=20engine=20with=207=20weighted=20dimensio=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/pipeline/highlight_scorer.py" - "backend/pipeline/test_highlight_scorer.py" GSD-Task: S04/T02 --- .gsd/milestones/M021/slices/S04/S04-PLAN.md | 2 +- .../M021/slices/S04/tasks/T01-VERIFY.json | 9 + .../M021/slices/S04/tasks/T02-SUMMARY.md | 79 ++++++ backend/pipeline/highlight_scorer.py | 244 ++++++++++++++++++ backend/pipeline/test_highlight_scorer.py | 244 ++++++++++++++++++ 5 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 .gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json create mode 100644 .gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md create mode 100644 backend/pipeline/highlight_scorer.py create mode 100644 backend/pipeline/test_highlight_scorer.py diff --git a/.gsd/milestones/M021/slices/S04/S04-PLAN.md b/.gsd/milestones/M021/slices/S04/S04-PLAN.md index 0fb5424..c780cd6 100644 --- a/.gsd/milestones/M021/slices/S04/S04-PLAN.md +++ b/.gsd/milestones/M021/slices/S04/S04-PLAN.md @@ -30,7 +30,7 @@ - Estimate: 30m - Files: backend/models.py, alembic/versions/019_add_highlight_candidates.py, backend/pipeline/highlight_schemas.py - Verify: python -c "from backend.models import HighlightCandidate, HighlightStatus; print('OK')" && python -c "from backend.pipeline.highlight_schemas import HighlightCandidateResponse, HighlightScoreBreakdown, HighlightBatchResult; print('OK')" -- [ ] **T02: Implement highlight scoring engine with unit tests** — Build the pure-function scoring engine that takes KeyMoment data + context and returns a scored HighlightCandidate. This is the riskiest piece — if scores are garbage, the whole feature is useless. 
Unit tests with realistic fixture data prove the heuristic produces sensible orderings. +- [x] **T02: Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings** — Build the pure-function scoring engine that takes KeyMoment data + context and returns a scored HighlightCandidate. This is the riskiest piece — if scores are garbage, the whole feature is useless. Unit tests with realistic fixture data prove the heuristic produces sensible orderings. ## Steps diff --git a/.gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json b/.gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json new file mode 100644 index 0000000..f993334 --- /dev/null +++ b/.gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json @@ -0,0 +1,9 @@ +{ + "schemaVersion": 1, + "taskId": "T01", + "unitId": "M021/S04/T01", + "timestamp": 1775280636911, + "passed": true, + "discoverySource": "none", + "checks": [] +} diff --git a/.gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md b/.gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md new file mode 100644 index 0000000..f1eab7e --- /dev/null +++ b/.gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md @@ -0,0 +1,79 @@ +--- +id: T02 +parent: S04 +milestone: M021 +provides: [] +requires: [] +affects: [] +key_files: ["backend/pipeline/highlight_scorer.py", "backend/pipeline/test_highlight_scorer.py"] +key_decisions: ["Mapped 7 scoring dimensions to HighlightScoreBreakdown schema fields for downstream compatibility", "Duration fitness uses piecewise linear rather than Gaussian bell curve for predictability"] +patterns_established: [] +drill_down_paths: [] +observability_surfaces: [] +duration: "" +verification_result: "All 28 tests pass. Score ordering: ideal > mediocre > poor confirmed. Edge cases with None/empty/extreme values all produce scores in [0,1]. Slice-level imports of models and schemas verified." 
+completed_at: 2026-04-04T05:33:01.169Z +blocker_discovered: false +--- + +# T02: Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings + +> Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings + +## What Happened +--- +id: T02 +parent: S04 +milestone: M021 +key_files: + - backend/pipeline/highlight_scorer.py + - backend/pipeline/test_highlight_scorer.py +key_decisions: + - Mapped 7 scoring dimensions to HighlightScoreBreakdown schema fields for downstream compatibility + - Duration fitness uses piecewise linear rather than Gaussian bell curve for predictability +duration: "" +verification_result: passed +completed_at: 2026-04-04T05:33:01.170Z +blocker_discovered: false +--- + +# T02: Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings + +**Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings** + +## What Happened + +Created backend/pipeline/highlight_scorer.py with score_moment() pure function accepting KeyMoment fields + context as keyword args, returning composite score [0,1] with 7-dimension breakdown and duration_secs. Seven scoring dimensions: duration_fitness (0.25 weight, piecewise linear bell curve 30-60s peak), content_type_weight (0.20), specificity_density (0.20, regex-based unit/ratio counting), plugin_richness (0.10), transcript_energy (0.10, teaching-phrase detection), source_quality_weight (0.10), video_type_weight (0.05). Weights verified to sum to 1.0. Created 28 pytest tests across 8 test classes covering ideal/mediocre/poor ordering, edge cases, None handling, and per-function behavior. + +## Verification + +All 28 tests pass. Score ordering: ideal > mediocre > poor confirmed. Edge cases with None/empty/extreme values all produce scores in [0,1]. Slice-level imports of models and schemas verified. 
+ +## Verification Evidence + +| # | Command | Exit Code | Verdict | Duration | +|---|---------|-----------|---------|----------| +| 1 | `python -m pytest backend/pipeline/test_highlight_scorer.py -v` | 0 | ✅ pass | 50ms | +| 2 | `PYTHONPATH=backend python -c "from backend.models import HighlightCandidate, HighlightStatus; print('OK')"` | 0 | ✅ pass | 500ms | +| 3 | `python -c "from backend.pipeline.highlight_schemas import HighlightCandidateResponse, HighlightScoreBreakdown, HighlightBatchResult; print('OK')"` | 0 | ✅ pass | 400ms | + + +## Deviations + +None. + +## Known Issues + +None. + +## Files Created/Modified + +- `backend/pipeline/highlight_scorer.py` +- `backend/pipeline/test_highlight_scorer.py` + + +## Deviations +None. + +## Known Issues +None. diff --git a/backend/pipeline/highlight_scorer.py b/backend/pipeline/highlight_scorer.py new file mode 100644 index 0000000..af712d3 --- /dev/null +++ b/backend/pipeline/highlight_scorer.py @@ -0,0 +1,244 @@ +"""Heuristic scoring engine for highlight candidate detection. + +Takes KeyMoment data + context (source quality, video content type) and +returns a composite score in [0, 1] with a 7-dimension breakdown. 
+ +The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py: + duration_score, content_density_score, technique_relevance_score, + position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score +""" + +from __future__ import annotations + +import math +import re +from typing import Any + + +# ── Weights per dimension (must sum to 1.0) ────────────────────────────────── + +_WEIGHTS: dict[str, float] = { + "duration_score": 0.25, + "content_density_score": 0.20, + "technique_relevance_score": 0.20, + "plugin_diversity_score": 0.10, + "engagement_proxy_score": 0.10, + "position_score": 0.10, # mapped from source_quality + "uniqueness_score": 0.05, # mapped from video_type +} + +assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0" + + +# ── Individual scoring functions ───────────────────────────────────────────── + +def _duration_fitness(duration_secs: float) -> float: + """Bell-curve around 30-60s sweet spot. + + Peak at 30-60s (score 1.0), penalty below 15s and above 120s, + zero above 300s. + """ + if duration_secs <= 0: + return 0.0 + if duration_secs >= 300: + return 0.0 + + # Sweet spot: 30-60s → 1.0 + if 30 <= duration_secs <= 60: + return 1.0 + + # Below sweet spot: linear ramp from 0 at 0s to 1.0 at 30s + # with steeper penalty below 15s + if duration_secs < 30: + if duration_secs < 15: + return duration_secs / 30.0 # 0→0.5 over 0-15s + return 0.5 + (duration_secs - 15) / 30.0 # 0.5→1.0 over 15-30s + + # Above sweet spot: gradual decay from 1.0 at 60s to 0.0 at 300s + return max(0.0, 1.0 - (duration_secs - 60) / 240.0) + + +def _content_type_weight(content_type: str | None) -> float: + """Score based on KeyMoment content_type. 
+ + technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4 + """ + mapping = { + "technique": 1.0, + "settings": 0.8, + "workflow": 0.6, + "reasoning": 0.4, + } + return mapping.get(content_type or "", 0.5) + + +def _specificity_density(summary: str | None) -> float: + """Score based on specificity signals in the summary. + + Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios) + normalized by summary length. + """ + if not summary: + return 0.0 + + words = summary.split() + word_count = len(words) + if word_count == 0: + return 0.0 + + # Patterns that indicate specificity + specificity_patterns = [ + r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b", # units + r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b", # ratios like 3/4 + r"\b\d{2,}\b", # multi-digit numbers + r"\b\d+\.\d+\b", # decimal numbers + ] + + hits = 0 + for pattern in specificity_patterns: + hits += len(re.findall(pattern, summary, re.IGNORECASE)) + + # Normalize: ~1 specific value per 10 words is high density + density = hits / (word_count / 10.0) + return min(density, 1.0) + + +def _plugin_richness(plugins: list[str] | None) -> float: + """Score based on number of plugins mentioned. + + min(len(plugins) / 3, 1.0) + """ + if not plugins: + return 0.0 + return min(len(plugins) / 3.0, 1.0) + + +def _transcript_energy(raw_transcript: str | None) -> float: + """Score based on teaching/engagement phrases in transcript. + + Counts teaching phrases ('the trick is', 'notice how', 'because', + 'I always', 'the key is', 'what I do') normalized by transcript + word count. 
+ """ + if not raw_transcript: + return 0.0 + + words = raw_transcript.split() + word_count = len(words) + if word_count == 0: + return 0.0 + + teaching_phrases = [ + "the trick is", + "notice how", + "because", + "i always", + "the key is", + "what i do", + "important thing", + "you want to", + "make sure", + "here's why", + ] + + text_lower = raw_transcript.lower() + hits = sum(text_lower.count(phrase) for phrase in teaching_phrases) + + # Normalize: ~1 phrase per 50 words is high energy + energy = hits / (word_count / 50.0) + return min(energy, 1.0) + + +def _source_quality_weight(source_quality: str | None) -> float: + """Score based on TechniquePage source_quality. + + structured=1.0, mixed=0.7, unstructured=0.4, None=0.5 + """ + mapping = { + "structured": 1.0, + "mixed": 0.7, + "unstructured": 0.4, + } + return mapping.get(source_quality or "", 0.5) + + +def _video_type_weight(video_content_type: str | None) -> float: + """Score based on SourceVideo content_type. + + tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3 + """ + mapping = { + "tutorial": 1.0, + "breakdown": 0.9, + "livestream": 0.5, + "short_form": 0.3, + } + return mapping.get(video_content_type or "", 0.5) + + +# ── Main scoring function ─────────────────────────────────────────────────── + +def score_moment( + *, + start_time: float, + end_time: float, + content_type: str | None = None, + summary: str | None = None, + plugins: list[str] | None = None, + raw_transcript: str | None = None, + source_quality: str | None = None, + video_content_type: str | None = None, +) -> dict[str, Any]: + """Score a KeyMoment for highlight potential. + + Parameters + ---------- + start_time : float + Moment start in seconds. + end_time : float + Moment end in seconds. + content_type : str | None + KeyMoment content type (technique, settings, workflow, reasoning). + summary : str | None + KeyMoment summary text. + plugins : list[str] | None + Plugins mentioned in the moment. 
def score_moment(
    *,
    start_time: float,
    end_time: float,
    content_type: str | None = None,
    summary: str | None = None,
    plugins: list[str] | None = None,
    raw_transcript: str | None = None,
    source_quality: str | None = None,
    video_content_type: str | None = None,
) -> dict[str, Any]:
    """Score a KeyMoment for highlight potential.

    Parameters
    ----------
    start_time : float
        Moment start in seconds.
    end_time : float
        Moment end in seconds.
    content_type : str | None
        KeyMoment content type (technique, settings, workflow, reasoning).
    summary : str | None
        KeyMoment summary text.
    plugins : list[str] | None
        Plugins mentioned in the moment.
    raw_transcript : str | None
        Raw transcript text of the moment.
    source_quality : str | None
        TechniquePage source quality (structured, mixed, unstructured).
    video_content_type : str | None
        SourceVideo content type (tutorial, breakdown, livestream, short_form).

    Returns
    -------
    dict with keys:
        score : float in [0.0, 1.0]
        score_breakdown : dict mapping dimension names to float scores
        duration_secs : float
    """
    # Negative spans (end before start) are treated as zero duration.
    duration_secs = max(0.0, end_time - start_time)

    # Per-dimension scores, keyed by HighlightScoreBreakdown field names.
    dims = {
        "duration_score": _duration_fitness(duration_secs),
        "content_density_score": _specificity_density(summary),
        "technique_relevance_score": _content_type_weight(content_type),
        "plugin_diversity_score": _plugin_richness(plugins),
        "engagement_proxy_score": _transcript_energy(raw_transcript),
        "position_score": _source_quality_weight(source_quality),
        "uniqueness_score": _video_type_weight(video_content_type),
    }

    # Weighted composite over the seven dimensions, clamped to [0, 1]
    # as a safety net (weights sum to 1.0, so this is belt-and-braces).
    total = sum(dims[name] * weight for name, weight in _WEIGHTS.items())
    total = min(1.0, max(0.0, total))

    return {
        "score": total,
        "score_breakdown": dims,
        "duration_secs": duration_secs,
    }
+""" + +from __future__ import annotations + +import pytest + +from backend.pipeline.highlight_scorer import ( + _content_type_weight, + _duration_fitness, + _plugin_richness, + _source_quality_weight, + _specificity_density, + _transcript_energy, + _video_type_weight, + score_moment, +) + + +# ── Fixture helpers ────────────────────────────────────────────────────────── + +def _ideal_moment() -> dict: + """45s technique moment, 3 plugins, specific summary, structured source.""" + return dict( + start_time=10.0, + end_time=55.0, # 45s duration + content_type="technique", + summary=( + "Set the compressor threshold to -18 dB with a 4:1 ratio, " + "then boost the high shelf at 12 kHz by 3.5 dB using FabFilter Pro-Q 3." + ), + plugins=["FabFilter Pro-Q 3", "SSL G-Bus Compressor", "Valhalla Room"], + raw_transcript=( + "The trick is to set the threshold low enough. Notice how " + "the compressor grabs the transients. Because we want to preserve " + "the dynamics, I always back off the ratio. The key is finding " + "that sweet spot where it's controlling but not squashing." + ), + source_quality="structured", + video_content_type="tutorial", + ) + + +def _mediocre_moment() -> dict: + """90s settings moment, 1 plugin, decent summary, mixed source.""" + return dict( + start_time=120.0, + end_time=210.0, # 90s duration + content_type="settings", + summary="Adjust the EQ settings for the vocal track to get a clearer sound.", + plugins=["FabFilter Pro-Q 3"], + raw_transcript=( + "So here we're just going to adjust this. I think it sounds " + "better when we cut some of the low end. Let me show you what " + "I mean. Yeah, that's better." 
+ ), + source_quality="mixed", + video_content_type="breakdown", + ) + + +def _poor_moment() -> dict: + """300s reasoning moment, 0 plugins, vague summary, unstructured source.""" + return dict( + start_time=0.0, + end_time=300.0, # 300s duration → zero for duration_fitness + content_type="reasoning", + summary="General discussion about mixing philosophy and approach.", + plugins=[], + raw_transcript=( + "I think mixing is really about taste. Everyone has their own " + "approach. Some people like it loud, some people like it quiet. " + "There's no right or wrong way to do it really." + ), + source_quality="unstructured", + video_content_type="livestream", + ) + + +# ── Tests ──────────────────────────────────────────────────────────────────── + +class TestScoreMoment: + def test_ideal_moment_scores_high(self): + result = score_moment(**_ideal_moment()) + assert result["score"] > 0.7, f"Ideal moment scored {result['score']}, expected > 0.7" + + def test_poor_moment_scores_low(self): + result = score_moment(**_poor_moment()) + assert result["score"] < 0.4, f"Poor moment scored {result['score']}, expected < 0.4" + + def test_ordering_is_sensible(self): + ideal = score_moment(**_ideal_moment()) + mediocre = score_moment(**_mediocre_moment()) + poor = score_moment(**_poor_moment()) + + assert ideal["score"] > mediocre["score"] > poor["score"], ( + f"Expected ideal ({ideal['score']:.3f}) > " + f"mediocre ({mediocre['score']:.3f}) > " + f"poor ({poor['score']:.3f})" + ) + + def test_score_bounds(self): + """All scores in [0.0, 1.0] for edge cases.""" + edge_cases = [ + dict(start_time=0, end_time=0, summary="", plugins=None, raw_transcript=None), + dict(start_time=0, end_time=500, summary=None, plugins=[], raw_transcript=""), + dict(start_time=0, end_time=45, summary="x" * 10000, plugins=["a"] * 100), + dict(start_time=100, end_time=100), # zero duration + ] + for kwargs in edge_cases: + result = score_moment(**kwargs) + assert 0.0 <= result["score"] <= 1.0, f"Score 
class TestScoreMoment:
    """End-to-end checks on score_moment()."""

    def test_ideal_moment_scores_high(self):
        res = score_moment(**_ideal_moment())
        assert res["score"] > 0.7, f"Ideal moment scored {res['score']}, expected > 0.7"

    def test_poor_moment_scores_low(self):
        res = score_moment(**_poor_moment())
        assert res["score"] < 0.4, f"Poor moment scored {res['score']}, expected < 0.4"

    def test_ordering_is_sensible(self):
        hi = score_moment(**_ideal_moment())
        mid = score_moment(**_mediocre_moment())
        lo = score_moment(**_poor_moment())

        assert hi["score"] > mid["score"] > lo["score"], (
            f"Expected ideal ({hi['score']:.3f}) > "
            f"mediocre ({mid['score']:.3f}) > "
            f"poor ({lo['score']:.3f})"
        )

    def test_score_bounds(self):
        """All scores in [0.0, 1.0] for edge cases."""
        edge_cases = [
            dict(start_time=0, end_time=0, summary="", plugins=None, raw_transcript=None),
            dict(start_time=0, end_time=500, summary=None, plugins=[], raw_transcript=""),
            dict(start_time=0, end_time=45, summary="x" * 10000, plugins=["a"] * 100),
            dict(start_time=100, end_time=100),  # zero duration
        ]
        for kwargs in edge_cases:
            res = score_moment(**kwargs)
            assert 0.0 <= res["score"] <= 1.0, f"Score {res['score']} out of bounds for {kwargs}"
            for dim, val in res["score_breakdown"].items():
                assert 0.0 <= val <= 1.0, f"{dim}={val} out of bounds for {kwargs}"

    def test_missing_optional_fields(self):
        """None raw_transcript and None plugins don't crash."""
        res = score_moment(
            start_time=10.0,
            end_time=55.0,
            content_type="technique",
            summary="A summary.",
            plugins=None,
            raw_transcript=None,
            source_quality=None,
            video_content_type=None,
        )
        assert 0.0 <= res["score"] <= 1.0
        assert res["duration_secs"] == 45.0
        assert len(res["score_breakdown"]) == 7

    def test_returns_duration_secs(self):
        assert score_moment(start_time=10.0, end_time=55.0)["duration_secs"] == 45.0

    def test_breakdown_has_seven_dimensions(self):
        res = score_moment(**_ideal_moment())
        assert len(res["score_breakdown"]) == 7
        assert set(res["score_breakdown"]) == {
            "duration_score", "content_density_score", "technique_relevance_score",
            "plugin_diversity_score", "engagement_proxy_score", "position_score",
            "uniqueness_score",
        }


class TestDurationFitness:
    """Shape checks on the piecewise duration curve."""

    def test_bell_curve_peak(self):
        """45s scores higher than 10s, 10s scores higher than 400s."""
        assert _duration_fitness(45) > _duration_fitness(10) > _duration_fitness(400)

    def test_sweet_spot(self):
        for secs in (30, 45, 60):
            assert _duration_fitness(secs) == 1.0

    def test_zero_at_extremes(self):
        for secs in (0, 300, 500):
            assert _duration_fitness(secs) == 0.0

    def test_negative_duration(self):
        assert _duration_fitness(-10) == 0.0


class TestContentTypeWeight:
    """Categorical weights for KeyMoment content_type."""

    def test_technique_highest(self):
        assert _content_type_weight("technique") == 1.0

    def test_reasoning_lowest_known(self):
        assert _content_type_weight("reasoning") == 0.4

    def test_unknown_gets_default(self):
        for value in ("unknown", None):
            assert _content_type_weight(value) == 0.5
class TestSpecificityDensity:
    """Density of concrete values in summary text."""

    def test_specific_summary_scores_high(self):
        dense = "Set threshold to -18 dB with 4:1 ratio, boost 12 kHz by 3.5 dB"
        assert _specificity_density(dense) > 0.5

    def test_vague_summary_scores_low(self):
        assert _specificity_density("General discussion about mixing philosophy.") < 0.3

    def test_empty_returns_zero(self):
        for blank in ("", None):
            assert _specificity_density(blank) == 0.0


class TestPluginRichness:
    """Plugin count capped at 3 for full credit."""

    def test_three_plugins_maxes_out(self):
        assert _plugin_richness(["a", "b", "c"]) == 1.0

    def test_more_than_three_capped(self):
        assert _plugin_richness(["a", "b", "c", "d"]) == 1.0

    def test_empty(self):
        for empty in ([], None):
            assert _plugin_richness(empty) == 0.0


class TestTranscriptEnergy:
    """Teaching-phrase density in raw transcripts."""

    def test_teaching_phrases_score_high(self):
        speech = (
            "The trick is to notice how the compressor behaves. "
            "Because we want dynamics, I always set it gently. The key is balance."
        )
        assert _transcript_energy(speech) > 0.5

    def test_bland_transcript_scores_low(self):
        bland = "And then we adjust this slider here. Okay that sounds fine."
        assert _transcript_energy(bland) < 0.3

    def test_empty(self):
        for blank in ("", None):
            assert _transcript_energy(blank) == 0.0


class TestSourceQualityWeight:
    """Categorical weights for TechniquePage source_quality."""

    def test_structured_highest(self):
        assert _source_quality_weight("structured") == 1.0

    def test_none_default(self):
        assert _source_quality_weight(None) == 0.5


class TestVideoTypeWeight:
    """Categorical weights for SourceVideo content_type."""

    def test_tutorial_highest(self):
        assert _video_type_weight("tutorial") == 1.0

    def test_short_form_lowest(self):
        assert _video_type_weight("short_form") == 0.3

    def test_none_default(self):
        assert _video_type_weight(None) == 0.5