From 2d7b812c6a885faa4858707483b2a31bbb106da0 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sat, 4 Apr 2026 05:33:04 +0000 Subject: [PATCH] =?UTF-8?q?test:=20Implemented=20pure-function=20scoring?= =?UTF-8?q?=20engine=20with=207=20weighted=20dimensio=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/pipeline/highlight_scorer.py" - "backend/pipeline/test_highlight_scorer.py" GSD-Task: S04/T02 --- .gsd/milestones/M021/slices/S04/S04-PLAN.md | 2 +- .../M021/slices/S04/tasks/T01-VERIFY.json | 9 + .../M021/slices/S04/tasks/T02-SUMMARY.md | 79 ++++++ backend/pipeline/highlight_scorer.py | 244 ++++++++++++++++++ backend/pipeline/test_highlight_scorer.py | 244 ++++++++++++++++++ 5 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 .gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json create mode 100644 .gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md create mode 100644 backend/pipeline/highlight_scorer.py create mode 100644 backend/pipeline/test_highlight_scorer.py diff --git a/.gsd/milestones/M021/slices/S04/S04-PLAN.md b/.gsd/milestones/M021/slices/S04/S04-PLAN.md index 0fb5424..c780cd6 100644 --- a/.gsd/milestones/M021/slices/S04/S04-PLAN.md +++ b/.gsd/milestones/M021/slices/S04/S04-PLAN.md @@ -30,7 +30,7 @@ - Estimate: 30m - Files: backend/models.py, alembic/versions/019_add_highlight_candidates.py, backend/pipeline/highlight_schemas.py - Verify: python -c "from backend.models import HighlightCandidate, HighlightStatus; print('OK')" && python -c "from backend.pipeline.highlight_schemas import HighlightCandidateResponse, HighlightScoreBreakdown, HighlightBatchResult; print('OK')" -- [ ] **T02: Implement highlight scoring engine with unit tests** — Build the pure-function scoring engine that takes KeyMoment data + context and returns a scored HighlightCandidate. This is the riskiest piece — if scores are garbage, the whole feature is useless. 
Unit tests with realistic fixture data prove the heuristic produces sensible orderings. +- [x] **T02: Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings** — Build the pure-function scoring engine that takes KeyMoment data + context and returns a scored HighlightCandidate. This is the riskiest piece — if scores are garbage, the whole feature is useless. Unit tests with realistic fixture data prove the heuristic produces sensible orderings. ## Steps diff --git a/.gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json b/.gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json new file mode 100644 index 0000000..f993334 --- /dev/null +++ b/.gsd/milestones/M021/slices/S04/tasks/T01-VERIFY.json @@ -0,0 +1,9 @@ +{ + "schemaVersion": 1, + "taskId": "T01", + "unitId": "M021/S04/T01", + "timestamp": 1775280636911, + "passed": true, + "discoverySource": "none", + "checks": [] +} diff --git a/.gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md b/.gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md new file mode 100644 index 0000000..f1eab7e --- /dev/null +++ b/.gsd/milestones/M021/slices/S04/tasks/T02-SUMMARY.md @@ -0,0 +1,79 @@ +--- +id: T02 +parent: S04 +milestone: M021 +provides: [] +requires: [] +affects: [] +key_files: ["backend/pipeline/highlight_scorer.py", "backend/pipeline/test_highlight_scorer.py"] +key_decisions: ["Mapped 7 scoring dimensions to HighlightScoreBreakdown schema fields for downstream compatibility", "Duration fitness uses piecewise linear rather than Gaussian bell curve for predictability"] +patterns_established: [] +drill_down_paths: [] +observability_surfaces: [] +duration: "" +verification_result: "All 28 tests pass. Score ordering: ideal > mediocre > poor confirmed. Edge cases with None/empty/extreme values all produce scores in [0,1]. Slice-level imports of models and schemas verified." 
+completed_at: 2026-04-04T05:33:01.169Z +blocker_discovered: false +--- + +# T02: Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings + +> Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings + +## What Happened +--- +id: T02 +parent: S04 +milestone: M021 +key_files: + - backend/pipeline/highlight_scorer.py + - backend/pipeline/test_highlight_scorer.py +key_decisions: + - Mapped 7 scoring dimensions to HighlightScoreBreakdown schema fields for downstream compatibility + - Duration fitness uses piecewise linear rather than Gaussian bell curve for predictability +duration: "" +verification_result: passed +completed_at: 2026-04-04T05:33:01.170Z +blocker_discovered: false +--- + +# T02: Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings + +**Implemented pure-function scoring engine with 7 weighted dimensions and 28 unit tests proving sensible orderings** + +## What Happened + +Created backend/pipeline/highlight_scorer.py with score_moment() pure function accepting KeyMoment fields + context as keyword args, returning composite score [0,1] with 7-dimension breakdown and duration_secs. Seven scoring dimensions: duration_fitness (0.25 weight, piecewise linear bell curve 30-60s peak), content_type_weight (0.20), specificity_density (0.20, regex-based unit/ratio counting), plugin_richness (0.10), transcript_energy (0.10, teaching-phrase detection), source_quality_weight (0.10), video_type_weight (0.05). Weights verified to sum to 1.0. Created 28 pytest tests across 8 test classes covering ideal/mediocre/poor ordering, edge cases, None handling, and per-function behavior. + +## Verification + +All 28 tests pass. Score ordering: ideal > mediocre > poor confirmed. Edge cases with None/empty/extreme values all produce scores in [0,1]. Slice-level imports of models and schemas verified. 
+ +## Verification Evidence + +| # | Command | Exit Code | Verdict | Duration | +|---|---------|-----------|---------|----------| +| 1 | `python -m pytest backend/pipeline/test_highlight_scorer.py -v` | 0 | ✅ pass | 50ms | +| 2 | `PYTHONPATH=backend python -c "from backend.models import HighlightCandidate, HighlightStatus; print('OK')"` | 0 | ✅ pass | 500ms | +| 3 | `python -c "from backend.pipeline.highlight_schemas import HighlightCandidateResponse, HighlightScoreBreakdown, HighlightBatchResult; print('OK')"` | 0 | ✅ pass | 400ms | + + +## Deviations + +None. + +## Known Issues + +None. + +## Files Created/Modified + +- `backend/pipeline/highlight_scorer.py` +- `backend/pipeline/test_highlight_scorer.py` + + +## Deviations +None. + +## Known Issues +None. diff --git a/backend/pipeline/highlight_scorer.py b/backend/pipeline/highlight_scorer.py new file mode 100644 index 0000000..af712d3 --- /dev/null +++ b/backend/pipeline/highlight_scorer.py @@ -0,0 +1,244 @@ +"""Heuristic scoring engine for highlight candidate detection. + +Takes KeyMoment data + context (source quality, video content type) and +returns a composite score in [0, 1] with a 7-dimension breakdown. 
+ +The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py: + duration_score, content_density_score, technique_relevance_score, + position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score +""" + +from __future__ import annotations + +import math +import re +from typing import Any + + +# ── Weights per dimension (must sum to 1.0) ────────────────────────────────── + +_WEIGHTS: dict[str, float] = { + "duration_score": 0.25, + "content_density_score": 0.20, + "technique_relevance_score": 0.20, + "plugin_diversity_score": 0.10, + "engagement_proxy_score": 0.10, + "position_score": 0.10, # mapped from source_quality + "uniqueness_score": 0.05, # mapped from video_type +} + +assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0" + + +# ── Individual scoring functions ───────────────────────────────────────────── + +def _duration_fitness(duration_secs: float) -> float: + """Bell-curve around 30-60s sweet spot. + + Peak at 30-60s (score 1.0), penalty below 15s and above 120s, + zero above 300s. + """ + if duration_secs <= 0: + return 0.0 + if duration_secs >= 300: + return 0.0 + + # Sweet spot: 30-60s → 1.0 + if 30 <= duration_secs <= 60: + return 1.0 + + # Below sweet spot: linear ramp from 0 at 0s to 1.0 at 30s + # with steeper penalty below 15s + if duration_secs < 30: + if duration_secs < 15: + return duration_secs / 30.0 # 0→0.5 over 0-15s + return 0.5 + (duration_secs - 15) / 30.0 # 0.5→1.0 over 15-30s + + # Above sweet spot: gradual decay from 1.0 at 60s to 0.0 at 300s + return max(0.0, 1.0 - (duration_secs - 60) / 240.0) + + +def _content_type_weight(content_type: str | None) -> float: + """Score based on KeyMoment content_type. 
+ + technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4 + """ + mapping = { + "technique": 1.0, + "settings": 0.8, + "workflow": 0.6, + "reasoning": 0.4, + } + return mapping.get(content_type or "", 0.5) + + +def _specificity_density(summary: str | None) -> float: + """Score based on specificity signals in the summary. + + Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios) + normalized by summary length. + """ + if not summary: + return 0.0 + + words = summary.split() + word_count = len(words) + if word_count == 0: + return 0.0 + + # Patterns that indicate specificity + specificity_patterns = [ + r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b", # units + r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b", # ratios like 3/4 + r"\b\d{2,}\b", # multi-digit numbers + r"\b\d+\.\d+\b", # decimal numbers + ] + + hits = 0 + for pattern in specificity_patterns: + hits += len(re.findall(pattern, summary, re.IGNORECASE)) + + # Normalize: ~1 specific value per 10 words is high density + density = hits / (word_count / 10.0) + return min(density, 1.0) + + +def _plugin_richness(plugins: list[str] | None) -> float: + """Score based on number of plugins mentioned. + + min(len(plugins) / 3, 1.0) + """ + if not plugins: + return 0.0 + return min(len(plugins) / 3.0, 1.0) + + +def _transcript_energy(raw_transcript: str | None) -> float: + """Score based on teaching/engagement phrases in transcript. + + Counts teaching phrases ('the trick is', 'notice how', 'because', + 'I always', 'the key is', 'what I do') normalized by transcript + word count. 
+ """ + if not raw_transcript: + return 0.0 + + words = raw_transcript.split() + word_count = len(words) + if word_count == 0: + return 0.0 + + teaching_phrases = [ + "the trick is", + "notice how", + "because", + "i always", + "the key is", + "what i do", + "important thing", + "you want to", + "make sure", + "here's why", + ] + + text_lower = raw_transcript.lower() + hits = sum(text_lower.count(phrase) for phrase in teaching_phrases) + + # Normalize: ~1 phrase per 50 words is high energy + energy = hits / (word_count / 50.0) + return min(energy, 1.0) + + +def _source_quality_weight(source_quality: str | None) -> float: + """Score based on TechniquePage source_quality. + + structured=1.0, mixed=0.7, unstructured=0.4, None=0.5 + """ + mapping = { + "structured": 1.0, + "mixed": 0.7, + "unstructured": 0.4, + } + return mapping.get(source_quality or "", 0.5) + + +def _video_type_weight(video_content_type: str | None) -> float: + """Score based on SourceVideo content_type. + + tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3 + """ + mapping = { + "tutorial": 1.0, + "breakdown": 0.9, + "livestream": 0.5, + "short_form": 0.3, + } + return mapping.get(video_content_type or "", 0.5) + + +# ── Main scoring function ─────────────────────────────────────────────────── + +def score_moment( + *, + start_time: float, + end_time: float, + content_type: str | None = None, + summary: str | None = None, + plugins: list[str] | None = None, + raw_transcript: str | None = None, + source_quality: str | None = None, + video_content_type: str | None = None, +) -> dict[str, Any]: + """Score a KeyMoment for highlight potential. + + Parameters + ---------- + start_time : float + Moment start in seconds. + end_time : float + Moment end in seconds. + content_type : str | None + KeyMoment content type (technique, settings, workflow, reasoning). + summary : str | None + KeyMoment summary text. + plugins : list[str] | None + Plugins mentioned in the moment. 
def score_moment(
    *,
    start_time: float,
    end_time: float,
    content_type: str | None = None,
    summary: str | None = None,
    plugins: list[str] | None = None,
    raw_transcript: str | None = None,
    source_quality: str | None = None,
    video_content_type: str | None = None,
) -> dict[str, Any]:
    """Score a KeyMoment for highlight potential.

    Parameters
    ----------
    start_time : float
        Moment start in seconds.
    end_time : float
        Moment end in seconds.
    content_type : str | None
        KeyMoment content type (technique, settings, workflow, reasoning).
    summary : str | None
        KeyMoment summary text.
    plugins : list[str] | None
        Plugins mentioned in the moment.
    raw_transcript : str | None
        Raw transcript text of the moment.
    source_quality : str | None
        TechniquePage source quality (structured, mixed, unstructured).
    video_content_type : str | None
        SourceVideo content type (tutorial, breakdown, livestream, short_form).

    Returns
    -------
    dict with keys:
        score : float in [0.0, 1.0]
        score_breakdown : dict mapping dimension names to float scores
        duration_secs : float
    """
    # Negative spans (end before start) are treated as zero duration.
    duration_secs = max(0.0, end_time - start_time)

    # Per-dimension scores, keyed by HighlightScoreBreakdown field names.
    dims = {
        "duration_score": _duration_fitness(duration_secs),
        "content_density_score": _specificity_density(summary),
        "technique_relevance_score": _content_type_weight(content_type),
        "plugin_diversity_score": _plugin_richness(plugins),
        "engagement_proxy_score": _transcript_energy(raw_transcript),
        "position_score": _source_quality_weight(source_quality),
        "uniqueness_score": _video_type_weight(video_content_type),
    }

    # Weighted composite over the seven dimensions, clamped to [0, 1]
    # as a safety net (weights sum to 1.0, so this is belt-and-braces).
    total = sum(dims[name] * weight for name, weight in _WEIGHTS.items())
    total = min(1.0, max(0.0, total))

    return {
        "score": total,
        "score_breakdown": dims,
        "duration_secs": duration_secs,
    }
+""" + +from __future__ import annotations + +import pytest + +from backend.pipeline.highlight_scorer import ( + _content_type_weight, + _duration_fitness, + _plugin_richness, + _source_quality_weight, + _specificity_density, + _transcript_energy, + _video_type_weight, + score_moment, +) + + +# ── Fixture helpers ────────────────────────────────────────────────────────── + +def _ideal_moment() -> dict: + """45s technique moment, 3 plugins, specific summary, structured source.""" + return dict( + start_time=10.0, + end_time=55.0, # 45s duration + content_type="technique", + summary=( + "Set the compressor threshold to -18 dB with a 4:1 ratio, " + "then boost the high shelf at 12 kHz by 3.5 dB using FabFilter Pro-Q 3." + ), + plugins=["FabFilter Pro-Q 3", "SSL G-Bus Compressor", "Valhalla Room"], + raw_transcript=( + "The trick is to set the threshold low enough. Notice how " + "the compressor grabs the transients. Because we want to preserve " + "the dynamics, I always back off the ratio. The key is finding " + "that sweet spot where it's controlling but not squashing." + ), + source_quality="structured", + video_content_type="tutorial", + ) + + +def _mediocre_moment() -> dict: + """90s settings moment, 1 plugin, decent summary, mixed source.""" + return dict( + start_time=120.0, + end_time=210.0, # 90s duration + content_type="settings", + summary="Adjust the EQ settings for the vocal track to get a clearer sound.", + plugins=["FabFilter Pro-Q 3"], + raw_transcript=( + "So here we're just going to adjust this. I think it sounds " + "better when we cut some of the low end. Let me show you what " + "I mean. Yeah, that's better." 
+ ), + source_quality="mixed", + video_content_type="breakdown", + ) + + +def _poor_moment() -> dict: + """300s reasoning moment, 0 plugins, vague summary, unstructured source.""" + return dict( + start_time=0.0, + end_time=300.0, # 300s duration → zero for duration_fitness + content_type="reasoning", + summary="General discussion about mixing philosophy and approach.", + plugins=[], + raw_transcript=( + "I think mixing is really about taste. Everyone has their own " + "approach. Some people like it loud, some people like it quiet. " + "There's no right or wrong way to do it really." + ), + source_quality="unstructured", + video_content_type="livestream", + ) + + +# ── Tests ──────────────────────────────────────────────────────────────────── + +class TestScoreMoment: + def test_ideal_moment_scores_high(self): + result = score_moment(**_ideal_moment()) + assert result["score"] > 0.7, f"Ideal moment scored {result['score']}, expected > 0.7" + + def test_poor_moment_scores_low(self): + result = score_moment(**_poor_moment()) + assert result["score"] < 0.4, f"Poor moment scored {result['score']}, expected < 0.4" + + def test_ordering_is_sensible(self): + ideal = score_moment(**_ideal_moment()) + mediocre = score_moment(**_mediocre_moment()) + poor = score_moment(**_poor_moment()) + + assert ideal["score"] > mediocre["score"] > poor["score"], ( + f"Expected ideal ({ideal['score']:.3f}) > " + f"mediocre ({mediocre['score']:.3f}) > " + f"poor ({poor['score']:.3f})" + ) + + def test_score_bounds(self): + """All scores in [0.0, 1.0] for edge cases.""" + edge_cases = [ + dict(start_time=0, end_time=0, summary="", plugins=None, raw_transcript=None), + dict(start_time=0, end_time=500, summary=None, plugins=[], raw_transcript=""), + dict(start_time=0, end_time=45, summary="x" * 10000, plugins=["a"] * 100), + dict(start_time=100, end_time=100), # zero duration + ] + for kwargs in edge_cases: + result = score_moment(**kwargs) + assert 0.0 <= result["score"] <= 1.0, f"Score 
class TestScoreMoment:
    """End-to-end checks on score_moment()."""

    def test_ideal_moment_scores_high(self):
        res = score_moment(**_ideal_moment())
        assert res["score"] > 0.7, f"Ideal moment scored {res['score']}, expected > 0.7"

    def test_poor_moment_scores_low(self):
        res = score_moment(**_poor_moment())
        assert res["score"] < 0.4, f"Poor moment scored {res['score']}, expected < 0.4"

    def test_ordering_is_sensible(self):
        hi = score_moment(**_ideal_moment())
        mid = score_moment(**_mediocre_moment())
        lo = score_moment(**_poor_moment())

        assert hi["score"] > mid["score"] > lo["score"], (
            f"Expected ideal ({hi['score']:.3f}) > "
            f"mediocre ({mid['score']:.3f}) > "
            f"poor ({lo['score']:.3f})"
        )

    def test_score_bounds(self):
        """All scores in [0.0, 1.0] for edge cases."""
        edge_cases = [
            dict(start_time=0, end_time=0, summary="", plugins=None, raw_transcript=None),
            dict(start_time=0, end_time=500, summary=None, plugins=[], raw_transcript=""),
            dict(start_time=0, end_time=45, summary="x" * 10000, plugins=["a"] * 100),
            dict(start_time=100, end_time=100),  # zero duration
        ]
        for kwargs in edge_cases:
            res = score_moment(**kwargs)
            assert 0.0 <= res["score"] <= 1.0, f"Score {res['score']} out of bounds for {kwargs}"
            for dim, val in res["score_breakdown"].items():
                assert 0.0 <= val <= 1.0, f"{dim}={val} out of bounds for {kwargs}"

    def test_missing_optional_fields(self):
        """None raw_transcript and None plugins don't crash."""
        res = score_moment(
            start_time=10.0,
            end_time=55.0,
            content_type="technique",
            summary="A summary.",
            plugins=None,
            raw_transcript=None,
            source_quality=None,
            video_content_type=None,
        )
        assert 0.0 <= res["score"] <= 1.0
        assert res["duration_secs"] == 45.0
        assert len(res["score_breakdown"]) == 7

    def test_returns_duration_secs(self):
        assert score_moment(start_time=10.0, end_time=55.0)["duration_secs"] == 45.0

    def test_breakdown_has_seven_dimensions(self):
        res = score_moment(**_ideal_moment())
        assert len(res["score_breakdown"]) == 7
        assert set(res["score_breakdown"]) == {
            "duration_score", "content_density_score", "technique_relevance_score",
            "plugin_diversity_score", "engagement_proxy_score", "position_score",
            "uniqueness_score",
        }


class TestDurationFitness:
    """Shape checks on the piecewise duration curve."""

    def test_bell_curve_peak(self):
        """45s scores higher than 10s, 10s scores higher than 400s."""
        assert _duration_fitness(45) > _duration_fitness(10) > _duration_fitness(400)

    def test_sweet_spot(self):
        for secs in (30, 45, 60):
            assert _duration_fitness(secs) == 1.0

    def test_zero_at_extremes(self):
        for secs in (0, 300, 500):
            assert _duration_fitness(secs) == 0.0

    def test_negative_duration(self):
        assert _duration_fitness(-10) == 0.0


class TestContentTypeWeight:
    """Categorical weights for KeyMoment content_type."""

    def test_technique_highest(self):
        assert _content_type_weight("technique") == 1.0

    def test_reasoning_lowest_known(self):
        assert _content_type_weight("reasoning") == 0.4

    def test_unknown_gets_default(self):
        for value in ("unknown", None):
            assert _content_type_weight(value) == 0.5
class TestSpecificityDensity:
    """Density of concrete values in summary text."""

    def test_specific_summary_scores_high(self):
        dense = "Set threshold to -18 dB with 4:1 ratio, boost 12 kHz by 3.5 dB"
        assert _specificity_density(dense) > 0.5

    def test_vague_summary_scores_low(self):
        assert _specificity_density("General discussion about mixing philosophy.") < 0.3

    def test_empty_returns_zero(self):
        for blank in ("", None):
            assert _specificity_density(blank) == 0.0


class TestPluginRichness:
    """Plugin count capped at 3 for full credit."""

    def test_three_plugins_maxes_out(self):
        assert _plugin_richness(["a", "b", "c"]) == 1.0

    def test_more_than_three_capped(self):
        assert _plugin_richness(["a", "b", "c", "d"]) == 1.0

    def test_empty(self):
        for empty in ([], None):
            assert _plugin_richness(empty) == 0.0


class TestTranscriptEnergy:
    """Teaching-phrase density in raw transcripts."""

    def test_teaching_phrases_score_high(self):
        speech = (
            "The trick is to notice how the compressor behaves. "
            "Because we want dynamics, I always set it gently. The key is balance."
        )
        assert _transcript_energy(speech) > 0.5

    def test_bland_transcript_scores_low(self):
        bland = "And then we adjust this slider here. Okay that sounds fine."
        assert _transcript_energy(bland) < 0.3

    def test_empty(self):
        for blank in ("", None):
            assert _transcript_energy(blank) == 0.0


class TestSourceQualityWeight:
    """Categorical weights for TechniquePage source_quality."""

    def test_structured_highest(self):
        assert _source_quality_weight("structured") == 1.0

    def test_none_default(self):
        assert _source_quality_weight(None) == 0.5


class TestVideoTypeWeight:
    """Categorical weights for SourceVideo content_type."""

    def test_tutorial_highest(self):
        assert _video_type_weight("tutorial") == 1.0

    def test_short_form_lowest(self):
        assert _video_type_weight("short_form") == 0.3

    def test_none_default(self):
        assert _video_type_weight(None) == 0.5