diff --git a/backend/pipeline/highlight_schemas.py b/backend/pipeline/highlight_schemas.py index 983ede6..32163c5 100644 --- a/backend/pipeline/highlight_schemas.py +++ b/backend/pipeline/highlight_schemas.py @@ -25,6 +25,18 @@ class HighlightScoreBreakdown(BaseModel): uniqueness_score: float = Field(description="Score based on title/topic distinctness among siblings") engagement_proxy_score: float = Field(description="Proxy engagement signal from summary quality/length") plugin_diversity_score: float = Field(description="Score based on breadth of plugins/tools mentioned") + speech_rate_variance_score: float = Field( + default=0.5, + description="Score based on speech rate variation (emphasis shifts) from word timings", + ) + pause_density_score: float = Field( + default=0.5, + description="Score based on strategic pause frequency from word timings", + ) + speaking_pace_score: float = Field( + default=0.5, + description="Score based on words-per-second fitness for teaching pace", + ) class HighlightCandidateResponse(BaseModel): diff --git a/backend/pipeline/highlight_scorer.py b/backend/pipeline/highlight_scorer.py index af712d3..5a45261 100644 --- a/backend/pipeline/highlight_scorer.py +++ b/backend/pipeline/highlight_scorer.py @@ -1,30 +1,35 @@ """Heuristic scoring engine for highlight candidate detection. Takes KeyMoment data + context (source quality, video content type) and -returns a composite score in [0, 1] with a 7-dimension breakdown. +returns a composite score in [0, 1] with a 10-dimension breakdown. The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py: duration_score, content_density_score, technique_relevance_score, - position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score + position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score, + speech_rate_variance_score, pause_density_score, speaking_pace_score """ from __future__ import annotations import math import re +import statistics from typing import Any # ── Weights per dimension (must sum to 1.0) ────────────────────────────────── _WEIGHTS: dict[str, float] = { - "duration_score": 0.25, - "content_density_score": 0.20, - "technique_relevance_score": 0.20, - "plugin_diversity_score": 0.10, - "engagement_proxy_score": 0.10, - "position_score": 0.10, # mapped from source_quality - "uniqueness_score": 0.05, # mapped from video_type + "duration_score": 0.20, + "content_density_score": 0.15, + "technique_relevance_score": 0.15, + "plugin_diversity_score": 0.08, + "engagement_proxy_score": 0.08, + "position_score": 0.08, # mapped from source_quality + "uniqueness_score": 0.04, # mapped from video_type + "speech_rate_variance_score": 0.08, + "pause_density_score": 0.07, + "speaking_pace_score": 0.07, } assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0" @@ -176,6 +181,163 @@ def _video_type_weight(video_content_type: str | None) -> float: return mapping.get(video_content_type or "", 0.5) +# ── Audio proxy scoring functions ───────────────────────────────────────────── + +def extract_word_timings( + transcript_data: list[dict[str, Any]], + start_time: float, + end_time: float, +) -> list[dict[str, Any]]: + """Extract word-level timing dicts from transcript segments within a time window. + + Parameters + ---------- + transcript_data : list[dict] + Parsed transcript JSON — list of segments, each with a ``words`` array. + Each word dict must have ``start`` and ``end`` float fields (seconds). + start_time : float + Window start in seconds (inclusive). + end_time : float + Window end in seconds (inclusive). + + Returns + ------- + list[dict] — word-timing dicts whose ``start`` falls within [start_time, end_time]. + """ + if not transcript_data: + return [] + + words: list[dict[str, Any]] = [] + for segment in transcript_data: + seg_words = segment.get("words") + if not seg_words: + continue + for w in seg_words: + w_start = w.get("start") + if w_start is None: + continue + if start_time <= w_start <= end_time: + words.append(w) + return words + + +def _speech_rate_variance(word_timings: list[dict[str, Any]] | None) -> float: + """Compute normalized stdev of words-per-second in sliding windows. + + High variance indicates emphasis shifts (speeding up / slowing down), + which correlates with engaging highlights. + + Uses 5-second sliding windows with 2.5-second step. + Returns 0.5 (neutral) when word_timings is None or insufficient data. + """ + if not word_timings or len(word_timings) < 4: + return 0.5 + + # Determine time span + first_start = word_timings[0].get("start", 0.0) + last_start = word_timings[-1].get("start", 0.0) + span = last_start - first_start + if span < 5.0: + return 0.5 + + # Compute WPS in 5s sliding windows with 2.5s step + window_size = 5.0 + step = 2.5 + wps_values: list[float] = [] + + t = first_start + while t + window_size <= last_start + 0.01: + count = sum( + 1 for w in word_timings + if t <= w.get("start", 0.0) < t + window_size + ) + wps_values.append(count / window_size) + t += step + + if len(wps_values) < 2: + return 0.5 + + mean_wps = statistics.mean(wps_values) + if mean_wps < 0.01: + return 0.5 + + stdev = statistics.stdev(wps_values) + # Normalize: coefficient of variation, capped at 1.0 + # CV of ~0.3-0.5 is typical for varied speech; >0.5 is high variance + cv = stdev / mean_wps + return min(cv / 0.6, 1.0) + + +def _pause_density(word_timings: list[dict[str, Any]] | None) -> float: + """Count strategic pauses normalized by duration. + + Inter-word gaps >0.5s and inter-segment gaps >1.0s indicate deliberate + pauses for emphasis, which correlate with better highlights. + + Returns 0.5 (neutral) when word_timings is None or insufficient data. + """ + if not word_timings or len(word_timings) < 2: + return 0.5 + + first_start = word_timings[0].get("start", 0.0) + last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0)) + duration = last_end - first_start + if duration < 1.0: + return 0.5 + + short_pauses = 0 # >0.5s gaps + long_pauses = 0 # >1.0s gaps + + for i in range(1, len(word_timings)): + prev_end = word_timings[i - 1].get("end", word_timings[i - 1].get("start", 0.0)) + curr_start = word_timings[i].get("start", 0.0) + gap = curr_start - prev_end + + if gap > 1.0: + long_pauses += 1 + elif gap > 0.5: + short_pauses += 1 + + # Weight long pauses more heavily + weighted_pauses = short_pauses + long_pauses * 2.0 + # Normalize: ~2-4 weighted pauses per 30s is good density + density = weighted_pauses / (duration / 15.0) + return min(density, 1.0) + + +def _speaking_pace_fitness(word_timings: list[dict[str, Any]] | None) -> float: + """Bell-curve score around 3-5 words-per-second optimal teaching pace. + + 3-5 WPS is the sweet spot for tutorial content — fast enough to be + engaging, slow enough for comprehension. Returns 0.5 (neutral) when + word_timings is None or insufficient data. + """ + if not word_timings or len(word_timings) < 2: + return 0.5 + + first_start = word_timings[0].get("start", 0.0) + last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0)) + duration = last_end - first_start + if duration < 1.0: + return 0.5 + + wps = len(word_timings) / duration + + # Sweet spot: 3-5 WPS → 1.0 + if 3.0 <= wps <= 5.0: + return 1.0 + + # Below sweet spot: linear ramp from 0 at 0 WPS to 1.0 at 3 WPS + if wps < 3.0: + return max(0.0, wps / 3.0) + + # Above sweet spot: decay from 1.0 at 5 WPS to 0.0 at 10 WPS + if wps > 5.0: + return max(0.0, 1.0 - (wps - 5.0) / 5.0) + + return 0.5 # unreachable, but defensive + + # ── Main scoring function ─────────────────────────────────────────────────── def score_moment( @@ -188,6 +350,7 @@ def score_moment( raw_transcript: str | None = None, source_quality: str | None = None, video_content_type: str | None = None, + word_timings: list[dict[str, Any]] | None = None, ) -> dict[str, Any]: """Score a KeyMoment for highlight potential. @@ -209,6 +372,9 @@ def score_moment( TechniquePage source quality (structured, mixed, unstructured). video_content_type : str | None SourceVideo content type (tutorial, breakdown, livestream, short_form). + word_timings : list[dict] | None + Word-level timing dicts with ``start`` and ``end`` keys (seconds). + When None, audio proxy dimensions score 0.5 (neutral). Returns ------- @@ -227,6 +393,9 @@ def score_moment( "engagement_proxy_score": _transcript_energy(raw_transcript), "position_score": _source_quality_weight(source_quality), "uniqueness_score": _video_type_weight(video_content_type), + "speech_rate_variance_score": _speech_rate_variance(word_timings), + "pause_density_score": _pause_density(word_timings), + "speaking_pace_score": _speaking_pace_fitness(word_timings), } # Weighted composite diff --git a/backend/pipeline/test_highlight_scorer.py b/backend/pipeline/test_highlight_scorer.py index 4d1c5d9..9a8d9b6 100644 --- a/backend/pipeline/test_highlight_scorer.py +++ b/backend/pipeline/test_highlight_scorer.py @@ -11,11 +11,15 @@ import pytest from backend.pipeline.highlight_scorer import ( _content_type_weight, _duration_fitness, + _pause_density, _plugin_richness, _source_quality_weight, + _speaking_pace_fitness, _specificity_density, + _speech_rate_variance, _transcript_energy, _video_type_weight, + extract_word_timings, score_moment, ) @@ -80,6 +84,50 @@ def _poor_moment() -> dict: ) +def _make_word_timings( + start: float = 0.0, + count: int = 40, + wps: float = 4.0, + pause_every: int | None = None, + pause_duration: float = 0.8, +) -> list[dict]: + """Generate synthetic word-timing dicts for testing. + + Parameters + ---------- + start : float + Start time in seconds. + count : int + Number of words to generate. + wps : float + Words per second (base rate). + pause_every : int | None + Insert a pause every N words. None = no pauses. + pause_duration : float + Duration of each pause in seconds. + """ + timings = [] + t = start + word_dur = 1.0 / wps * 0.7 # 70% speaking, 30% normal gap + gap = 1.0 / wps * 0.3 + + for i in range(count): + timings.append({"word": f"word{i}", "start": t, "end": t + word_dur}) + t += word_dur + gap + if pause_every and (i + 1) % pause_every == 0: + t += pause_duration + return timings + + +def _make_transcript_segments(word_timings: list[dict], words_per_segment: int = 10) -> list[dict]: + """Group word timings into transcript segments for extract_word_timings tests.""" + segments = [] + for i in range(0, len(word_timings), words_per_segment): + chunk = word_timings[i : i + words_per_segment] + segments.append({"words": chunk}) + return segments + + # ── Tests ──────────────────────────────────────────────────────────────────── class TestScoreMoment: @@ -130,22 +178,41 @@ class TestScoreMoment: ) assert 0.0 <= result["score"] <= 1.0 assert result["duration_secs"] == 45.0 - assert len(result["score_breakdown"]) == 7 + assert len(result["score_breakdown"]) == 10 def test_returns_duration_secs(self): result = score_moment(start_time=10.0, end_time=55.0) assert result["duration_secs"] == 45.0 - def test_breakdown_has_seven_dimensions(self): + def test_breakdown_has_ten_dimensions(self): result = score_moment(**_ideal_moment()) - assert len(result["score_breakdown"]) == 7 + assert len(result["score_breakdown"]) == 10 expected_keys = { "duration_score", "content_density_score", "technique_relevance_score", "plugin_diversity_score", "engagement_proxy_score", "position_score", - "uniqueness_score", + "uniqueness_score", "speech_rate_variance_score", "pause_density_score", + "speaking_pace_score", } assert set(result["score_breakdown"].keys()) == expected_keys + def test_without_word_timings_audio_dims_are_neutral(self): + """When word_timings is None, audio proxy dimensions score 0.5.""" + result = score_moment(start_time=10.0, end_time=55.0) + bd = result["score_breakdown"] + assert bd["speech_rate_variance_score"] == 0.5 + assert bd["pause_density_score"] == 0.5 + assert bd["speaking_pace_score"] == 0.5 + + def test_with_word_timings_changes_score(self): + """Providing word_timings should shift the composite score vs without.""" + base = _ideal_moment() + without = score_moment(**base) + # Add word timings at a good teaching pace (~4 WPS) with some pauses + timings = _make_word_timings(start=10.0, count=120, wps=4.0, pause_every=15) + with_timings = score_moment(**base, word_timings=timings) + # Scores should differ since audio dims are no longer neutral + assert with_timings["score"] != without["score"] + class TestDurationFitness: def test_bell_curve_peak(self): @@ -242,3 +309,213 @@ class TestVideoTypeWeight: def test_none_default(self): assert _video_type_weight(None) == 0.5 + + +# ── Audio proxy function tests ─────────────────────────────────────────────── + + +class TestExtractWordTimings: + def test_filters_by_time_window(self): + words = _make_word_timings(start=0.0, count=40, wps=4.0) + segments = _make_transcript_segments(words) + # Extract window 2.0–5.0s + result = extract_word_timings(segments, start_time=2.0, end_time=5.0) + for w in result: + assert 2.0 <= w["start"] <= 5.0 + + def test_returns_all_when_window_covers_entire_range(self): + words = _make_word_timings(start=0.0, count=20, wps=4.0) + segments = _make_transcript_segments(words) + result = extract_word_timings(segments, start_time=0.0, end_time=100.0) + assert len(result) == 20 + + def test_empty_transcript_data(self): + assert extract_word_timings([], start_time=0.0, end_time=10.0) == [] + + def test_no_words_in_window(self): + words = _make_word_timings(start=0.0, count=10, wps=4.0) + segments = _make_transcript_segments(words) + # Window far beyond the word timings + result = extract_word_timings(segments, start_time=100.0, end_time=200.0) + assert result == [] + + def test_segments_without_words_key(self): + """Segments missing 'words' are skipped gracefully.""" + segments = [{"text": "hello"}, {"words": [{"start": 1.0, "end": 1.2, "word": "a"}]}] + result = extract_word_timings(segments, start_time=0.0, end_time=10.0) + assert len(result) == 1 + + def test_words_without_start_are_skipped(self): + segments = [{"words": [{"end": 1.2, "word": "a"}, {"start": 2.0, "end": 2.2, "word": "b"}]}] + result = extract_word_timings(segments, start_time=0.0, end_time=10.0) + assert len(result) == 1 + assert result[0]["word"] == "b" + + +class TestSpeechRateVariance: + def test_none_returns_neutral(self): + assert _speech_rate_variance(None) == 0.5 + + def test_too_few_words_returns_neutral(self): + timings = _make_word_timings(count=3, wps=4.0) + assert _speech_rate_variance(timings) == 0.5 + + def test_short_span_returns_neutral(self): + """Words spanning <5s should return neutral.""" + timings = _make_word_timings(count=10, wps=4.0, start=0.0) + # 10 words at 4 WPS = 2.5s span → too short + assert _speech_rate_variance(timings) == 0.5 + + def test_uniform_pace_scores_low(self): + """Steady 4 WPS for 30s → low variance.""" + timings = _make_word_timings(start=0.0, count=120, wps=4.0) + score = _speech_rate_variance(timings) + assert score < 0.4, f"Uniform pace scored {score}, expected < 0.4" + + def test_varied_pace_scores_higher(self): + """Alternating fast/slow sections → higher variance.""" + timings = [] + t = 0.0 + # Fast section: 6 WPS for 10s + for i in range(60): + dur = 0.12 + timings.append({"word": f"w{i}", "start": t, "end": t + dur}) + t += 1.0 / 6.0 + # Slow section: 2 WPS for 10s + for i in range(20): + dur = 0.3 + timings.append({"word": f"w{60+i}", "start": t, "end": t + dur}) + t += 0.5 + score = _speech_rate_variance(timings) + uniform_score = _speech_rate_variance( + _make_word_timings(start=0.0, count=80, wps=4.0) + ) + assert score > uniform_score, ( + f"Varied pace ({score:.3f}) should be > uniform ({uniform_score:.3f})" + ) + + def test_score_bounded(self): + timings = _make_word_timings(start=0.0, count=200, wps=4.0) + score = _speech_rate_variance(timings) + assert 0.0 <= score <= 1.0 + + +class TestPauseDensity: + def test_none_returns_neutral(self): + assert _pause_density(None) == 0.5 + + def test_single_word_returns_neutral(self): + assert _pause_density([{"start": 0.0, "end": 0.2}]) == 0.5 + + def test_no_pauses_scores_zero(self): + """Continuous speech with no gaps >0.5s → 0.""" + timings = _make_word_timings(start=0.0, count=60, wps=4.0) + score = _pause_density(timings) + assert score == 0.0 + + def test_frequent_pauses_scores_high(self): + """Pauses every 5 words → high density.""" + timings = _make_word_timings(start=0.0, count=60, wps=4.0, pause_every=5, pause_duration=0.8) + score = _pause_density(timings) + assert score > 0.5, f"Frequent pauses scored {score}, expected > 0.5" + + def test_long_pauses_weighted_more(self): + """One 1.5s pause should score higher than one 0.6s pause in a longer segment.""" + # Build timings with one long pause at midpoint — 60 words for longer duration + long_pause = [] + t = 0.0 + for i in range(60): + long_pause.append({"word": f"w{i}", "start": t, "end": t + 0.15}) + t += 0.25 + if i == 29: + t += 1.5 # long pause >1.0s + # Build timings with one short pause — same word count + short_pause = [] + t = 0.0 + for i in range(60): + short_pause.append({"word": f"w{i}", "start": t, "end": t + 0.15}) + t += 0.25 + if i == 29: + t += 0.6 # short pause >0.5s but <1.0s + assert _pause_density(long_pause) > _pause_density(short_pause) + + def test_score_bounded(self): + timings = _make_word_timings(start=0.0, count=60, wps=4.0, pause_every=3, pause_duration=1.5) + score = _pause_density(timings) + assert 0.0 <= score <= 1.0 + + +class TestSpeakingPaceFitness: + def test_none_returns_neutral(self): + assert _speaking_pace_fitness(None) == 0.5 + + def test_single_word_returns_neutral(self): + assert _speaking_pace_fitness([{"start": 0.0, "end": 0.2}]) == 0.5 + + def test_optimal_pace_scores_high(self): + """4 WPS (optimal teaching pace) → 1.0.""" + timings = _make_word_timings(start=0.0, count=40, wps=4.0) + score = _speaking_pace_fitness(timings) + assert score == 1.0, f"4 WPS scored {score}, expected 1.0" + + def test_three_wps_is_sweet_spot_edge(self): + timings = _make_word_timings(start=0.0, count=30, wps=3.0) + score = _speaking_pace_fitness(timings) + assert score == 1.0 + + def test_five_wps_is_sweet_spot_edge(self): + timings = _make_word_timings(start=0.0, count=50, wps=5.0) + score = _speaking_pace_fitness(timings) + assert score > 0.95, f"5 WPS scored {score}, expected near 1.0" + + def test_too_slow_scores_lower(self): + """1.5 WPS → below sweet spot.""" + timings = _make_word_timings(start=0.0, count=15, wps=1.5) + score = _speaking_pace_fitness(timings) + assert 0.4 < score < 0.6, f"1.5 WPS scored {score}, expected ~0.5" + + def test_too_fast_scores_lower(self): + """8 WPS → above sweet spot.""" + timings = _make_word_timings(start=0.0, count=80, wps=8.0) + score = _speaking_pace_fitness(timings) + assert 0.0 < score < 1.0 + + def test_very_fast_scores_zero(self): + """10+ WPS → 0.""" + timings = _make_word_timings(start=0.0, count=110, wps=11.0) + score = _speaking_pace_fitness(timings) + assert score == 0.0 + + def test_zero_wps_scores_zero(self): + """Very short duration → neutral.""" + timings = [{"start": 0.0, "end": 0.01}, {"start": 0.005, "end": 0.015}] + score = _speaking_pace_fitness(timings) + # Duration ~0.015s → too short → 0.5 (neutral) + assert score == 0.5 + + def test_score_bounded(self): + for wps in [0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0]: + timings = _make_word_timings(start=0.0, count=max(10, int(wps * 10)), wps=wps) + score = _speaking_pace_fitness(timings) + assert 0.0 <= score <= 1.0, f"WPS {wps} scored {score} out of bounds" + + +class TestBackwardCompatibility: + """Ensure the weight rebalancing doesn't break existing relative orderings.""" + + def test_ideal_still_beats_poor(self): + ideal = score_moment(**_ideal_moment()) + poor = score_moment(**_poor_moment()) + assert ideal["score"] > poor["score"] + + def test_ideal_still_above_threshold(self): + result = score_moment(**_ideal_moment()) + assert result["score"] > 0.6, f"Ideal scored {result['score']}, expected > 0.6" + + def test_poor_still_below_threshold(self): + result = score_moment(**_poor_moment()) + assert result["score"] < 0.45, f"Poor scored {result['score']}, expected < 0.45" + + def test_weights_sum_to_one(self): + from backend.pipeline.highlight_scorer import _WEIGHTS + assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9