# Source listing metadata: "backend/pipeline/highlight_scorer.py",
# "backend/pipeline/highlight_schemas.py", "backend/pipeline/test_highlight_scorer.py"
# GSD-Task: S05/T01
# 521 lines, 20 KiB, Python
"""Tests for the highlight scoring engine.

Verifies heuristic scoring produces sensible orderings and handles
edge cases gracefully.
"""

from __future__ import annotations

import pytest

from backend.pipeline.highlight_scorer import (
    _content_type_weight,
    _duration_fitness,
    _pause_density,
    _plugin_richness,
    _source_quality_weight,
    _speaking_pace_fitness,
    _specificity_density,
    _speech_rate_variance,
    _transcript_energy,
    _video_type_weight,
    extract_word_timings,
    score_moment,
)


# ── Fixture helpers ──────────────────────────────────────────────────────────

def _ideal_moment() -> dict:
|
||
"""45s technique moment, 3 plugins, specific summary, structured source."""
|
||
return dict(
|
||
start_time=10.0,
|
||
end_time=55.0, # 45s duration
|
||
content_type="technique",
|
||
summary=(
|
||
"Set the compressor threshold to -18 dB with a 4:1 ratio, "
|
||
"then boost the high shelf at 12 kHz by 3.5 dB using FabFilter Pro-Q 3."
|
||
),
|
||
plugins=["FabFilter Pro-Q 3", "SSL G-Bus Compressor", "Valhalla Room"],
|
||
raw_transcript=(
|
||
"The trick is to set the threshold low enough. Notice how "
|
||
"the compressor grabs the transients. Because we want to preserve "
|
||
"the dynamics, I always back off the ratio. The key is finding "
|
||
"that sweet spot where it's controlling but not squashing."
|
||
),
|
||
source_quality="structured",
|
||
video_content_type="tutorial",
|
||
)
|
||
|
||
|
||
def _mediocre_moment() -> dict:
|
||
"""90s settings moment, 1 plugin, decent summary, mixed source."""
|
||
return dict(
|
||
start_time=120.0,
|
||
end_time=210.0, # 90s duration
|
||
content_type="settings",
|
||
summary="Adjust the EQ settings for the vocal track to get a clearer sound.",
|
||
plugins=["FabFilter Pro-Q 3"],
|
||
raw_transcript=(
|
||
"So here we're just going to adjust this. I think it sounds "
|
||
"better when we cut some of the low end. Let me show you what "
|
||
"I mean. Yeah, that's better."
|
||
),
|
||
source_quality="mixed",
|
||
video_content_type="breakdown",
|
||
)
|
||
|
||
|
||
def _poor_moment() -> dict:
|
||
"""300s reasoning moment, 0 plugins, vague summary, unstructured source."""
|
||
return dict(
|
||
start_time=0.0,
|
||
end_time=300.0, # 300s duration → zero for duration_fitness
|
||
content_type="reasoning",
|
||
summary="General discussion about mixing philosophy and approach.",
|
||
plugins=[],
|
||
raw_transcript=(
|
||
"I think mixing is really about taste. Everyone has their own "
|
||
"approach. Some people like it loud, some people like it quiet. "
|
||
"There's no right or wrong way to do it really."
|
||
),
|
||
source_quality="unstructured",
|
||
video_content_type="livestream",
|
||
)
|
||
|
||
|
||
def _make_word_timings(
|
||
start: float = 0.0,
|
||
count: int = 40,
|
||
wps: float = 4.0,
|
||
pause_every: int | None = None,
|
||
pause_duration: float = 0.8,
|
||
) -> list[dict]:
|
||
"""Generate synthetic word-timing dicts for testing.
|
||
|
||
Parameters
|
||
----------
|
||
start : float
|
||
Start time in seconds.
|
||
count : int
|
||
Number of words to generate.
|
||
wps : float
|
||
Words per second (base rate).
|
||
pause_every : int | None
|
||
Insert a pause every N words. None = no pauses.
|
||
pause_duration : float
|
||
Duration of each pause in seconds.
|
||
"""
|
||
timings = []
|
||
t = start
|
||
word_dur = 1.0 / wps * 0.7 # 70% speaking, 30% normal gap
|
||
gap = 1.0 / wps * 0.3
|
||
|
||
for i in range(count):
|
||
timings.append({"word": f"word{i}", "start": t, "end": t + word_dur})
|
||
t += word_dur + gap
|
||
if pause_every and (i + 1) % pause_every == 0:
|
||
t += pause_duration
|
||
return timings
|
||
|
||
|
||
def _make_transcript_segments(word_timings: list[dict], words_per_segment: int = 10) -> list[dict]:
|
||
"""Group word timings into transcript segments for extract_word_timings tests."""
|
||
segments = []
|
||
for i in range(0, len(word_timings), words_per_segment):
|
||
chunk = word_timings[i : i + words_per_segment]
|
||
segments.append({"words": chunk})
|
||
return segments
|
||
|
||
|
||
# ── Tests ────────────────────────────────────────────────────────────────────

class TestScoreMoment:
    """Tests for the composite ``score_moment`` result: score, bounds, and breakdown."""

    def test_ideal_moment_scores_high(self):
        result = score_moment(**_ideal_moment())
        assert result["score"] > 0.7, f"Ideal moment scored {result['score']}, expected > 0.7"

    def test_poor_moment_scores_low(self):
        result = score_moment(**_poor_moment())
        assert result["score"] < 0.4, f"Poor moment scored {result['score']}, expected < 0.4"

    def test_ordering_is_sensible(self):
        ideal = score_moment(**_ideal_moment())
        mediocre = score_moment(**_mediocre_moment())
        poor = score_moment(**_poor_moment())

        assert ideal["score"] > mediocre["score"] > poor["score"], (
            f"Expected ideal ({ideal['score']:.3f}) > "
            f"mediocre ({mediocre['score']:.3f}) > "
            f"poor ({poor['score']:.3f})"
        )

    def test_score_bounds(self):
        """All scores in [0.0, 1.0] for edge cases."""
        edge_cases = [
            dict(start_time=0, end_time=0, summary="", plugins=None, raw_transcript=None),
            dict(start_time=0, end_time=500, summary=None, plugins=[], raw_transcript=""),
            dict(start_time=0, end_time=45, summary="x" * 10000, plugins=["a"] * 100),
            dict(start_time=100, end_time=100),  # zero duration
        ]
        for kwargs in edge_cases:
            result = score_moment(**kwargs)
            assert 0.0 <= result["score"] <= 1.0, f"Score {result['score']} out of bounds for {kwargs}"
            for dim, val in result["score_breakdown"].items():
                assert 0.0 <= val <= 1.0, f"{dim}={val} out of bounds for {kwargs}"

    def test_missing_optional_fields(self):
        """None raw_transcript and None plugins don't crash."""
        result = score_moment(
            start_time=10.0,
            end_time=55.0,
            content_type="technique",
            summary="A summary.",
            plugins=None,
            raw_transcript=None,
            source_quality=None,
            video_content_type=None,
        )
        assert 0.0 <= result["score"] <= 1.0
        assert result["duration_secs"] == 45.0
        assert len(result["score_breakdown"]) == 10

    def test_returns_duration_secs(self):
        result = score_moment(start_time=10.0, end_time=55.0)
        assert result["duration_secs"] == 45.0

    def test_breakdown_has_ten_dimensions(self):
        result = score_moment(**_ideal_moment())
        assert len(result["score_breakdown"]) == 10
        expected_keys = {
            "duration_score", "content_density_score", "technique_relevance_score",
            "plugin_diversity_score", "engagement_proxy_score", "position_score",
            "uniqueness_score", "speech_rate_variance_score", "pause_density_score",
            "speaking_pace_score",
        }
        assert set(result["score_breakdown"].keys()) == expected_keys

    def test_without_word_timings_audio_dims_are_neutral(self):
        """When word_timings is None, audio proxy dimensions score 0.5."""
        result = score_moment(start_time=10.0, end_time=55.0)
        bd = result["score_breakdown"]
        assert bd["speech_rate_variance_score"] == 0.5
        assert bd["pause_density_score"] == 0.5
        assert bd["speaking_pace_score"] == 0.5

    def test_with_word_timings_changes_score(self):
        """Providing word_timings should shift the composite score vs without."""
        base = _ideal_moment()
        without = score_moment(**base)
        # Add word timings at a good teaching pace (~4 WPS) with some pauses
        timings = _make_word_timings(start=10.0, count=120, wps=4.0, pause_every=15)
        with_timings = score_moment(**base, word_timings=timings)
        # Scores should differ since audio dims are no longer neutral
        assert with_timings["score"] != without["score"]


class TestDurationFitness:
    """Tests for ``_duration_fitness`` across the peak, sweet spot, and extremes."""

    def test_bell_curve_peak(self):
        """45s scores higher than 10s, 10s scores higher than 400s."""
        assert _duration_fitness(45) > _duration_fitness(10)
        assert _duration_fitness(10) > _duration_fitness(400)

    def test_sweet_spot(self):
        assert _duration_fitness(30) == 1.0
        assert _duration_fitness(45) == 1.0
        assert _duration_fitness(60) == 1.0

    def test_zero_at_extremes(self):
        assert _duration_fitness(0) == 0.0
        assert _duration_fitness(300) == 0.0
        assert _duration_fitness(500) == 0.0

    def test_negative_duration(self):
        assert _duration_fitness(-10) == 0.0


class TestContentTypeWeight:
    """Tests for ``_content_type_weight`` on known, unknown, and missing types."""

    def test_technique_highest(self):
        assert _content_type_weight("technique") == 1.0

    def test_reasoning_lowest_known(self):
        assert _content_type_weight("reasoning") == 0.4

    def test_unknown_gets_default(self):
        assert _content_type_weight("unknown") == 0.5
        assert _content_type_weight(None) == 0.5


class TestSpecificityDensity:
    """Tests for ``_specificity_density`` on specific, vague, and empty summaries."""

    def test_specific_summary_scores_high(self):
        summary = "Set threshold to -18 dB with 4:1 ratio, boost 12 kHz by 3.5 dB"
        score = _specificity_density(summary)
        assert score > 0.5

    def test_vague_summary_scores_low(self):
        score = _specificity_density("General discussion about mixing philosophy.")
        assert score < 0.3

    def test_empty_returns_zero(self):
        assert _specificity_density("") == 0.0
        assert _specificity_density(None) == 0.0


class TestPluginRichness:
    """Tests for ``_plugin_richness`` with three, more than three, and no plugins."""

    def test_three_plugins_maxes_out(self):
        assert _plugin_richness(["a", "b", "c"]) == 1.0

    def test_more_than_three_capped(self):
        assert _plugin_richness(["a", "b", "c", "d"]) == 1.0

    def test_empty(self):
        assert _plugin_richness([]) == 0.0
        assert _plugin_richness(None) == 0.0


class TestTranscriptEnergy:
    """Tests for ``_transcript_energy`` on teaching-style, bland, and empty transcripts."""

    def test_teaching_phrases_score_high(self):
        transcript = (
            "The trick is to notice how the compressor behaves. "
            "Because we want dynamics, I always set it gently. The key is balance."
        )
        score = _transcript_energy(transcript)
        assert score > 0.5

    def test_bland_transcript_scores_low(self):
        transcript = "And then we adjust this slider here. Okay that sounds fine."
        score = _transcript_energy(transcript)
        assert score < 0.3

    def test_empty(self):
        assert _transcript_energy("") == 0.0
        assert _transcript_energy(None) == 0.0


class TestSourceQualityWeight:
    """Tests for ``_source_quality_weight`` on "structured" and on None."""

    def test_structured_highest(self):
        assert _source_quality_weight("structured") == 1.0

    def test_none_default(self):
        assert _source_quality_weight(None) == 0.5


class TestVideoTypeWeight:
    """Tests for ``_video_type_weight`` on "tutorial", "short_form", and None."""

    def test_tutorial_highest(self):
        assert _video_type_weight("tutorial") == 1.0

    def test_short_form_lowest(self):
        assert _video_type_weight("short_form") == 0.3

    def test_none_default(self):
        assert _video_type_weight(None) == 0.5


# ── Audio proxy function tests ───────────────────────────────────────────────


class TestExtractWordTimings:
    """Tests for ``extract_word_timings``: window filtering and malformed segments."""

    def test_filters_by_time_window(self):
        words = _make_word_timings(start=0.0, count=40, wps=4.0)
        segments = _make_transcript_segments(words)
        # Extract window 2.0–5.0s
        result = extract_word_timings(segments, start_time=2.0, end_time=5.0)
        # The synthetic words run from 0s to just under 10s, so the window must
        # contain some; without this check the loop below passes on an empty list.
        assert result, "Expected at least one word inside the 2.0–5.0s window"
        for w in result:
            assert 2.0 <= w["start"] <= 5.0

    def test_returns_all_when_window_covers_entire_range(self):
        words = _make_word_timings(start=0.0, count=20, wps=4.0)
        segments = _make_transcript_segments(words)
        result = extract_word_timings(segments, start_time=0.0, end_time=100.0)
        assert len(result) == 20

    def test_empty_transcript_data(self):
        assert extract_word_timings([], start_time=0.0, end_time=10.0) == []

    def test_no_words_in_window(self):
        words = _make_word_timings(start=0.0, count=10, wps=4.0)
        segments = _make_transcript_segments(words)
        # Window far beyond the word timings
        result = extract_word_timings(segments, start_time=100.0, end_time=200.0)
        assert result == []

    def test_segments_without_words_key(self):
        """Segments missing 'words' are skipped gracefully."""
        segments = [{"text": "hello"}, {"words": [{"start": 1.0, "end": 1.2, "word": "a"}]}]
        result = extract_word_timings(segments, start_time=0.0, end_time=10.0)
        assert len(result) == 1

    def test_words_without_start_are_skipped(self):
        segments = [{"words": [{"end": 1.2, "word": "a"}, {"start": 2.0, "end": 2.2, "word": "b"}]}]
        result = extract_word_timings(segments, start_time=0.0, end_time=10.0)
        assert len(result) == 1
        assert result[0]["word"] == "b"


class TestSpeechRateVariance:
    """Tests for ``_speech_rate_variance``: neutral fallbacks, uniform vs varied pace."""

    def test_none_returns_neutral(self):
        assert _speech_rate_variance(None) == 0.5

    def test_too_few_words_returns_neutral(self):
        timings = _make_word_timings(count=3, wps=4.0)
        assert _speech_rate_variance(timings) == 0.5

    def test_short_span_returns_neutral(self):
        """Words spanning <5s should return neutral."""
        timings = _make_word_timings(count=10, wps=4.0, start=0.0)
        # 10 words at 4 WPS = 2.5s span → too short
        assert _speech_rate_variance(timings) == 0.5

    def test_uniform_pace_scores_low(self):
        """Steady 4 WPS for 30s → low variance."""
        timings = _make_word_timings(start=0.0, count=120, wps=4.0)
        score = _speech_rate_variance(timings)
        assert score < 0.4, f"Uniform pace scored {score}, expected < 0.4"

    def test_varied_pace_scores_higher(self):
        """Alternating fast/slow sections → higher variance."""
        timings = []
        t = 0.0
        # Fast section: 6 WPS for 10s
        for i in range(60):
            dur = 0.12
            timings.append({"word": f"w{i}", "start": t, "end": t + dur})
            t += 1.0 / 6.0
        # Slow section: 2 WPS for 10s
        for i in range(20):
            dur = 0.3
            timings.append({"word": f"w{60+i}", "start": t, "end": t + dur})
            t += 0.5
        score = _speech_rate_variance(timings)
        uniform_score = _speech_rate_variance(
            _make_word_timings(start=0.0, count=80, wps=4.0)
        )
        assert score > uniform_score, (
            f"Varied pace ({score:.3f}) should be > uniform ({uniform_score:.3f})"
        )

    def test_score_bounded(self):
        timings = _make_word_timings(start=0.0, count=200, wps=4.0)
        score = _speech_rate_variance(timings)
        assert 0.0 <= score <= 1.0


class TestPauseDensity:
    """Tests for ``_pause_density``: neutral fallbacks, pause frequency, and pause length."""

    def test_none_returns_neutral(self):
        assert _pause_density(None) == 0.5

    def test_single_word_returns_neutral(self):
        assert _pause_density([{"start": 0.0, "end": 0.2}]) == 0.5

    def test_no_pauses_scores_zero(self):
        """Continuous speech with no gaps >0.5s → 0."""
        timings = _make_word_timings(start=0.0, count=60, wps=4.0)
        score = _pause_density(timings)
        assert score == 0.0

    def test_frequent_pauses_scores_high(self):
        """Pauses every 5 words → high density."""
        timings = _make_word_timings(start=0.0, count=60, wps=4.0, pause_every=5, pause_duration=0.8)
        score = _pause_density(timings)
        assert score > 0.5, f"Frequent pauses scored {score}, expected > 0.5"

    def test_long_pauses_weighted_more(self):
        """One 1.5s pause should score higher than one 0.6s pause in a longer segment."""
        # Build timings with one long pause at midpoint — 60 words for longer duration
        long_pause = []
        t = 0.0
        for i in range(60):
            long_pause.append({"word": f"w{i}", "start": t, "end": t + 0.15})
            t += 0.25
            if i == 29:
                t += 1.5  # long pause >1.0s
        # Build timings with one short pause — same word count
        short_pause = []
        t = 0.0
        for i in range(60):
            short_pause.append({"word": f"w{i}", "start": t, "end": t + 0.15})
            t += 0.25
            if i == 29:
                t += 0.6  # short pause >0.5s but <1.0s
        assert _pause_density(long_pause) > _pause_density(short_pause)

    def test_score_bounded(self):
        timings = _make_word_timings(start=0.0, count=60, wps=4.0, pause_every=3, pause_duration=1.5)
        score = _pause_density(timings)
        assert 0.0 <= score <= 1.0


class TestSpeakingPaceFitness:
    """Tests for ``_speaking_pace_fitness`` across slow, optimal, and fast word rates."""

    def test_none_returns_neutral(self):
        assert _speaking_pace_fitness(None) == 0.5

    def test_single_word_returns_neutral(self):
        assert _speaking_pace_fitness([{"start": 0.0, "end": 0.2}]) == 0.5

    def test_optimal_pace_scores_high(self):
        """4 WPS (optimal teaching pace) → 1.0."""
        timings = _make_word_timings(start=0.0, count=40, wps=4.0)
        score = _speaking_pace_fitness(timings)
        assert score == 1.0, f"4 WPS scored {score}, expected 1.0"

    def test_three_wps_is_sweet_spot_edge(self):
        timings = _make_word_timings(start=0.0, count=30, wps=3.0)
        score = _speaking_pace_fitness(timings)
        assert score == 1.0

    def test_five_wps_is_sweet_spot_edge(self):
        timings = _make_word_timings(start=0.0, count=50, wps=5.0)
        score = _speaking_pace_fitness(timings)
        assert score > 0.95, f"5 WPS scored {score}, expected near 1.0"

    def test_too_slow_scores_lower(self):
        """1.5 WPS → below sweet spot."""
        timings = _make_word_timings(start=0.0, count=15, wps=1.5)
        score = _speaking_pace_fitness(timings)
        assert 0.4 < score < 0.6, f"1.5 WPS scored {score}, expected ~0.5"

    def test_too_fast_scores_lower(self):
        """8 WPS → above sweet spot."""
        timings = _make_word_timings(start=0.0, count=80, wps=8.0)
        score = _speaking_pace_fitness(timings)
        assert 0.0 < score < 1.0

    def test_very_fast_scores_zero(self):
        """10+ WPS → 0."""
        timings = _make_word_timings(start=0.0, count=110, wps=11.0)
        score = _speaking_pace_fitness(timings)
        assert score == 0.0

    def test_zero_wps_scores_zero(self):
        """Very short duration → neutral.

        Despite the test name, the expected result is the neutral 0.5, not 0.0.
        """
        timings = [{"start": 0.0, "end": 0.01}, {"start": 0.005, "end": 0.015}]
        score = _speaking_pace_fitness(timings)
        # Duration ~0.015s → too short → 0.5 (neutral)
        assert score == 0.5

    def test_score_bounded(self):
        for wps in [0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0]:
            timings = _make_word_timings(start=0.0, count=max(10, int(wps * 10)), wps=wps)
            score = _speaking_pace_fitness(timings)
            assert 0.0 <= score <= 1.0, f"WPS {wps} scored {score} out of bounds"


class TestBackwardCompatibility:
    """Ensure the weight rebalancing doesn't break existing relative orderings."""

    def test_ideal_still_beats_poor(self):
        ideal = score_moment(**_ideal_moment())
        poor = score_moment(**_poor_moment())
        assert ideal["score"] > poor["score"]

    def test_ideal_still_above_threshold(self):
        result = score_moment(**_ideal_moment())
        assert result["score"] > 0.6, f"Ideal scored {result['score']}, expected > 0.6"

    def test_poor_still_below_threshold(self):
        result = score_moment(**_poor_moment())
        assert result["score"] < 0.45, f"Poor scored {result['score']}, expected < 0.45"

    def test_weights_sum_to_one(self):
        # Imported here because _WEIGHTS is not in the module-level import list.
        from backend.pipeline.highlight_scorer import _WEIGHTS

        assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9