# --- Scraped repository header (not Python; preserved as comments) ----------
# File: chrysopedia/backend/pipeline/test_highlight_scorer.py
# Commit: jlightner 27c5f4866b
#   test: Added 3 audio proxy scoring functions, extract_word_timings utility
#   - "backend/pipeline/highlight_scorer.py"
#   - "backend/pipeline/highlight_schemas.py"
#   - "backend/pipeline/test_highlight_scorer.py"
#
# GSD-Task: S05/T01
# Date: 2026-04-04 08:05:22 +00:00
# 521 lines, 20 KiB, Python

"""Tests for the highlight scoring engine.
Verifies heuristic scoring produces sensible orderings and handles
edge cases gracefully.
"""
from __future__ import annotations
import pytest
from backend.pipeline.highlight_scorer import (
_content_type_weight,
_duration_fitness,
_pause_density,
_plugin_richness,
_source_quality_weight,
_speaking_pace_fitness,
_specificity_density,
_speech_rate_variance,
_transcript_energy,
_video_type_weight,
extract_word_timings,
score_moment,
)
# ── Fixture helpers ──────────────────────────────────────────────────────────
def _ideal_moment() -> dict:
"""45s technique moment, 3 plugins, specific summary, structured source."""
return dict(
start_time=10.0,
end_time=55.0, # 45s duration
content_type="technique",
summary=(
"Set the compressor threshold to -18 dB with a 4:1 ratio, "
"then boost the high shelf at 12 kHz by 3.5 dB using FabFilter Pro-Q 3."
),
plugins=["FabFilter Pro-Q 3", "SSL G-Bus Compressor", "Valhalla Room"],
raw_transcript=(
"The trick is to set the threshold low enough. Notice how "
"the compressor grabs the transients. Because we want to preserve "
"the dynamics, I always back off the ratio. The key is finding "
"that sweet spot where it's controlling but not squashing."
),
source_quality="structured",
video_content_type="tutorial",
)
def _mediocre_moment() -> dict:
"""90s settings moment, 1 plugin, decent summary, mixed source."""
return dict(
start_time=120.0,
end_time=210.0, # 90s duration
content_type="settings",
summary="Adjust the EQ settings for the vocal track to get a clearer sound.",
plugins=["FabFilter Pro-Q 3"],
raw_transcript=(
"So here we're just going to adjust this. I think it sounds "
"better when we cut some of the low end. Let me show you what "
"I mean. Yeah, that's better."
),
source_quality="mixed",
video_content_type="breakdown",
)
def _poor_moment() -> dict:
"""300s reasoning moment, 0 plugins, vague summary, unstructured source."""
return dict(
start_time=0.0,
end_time=300.0, # 300s duration → zero for duration_fitness
content_type="reasoning",
summary="General discussion about mixing philosophy and approach.",
plugins=[],
raw_transcript=(
"I think mixing is really about taste. Everyone has their own "
"approach. Some people like it loud, some people like it quiet. "
"There's no right or wrong way to do it really."
),
source_quality="unstructured",
video_content_type="livestream",
)
def _make_word_timings(
start: float = 0.0,
count: int = 40,
wps: float = 4.0,
pause_every: int | None = None,
pause_duration: float = 0.8,
) -> list[dict]:
"""Generate synthetic word-timing dicts for testing.
Parameters
----------
start : float
Start time in seconds.
count : int
Number of words to generate.
wps : float
Words per second (base rate).
pause_every : int | None
Insert a pause every N words. None = no pauses.
pause_duration : float
Duration of each pause in seconds.
"""
timings = []
t = start
word_dur = 1.0 / wps * 0.7 # 70% speaking, 30% normal gap
gap = 1.0 / wps * 0.3
for i in range(count):
timings.append({"word": f"word{i}", "start": t, "end": t + word_dur})
t += word_dur + gap
if pause_every and (i + 1) % pause_every == 0:
t += pause_duration
return timings
def _make_transcript_segments(word_timings: list[dict], words_per_segment: int = 10) -> list[dict]:
"""Group word timings into transcript segments for extract_word_timings tests."""
segments = []
for i in range(0, len(word_timings), words_per_segment):
chunk = word_timings[i : i + words_per_segment]
segments.append({"words": chunk})
return segments
# ── Tests ────────────────────────────────────────────────────────────────────
class TestScoreMoment:
    """Composite score_moment behaviour: ordering, bounds, and resilience to
    missing optional inputs."""

    def test_ideal_moment_scores_high(self):
        result = score_moment(**_ideal_moment())
        assert result["score"] > 0.7, f"Ideal moment scored {result['score']}, expected > 0.7"

    def test_poor_moment_scores_low(self):
        result = score_moment(**_poor_moment())
        assert result["score"] < 0.4, f"Poor moment scored {result['score']}, expected < 0.4"

    def test_ordering_is_sensible(self):
        ideal, mediocre, poor = (
            score_moment(**fixture())
            for fixture in (_ideal_moment, _mediocre_moment, _poor_moment)
        )
        assert ideal["score"] > mediocre["score"] > poor["score"], (
            f"Expected ideal ({ideal['score']:.3f}) > "
            f"mediocre ({mediocre['score']:.3f}) > "
            f"poor ({poor['score']:.3f})"
        )

    def test_score_bounds(self):
        """All scores in [0.0, 1.0] for edge cases."""
        edge_cases = [
            dict(start_time=0, end_time=0, summary="", plugins=None, raw_transcript=None),
            dict(start_time=0, end_time=500, summary=None, plugins=[], raw_transcript=""),
            dict(start_time=0, end_time=45, summary="x" * 10000, plugins=["a"] * 100),
            dict(start_time=100, end_time=100),  # zero duration
        ]
        for kwargs in edge_cases:
            result = score_moment(**kwargs)
            assert 0.0 <= result["score"] <= 1.0, f"Score {result['score']} out of bounds for {kwargs}"
            for dim, val in result["score_breakdown"].items():
                assert 0.0 <= val <= 1.0, f"{dim}={val} out of bounds for {kwargs}"

    def test_missing_optional_fields(self):
        """None raw_transcript and None plugins don't crash."""
        result = score_moment(
            start_time=10.0,
            end_time=55.0,
            content_type="technique",
            summary="A summary.",
            plugins=None,
            raw_transcript=None,
            source_quality=None,
            video_content_type=None,
        )
        assert 0.0 <= result["score"] <= 1.0
        assert result["duration_secs"] == 45.0
        assert len(result["score_breakdown"]) == 10

    def test_returns_duration_secs(self):
        result = score_moment(start_time=10.0, end_time=55.0)
        assert result["duration_secs"] == 45.0

    def test_breakdown_has_ten_dimensions(self):
        breakdown = score_moment(**_ideal_moment())["score_breakdown"]
        assert len(breakdown) == 10
        expected_keys = {
            "duration_score", "content_density_score", "technique_relevance_score",
            "plugin_diversity_score", "engagement_proxy_score", "position_score",
            "uniqueness_score", "speech_rate_variance_score", "pause_density_score",
            "speaking_pace_score",
        }
        assert set(breakdown) == expected_keys

    def test_without_word_timings_audio_dims_are_neutral(self):
        """When word_timings is None, audio proxy dimensions score 0.5."""
        bd = score_moment(start_time=10.0, end_time=55.0)["score_breakdown"]
        for dim in ("speech_rate_variance_score", "pause_density_score", "speaking_pace_score"):
            assert bd[dim] == 0.5

    def test_with_word_timings_changes_score(self):
        """Providing word_timings should shift the composite score vs without."""
        kwargs = _ideal_moment()
        without = score_moment(**kwargs)
        # Good teaching pace (~4 WPS) with periodic pauses
        timings = _make_word_timings(start=10.0, count=120, wps=4.0, pause_every=15)
        with_timings = score_moment(**kwargs, word_timings=timings)
        # Audio dims are no longer neutral, so the composite must move
        assert with_timings["score"] != without["score"]
class TestDurationFitness:
    """Bell-curve duration scoring: 30-60s sweet spot, zero at the extremes."""

    def test_bell_curve_peak(self):
        """45s scores higher than 10s, 10s scores higher than 400s."""
        assert _duration_fitness(45) > _duration_fitness(10)
        assert _duration_fitness(10) > _duration_fitness(400)

    def test_sweet_spot(self):
        for secs in (30, 45, 60):
            assert _duration_fitness(secs) == 1.0

    def test_zero_at_extremes(self):
        for secs in (0, 300, 500):
            assert _duration_fitness(secs) == 0.0

    def test_negative_duration(self):
        assert _duration_fitness(-10) == 0.0
class TestContentTypeWeight:
    """Content-type multiplier lookup, with a default for unknown types."""

    def test_technique_highest(self):
        assert _content_type_weight("technique") == 1.0

    def test_reasoning_lowest_known(self):
        assert _content_type_weight("reasoning") == 0.4

    def test_unknown_gets_default(self):
        for value in ("unknown", None):
            assert _content_type_weight(value) == 0.5
class TestSpecificityDensity:
    """Density of concrete values (dB, ratios, frequencies) in the summary."""

    def test_specific_summary_scores_high(self):
        text = "Set threshold to -18 dB with 4:1 ratio, boost 12 kHz by 3.5 dB"
        assert _specificity_density(text) > 0.5

    def test_vague_summary_scores_low(self):
        assert _specificity_density("General discussion about mixing philosophy.") < 0.3

    def test_empty_returns_zero(self):
        for blank in ("", None):
            assert _specificity_density(blank) == 0.0
class TestPluginRichness:
    """Plugin count scoring, saturating at three plugins."""

    def test_three_plugins_maxes_out(self):
        assert _plugin_richness(["a", "b", "c"]) == 1.0

    def test_more_than_three_capped(self):
        assert _plugin_richness(["a", "b", "c", "d"]) == 1.0

    def test_empty(self):
        for empty in ([], None):
            assert _plugin_richness(empty) == 0.0
class TestTranscriptEnergy:
    """Teaching-phrase density in the raw transcript."""

    def test_teaching_phrases_score_high(self):
        transcript = (
            "The trick is to notice how the compressor behaves. "
            "Because we want dynamics, I always set it gently. The key is balance."
        )
        assert _transcript_energy(transcript) > 0.5

    def test_bland_transcript_scores_low(self):
        transcript = "And then we adjust this slider here. Okay that sounds fine."
        assert _transcript_energy(transcript) < 0.3

    def test_empty(self):
        for blank in ("", None):
            assert _transcript_energy(blank) == 0.0
class TestSourceQualityWeight:
    """Source-quality multiplier lookup."""

    def test_structured_highest(self):
        weight = _source_quality_weight("structured")
        assert weight == 1.0

    def test_none_default(self):
        weight = _source_quality_weight(None)
        assert weight == 0.5
class TestVideoTypeWeight:
    """Video content-type multiplier lookup."""

    def test_tutorial_highest(self):
        weight = _video_type_weight("tutorial")
        assert weight == 1.0

    def test_short_form_lowest(self):
        weight = _video_type_weight("short_form")
        assert weight == 0.3

    def test_none_default(self):
        weight = _video_type_weight(None)
        assert weight == 0.5
# ── Audio proxy function tests ───────────────────────────────────────────────
class TestExtractWordTimings:
    """Windowed extraction of word timings from segment-level transcript data."""

    def test_filters_by_time_window(self):
        segments = _make_transcript_segments(_make_word_timings(start=0.0, count=40, wps=4.0))
        # Extract the 2.0-5.0s window
        extracted = extract_word_timings(segments, start_time=2.0, end_time=5.0)
        assert all(2.0 <= w["start"] <= 5.0 for w in extracted)

    def test_returns_all_when_window_covers_entire_range(self):
        segments = _make_transcript_segments(_make_word_timings(start=0.0, count=20, wps=4.0))
        extracted = extract_word_timings(segments, start_time=0.0, end_time=100.0)
        assert len(extracted) == 20

    def test_empty_transcript_data(self):
        assert extract_word_timings([], start_time=0.0, end_time=10.0) == []

    def test_no_words_in_window(self):
        segments = _make_transcript_segments(_make_word_timings(start=0.0, count=10, wps=4.0))
        # Window lies entirely after the last word timing
        assert extract_word_timings(segments, start_time=100.0, end_time=200.0) == []

    def test_segments_without_words_key(self):
        """Segments missing 'words' are skipped gracefully."""
        segments = [{"text": "hello"}, {"words": [{"start": 1.0, "end": 1.2, "word": "a"}]}]
        extracted = extract_word_timings(segments, start_time=0.0, end_time=10.0)
        assert len(extracted) == 1

    def test_words_without_start_are_skipped(self):
        segments = [{"words": [{"end": 1.2, "word": "a"}, {"start": 2.0, "end": 2.2, "word": "b"}]}]
        extracted = extract_word_timings(segments, start_time=0.0, end_time=10.0)
        assert [w["word"] for w in extracted] == ["b"]
class TestSpeechRateVariance:
    """Speech-rate variance proxy: steady pace scores low, shifting pace higher;
    missing or too-short input falls back to the neutral 0.5."""

    def test_none_returns_neutral(self):
        assert _speech_rate_variance(None) == 0.5

    def test_too_few_words_returns_neutral(self):
        assert _speech_rate_variance(_make_word_timings(count=3, wps=4.0)) == 0.5

    def test_short_span_returns_neutral(self):
        """Words spanning <5s should return neutral."""
        # 10 words at 4 WPS = 2.5s span -> too short
        assert _speech_rate_variance(_make_word_timings(count=10, wps=4.0, start=0.0)) == 0.5

    def test_uniform_pace_scores_low(self):
        """Steady 4 WPS for 30s -> low variance."""
        score = _speech_rate_variance(_make_word_timings(start=0.0, count=120, wps=4.0))
        assert score < 0.4, f"Uniform pace scored {score}, expected < 0.4"

    def test_varied_pace_scores_higher(self):
        """Alternating fast/slow sections -> higher variance."""
        varied = []
        cursor = 0.0
        # Fast section: 6 WPS for 10s
        for i in range(60):
            varied.append({"word": f"w{i}", "start": cursor, "end": cursor + 0.12})
            cursor += 1.0 / 6.0
        # Slow section: 2 WPS for 10s
        for i in range(20):
            varied.append({"word": f"w{60+i}", "start": cursor, "end": cursor + 0.3})
            cursor += 0.5
        score = _speech_rate_variance(varied)
        uniform_score = _speech_rate_variance(
            _make_word_timings(start=0.0, count=80, wps=4.0)
        )
        assert score > uniform_score, (
            f"Varied pace ({score:.3f}) should be > uniform ({uniform_score:.3f})"
        )

    def test_score_bounded(self):
        score = _speech_rate_variance(_make_word_timings(start=0.0, count=200, wps=4.0))
        assert 0.0 <= score <= 1.0
class TestPauseDensity:
    """Pause-density proxy: inter-word gaps >0.5s raise the score, longer
    pauses weigh more; degenerate input falls back to the neutral 0.5."""

    def test_none_returns_neutral(self):
        assert _pause_density(None) == 0.5

    def test_single_word_returns_neutral(self):
        assert _pause_density([{"start": 0.0, "end": 0.2}]) == 0.5

    def test_no_pauses_scores_zero(self):
        """Continuous speech with no gaps >0.5s -> 0."""
        assert _pause_density(_make_word_timings(start=0.0, count=60, wps=4.0)) == 0.0

    def test_frequent_pauses_scores_high(self):
        """Pauses every 5 words -> high density."""
        timings = _make_word_timings(start=0.0, count=60, wps=4.0, pause_every=5, pause_duration=0.8)
        score = _pause_density(timings)
        assert score > 0.5, f"Frequent pauses scored {score}, expected > 0.5"

    def test_long_pauses_weighted_more(self):
        """One 1.5s pause should score higher than one 0.6s pause in a longer segment."""

        def one_pause(pause_len: float) -> list[dict]:
            # 60 words at 4 WPS with a single pause inserted after word 30
            words = []
            cursor = 0.0
            for i in range(60):
                words.append({"word": f"w{i}", "start": cursor, "end": cursor + 0.15})
                cursor += 0.25
                if i == 29:
                    cursor += pause_len
            return words

        # 1.5s is a "long" pause (>1.0s); 0.6s is short (>0.5s but <1.0s)
        assert _pause_density(one_pause(1.5)) > _pause_density(one_pause(0.6))

    def test_score_bounded(self):
        timings = _make_word_timings(start=0.0, count=60, wps=4.0, pause_every=3, pause_duration=1.5)
        assert 0.0 <= _pause_density(timings) <= 1.0
class TestSpeakingPaceFitness:
    """Words-per-second bell curve: ~3-5 WPS is the teaching sweet spot;
    degenerate input falls back to the neutral 0.5."""

    def test_none_returns_neutral(self):
        assert _speaking_pace_fitness(None) == 0.5

    def test_single_word_returns_neutral(self):
        assert _speaking_pace_fitness([{"start": 0.0, "end": 0.2}]) == 0.5

    def test_optimal_pace_scores_high(self):
        """4 WPS (optimal teaching pace) -> 1.0."""
        score = _speaking_pace_fitness(_make_word_timings(start=0.0, count=40, wps=4.0))
        assert score == 1.0, f"4 WPS scored {score}, expected 1.0"

    def test_three_wps_is_sweet_spot_edge(self):
        assert _speaking_pace_fitness(_make_word_timings(start=0.0, count=30, wps=3.0)) == 1.0

    def test_five_wps_is_sweet_spot_edge(self):
        score = _speaking_pace_fitness(_make_word_timings(start=0.0, count=50, wps=5.0))
        assert score > 0.95, f"5 WPS scored {score}, expected near 1.0"

    def test_too_slow_scores_lower(self):
        """1.5 WPS -> below sweet spot."""
        score = _speaking_pace_fitness(_make_word_timings(start=0.0, count=15, wps=1.5))
        assert 0.4 < score < 0.6, f"1.5 WPS scored {score}, expected ~0.5"

    def test_too_fast_scores_lower(self):
        """8 WPS -> above sweet spot."""
        score = _speaking_pace_fitness(_make_word_timings(start=0.0, count=80, wps=8.0))
        assert 0.0 < score < 1.0

    def test_very_fast_scores_zero(self):
        """10+ WPS -> 0."""
        assert _speaking_pace_fitness(_make_word_timings(start=0.0, count=110, wps=11.0)) == 0.0

    def test_zero_wps_scores_zero(self):
        """Very short duration -> neutral."""
        # Duration ~0.015s -> too short -> 0.5 (neutral)
        timings = [{"start": 0.0, "end": 0.01}, {"start": 0.005, "end": 0.015}]
        assert _speaking_pace_fitness(timings) == 0.5

    def test_score_bounded(self):
        for wps in [0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0]:
            timings = _make_word_timings(start=0.0, count=max(10, int(wps * 10)), wps=wps)
            score = _speaking_pace_fitness(timings)
            assert 0.0 <= score <= 1.0, f"WPS {wps} scored {score} out of bounds"
class TestBackwardCompatibility:
    """Ensure the weight rebalancing doesn't break existing relative orderings."""

    def test_ideal_still_beats_poor(self):
        ideal_score = score_moment(**_ideal_moment())["score"]
        poor_score = score_moment(**_poor_moment())["score"]
        assert ideal_score > poor_score

    def test_ideal_still_above_threshold(self):
        result = score_moment(**_ideal_moment())
        assert result["score"] > 0.6, f"Ideal scored {result['score']}, expected > 0.6"

    def test_poor_still_below_threshold(self):
        result = score_moment(**_poor_moment())
        assert result["score"] < 0.45, f"Poor scored {result['score']}, expected < 0.45"

    def test_weights_sum_to_one(self):
        from backend.pipeline.highlight_scorer import _WEIGHTS
        assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9