- "backend/pipeline/highlight_scorer.py" - "backend/pipeline/highlight_schemas.py" - "backend/pipeline/test_highlight_scorer.py" GSD-Task: S05/T01
413 lines
13 KiB
Python
413 lines
13 KiB
Python
"""Heuristic scoring engine for highlight candidate detection.
|
|
|
|
Takes KeyMoment data + context (source quality, video content type) and
|
|
returns a composite score in [0, 1] with a 10-dimension breakdown.
|
|
|
|
The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py:
|
|
duration_score, content_density_score, technique_relevance_score,
|
|
position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score,
|
|
speech_rate_variance_score, pause_density_score, speaking_pace_score
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
import re
|
|
import statistics
|
|
from typing import Any
|
|
|
|
|
|
# ── Weights per dimension (must sum to 1.0) ──────────────────────────────────
|
|
|
|
_WEIGHTS: dict[str, float] = {
|
|
"duration_score": 0.20,
|
|
"content_density_score": 0.15,
|
|
"technique_relevance_score": 0.15,
|
|
"plugin_diversity_score": 0.08,
|
|
"engagement_proxy_score": 0.08,
|
|
"position_score": 0.08, # mapped from source_quality
|
|
"uniqueness_score": 0.04, # mapped from video_type
|
|
"speech_rate_variance_score": 0.08,
|
|
"pause_density_score": 0.07,
|
|
"speaking_pace_score": 0.07,
|
|
}
|
|
|
|
assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0"
|
|
|
|
|
|
# ── Individual scoring functions ─────────────────────────────────────────────
|
|
|
|
def _duration_fitness(duration_secs: float) -> float:
|
|
"""Bell-curve around 30-60s sweet spot.
|
|
|
|
Peak at 30-60s (score 1.0), penalty below 15s and above 120s,
|
|
zero above 300s.
|
|
"""
|
|
if duration_secs <= 0:
|
|
return 0.0
|
|
if duration_secs >= 300:
|
|
return 0.0
|
|
|
|
# Sweet spot: 30-60s → 1.0
|
|
if 30 <= duration_secs <= 60:
|
|
return 1.0
|
|
|
|
# Below sweet spot: linear ramp from 0 at 0s to 1.0 at 30s
|
|
# with steeper penalty below 15s
|
|
if duration_secs < 30:
|
|
if duration_secs < 15:
|
|
return duration_secs / 30.0 # 0→0.5 over 0-15s
|
|
return 0.5 + (duration_secs - 15) / 30.0 # 0.5→1.0 over 15-30s
|
|
|
|
# Above sweet spot: gradual decay from 1.0 at 60s to 0.0 at 300s
|
|
return max(0.0, 1.0 - (duration_secs - 60) / 240.0)
|
|
|
|
|
|
def _content_type_weight(content_type: str | None) -> float:
|
|
"""Score based on KeyMoment content_type.
|
|
|
|
technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4
|
|
"""
|
|
mapping = {
|
|
"technique": 1.0,
|
|
"settings": 0.8,
|
|
"workflow": 0.6,
|
|
"reasoning": 0.4,
|
|
}
|
|
return mapping.get(content_type or "", 0.5)
|
|
|
|
|
|
def _specificity_density(summary: str | None) -> float:
|
|
"""Score based on specificity signals in the summary.
|
|
|
|
Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios)
|
|
normalized by summary length.
|
|
"""
|
|
if not summary:
|
|
return 0.0
|
|
|
|
words = summary.split()
|
|
word_count = len(words)
|
|
if word_count == 0:
|
|
return 0.0
|
|
|
|
# Patterns that indicate specificity
|
|
specificity_patterns = [
|
|
r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b", # units
|
|
r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b", # ratios like 3/4
|
|
r"\b\d{2,}\b", # multi-digit numbers
|
|
r"\b\d+\.\d+\b", # decimal numbers
|
|
]
|
|
|
|
hits = 0
|
|
for pattern in specificity_patterns:
|
|
hits += len(re.findall(pattern, summary, re.IGNORECASE))
|
|
|
|
# Normalize: ~1 specific value per 10 words is high density
|
|
density = hits / (word_count / 10.0)
|
|
return min(density, 1.0)
|
|
|
|
|
|
def _plugin_richness(plugins: list[str] | None) -> float:
|
|
"""Score based on number of plugins mentioned.
|
|
|
|
min(len(plugins) / 3, 1.0)
|
|
"""
|
|
if not plugins:
|
|
return 0.0
|
|
return min(len(plugins) / 3.0, 1.0)
|
|
|
|
|
|
def _transcript_energy(raw_transcript: str | None) -> float:
|
|
"""Score based on teaching/engagement phrases in transcript.
|
|
|
|
Counts teaching phrases ('the trick is', 'notice how', 'because',
|
|
'I always', 'the key is', 'what I do') normalized by transcript
|
|
word count.
|
|
"""
|
|
if not raw_transcript:
|
|
return 0.0
|
|
|
|
words = raw_transcript.split()
|
|
word_count = len(words)
|
|
if word_count == 0:
|
|
return 0.0
|
|
|
|
teaching_phrases = [
|
|
"the trick is",
|
|
"notice how",
|
|
"because",
|
|
"i always",
|
|
"the key is",
|
|
"what i do",
|
|
"important thing",
|
|
"you want to",
|
|
"make sure",
|
|
"here's why",
|
|
]
|
|
|
|
text_lower = raw_transcript.lower()
|
|
hits = sum(text_lower.count(phrase) for phrase in teaching_phrases)
|
|
|
|
# Normalize: ~1 phrase per 50 words is high energy
|
|
energy = hits / (word_count / 50.0)
|
|
return min(energy, 1.0)
|
|
|
|
|
|
def _source_quality_weight(source_quality: str | None) -> float:
|
|
"""Score based on TechniquePage source_quality.
|
|
|
|
structured=1.0, mixed=0.7, unstructured=0.4, None=0.5
|
|
"""
|
|
mapping = {
|
|
"structured": 1.0,
|
|
"mixed": 0.7,
|
|
"unstructured": 0.4,
|
|
}
|
|
return mapping.get(source_quality or "", 0.5)
|
|
|
|
|
|
def _video_type_weight(video_content_type: str | None) -> float:
|
|
"""Score based on SourceVideo content_type.
|
|
|
|
tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3
|
|
"""
|
|
mapping = {
|
|
"tutorial": 1.0,
|
|
"breakdown": 0.9,
|
|
"livestream": 0.5,
|
|
"short_form": 0.3,
|
|
}
|
|
return mapping.get(video_content_type or "", 0.5)
|
|
|
|
|
|
# ── Audio proxy scoring functions ─────────────────────────────────────────────
|
|
|
|
def extract_word_timings(
    transcript_data: list[dict[str, Any]],
    start_time: float,
    end_time: float,
) -> list[dict[str, Any]]:
    """Collect word-timing dicts whose start falls inside a time window.

    Parameters
    ----------
    transcript_data : list[dict]
        Parsed transcript segments; each may carry a ``words`` list of
        dicts with float ``start``/``end`` fields in seconds.
    start_time : float
        Window start in seconds (inclusive).
    end_time : float
        Window end in seconds (inclusive).

    Returns
    -------
    list[dict]
        Word dicts, in original order, whose ``start`` lies in
        [start_time, end_time]. Words without a ``start`` are skipped.
    """
    if not transcript_data:
        return []

    selected: list[dict[str, Any]] = []
    for segment in transcript_data:
        for word in segment.get("words") or []:
            word_start = word.get("start")
            if word_start is not None and start_time <= word_start <= end_time:
                selected.append(word)
    return selected
|
|
|
|
|
|
def _speech_rate_variance(word_timings: list[dict[str, Any]] | None) -> float:
|
|
"""Compute normalized stdev of words-per-second in sliding windows.
|
|
|
|
High variance indicates emphasis shifts (speeding up / slowing down),
|
|
which correlates with engaging highlights.
|
|
|
|
Uses 5-second sliding windows with 2.5-second step.
|
|
Returns 0.5 (neutral) when word_timings is None or insufficient data.
|
|
"""
|
|
if not word_timings or len(word_timings) < 4:
|
|
return 0.5
|
|
|
|
# Determine time span
|
|
first_start = word_timings[0].get("start", 0.0)
|
|
last_start = word_timings[-1].get("start", 0.0)
|
|
span = last_start - first_start
|
|
if span < 5.0:
|
|
return 0.5
|
|
|
|
# Compute WPS in 5s sliding windows with 2.5s step
|
|
window_size = 5.0
|
|
step = 2.5
|
|
wps_values: list[float] = []
|
|
|
|
t = first_start
|
|
while t + window_size <= last_start + 0.01:
|
|
count = sum(
|
|
1 for w in word_timings
|
|
if t <= w.get("start", 0.0) < t + window_size
|
|
)
|
|
wps_values.append(count / window_size)
|
|
t += step
|
|
|
|
if len(wps_values) < 2:
|
|
return 0.5
|
|
|
|
mean_wps = statistics.mean(wps_values)
|
|
if mean_wps < 0.01:
|
|
return 0.5
|
|
|
|
stdev = statistics.stdev(wps_values)
|
|
# Normalize: coefficient of variation, capped at 1.0
|
|
# CV of ~0.3-0.5 is typical for varied speech; >0.5 is high variance
|
|
cv = stdev / mean_wps
|
|
return min(cv / 0.6, 1.0)
|
|
|
|
|
|
def _pause_density(word_timings: list[dict[str, Any]] | None) -> float:
|
|
"""Count strategic pauses normalized by duration.
|
|
|
|
Inter-word gaps >0.5s and inter-segment gaps >1.0s indicate deliberate
|
|
pauses for emphasis, which correlate with better highlights.
|
|
|
|
Returns 0.5 (neutral) when word_timings is None or insufficient data.
|
|
"""
|
|
if not word_timings or len(word_timings) < 2:
|
|
return 0.5
|
|
|
|
first_start = word_timings[0].get("start", 0.0)
|
|
last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0))
|
|
duration = last_end - first_start
|
|
if duration < 1.0:
|
|
return 0.5
|
|
|
|
short_pauses = 0 # >0.5s gaps
|
|
long_pauses = 0 # >1.0s gaps
|
|
|
|
for i in range(1, len(word_timings)):
|
|
prev_end = word_timings[i - 1].get("end", word_timings[i - 1].get("start", 0.0))
|
|
curr_start = word_timings[i].get("start", 0.0)
|
|
gap = curr_start - prev_end
|
|
|
|
if gap > 1.0:
|
|
long_pauses += 1
|
|
elif gap > 0.5:
|
|
short_pauses += 1
|
|
|
|
# Weight long pauses more heavily
|
|
weighted_pauses = short_pauses + long_pauses * 2.0
|
|
# Normalize: ~2-4 weighted pauses per 30s is good density
|
|
density = weighted_pauses / (duration / 15.0)
|
|
return min(density, 1.0)
|
|
|
|
|
|
def _speaking_pace_fitness(word_timings: list[dict[str, Any]] | None) -> float:
|
|
"""Bell-curve score around 3-5 words-per-second optimal teaching pace.
|
|
|
|
3-5 WPS is the sweet spot for tutorial content — fast enough to be
|
|
engaging, slow enough for comprehension. Returns 0.5 (neutral) when
|
|
word_timings is None or insufficient data.
|
|
"""
|
|
if not word_timings or len(word_timings) < 2:
|
|
return 0.5
|
|
|
|
first_start = word_timings[0].get("start", 0.0)
|
|
last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0))
|
|
duration = last_end - first_start
|
|
if duration < 1.0:
|
|
return 0.5
|
|
|
|
wps = len(word_timings) / duration
|
|
|
|
# Sweet spot: 3-5 WPS → 1.0
|
|
if 3.0 <= wps <= 5.0:
|
|
return 1.0
|
|
|
|
# Below sweet spot: linear ramp from 0 at 0 WPS to 1.0 at 3 WPS
|
|
if wps < 3.0:
|
|
return max(0.0, wps / 3.0)
|
|
|
|
# Above sweet spot: decay from 1.0 at 5 WPS to 0.0 at 10 WPS
|
|
if wps > 5.0:
|
|
return max(0.0, 1.0 - (wps - 5.0) / 5.0)
|
|
|
|
return 0.5 # unreachable, but defensive
|
|
|
|
|
|
# ── Main scoring function ───────────────────────────────────────────────────
|
|
|
|
def score_moment(
    *,
    start_time: float,
    end_time: float,
    content_type: str | None = None,
    summary: str | None = None,
    plugins: list[str] | None = None,
    raw_transcript: str | None = None,
    source_quality: str | None = None,
    video_content_type: str | None = None,
    word_timings: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Compute the composite highlight score for a single KeyMoment.

    Each of the ten dimensions is scored independently in [0, 1] by its
    helper, then combined using the fixed ``_WEIGHTS`` into a composite
    clamped to [0, 1].

    Parameters
    ----------
    start_time : float
        Moment start in seconds.
    end_time : float
        Moment end in seconds; duration is clamped to >= 0.
    content_type : str | None
        KeyMoment content type (technique, settings, workflow, reasoning).
    summary : str | None
        KeyMoment summary text.
    plugins : list[str] | None
        Plugins mentioned in the moment.
    raw_transcript : str | None
        Raw transcript text of the moment.
    source_quality : str | None
        TechniquePage source quality (structured, mixed, unstructured).
    video_content_type : str | None
        SourceVideo content type (tutorial, breakdown, livestream,
        short_form).
    word_timings : list[dict] | None
        Word-level timing dicts with ``start``/``end`` keys (seconds).
        When None, the three audio-proxy dimensions score a neutral 0.5.

    Returns
    -------
    dict
        ``score`` (float in [0.0, 1.0]), ``score_breakdown`` (dimension
        name → float score), and ``duration_secs`` (float).
    """
    duration_secs = max(0.0, end_time - start_time)

    score_breakdown: dict[str, float] = {
        "duration_score": _duration_fitness(duration_secs),
        "content_density_score": _specificity_density(summary),
        "technique_relevance_score": _content_type_weight(content_type),
        "plugin_diversity_score": _plugin_richness(plugins),
        "engagement_proxy_score": _transcript_energy(raw_transcript),
        "position_score": _source_quality_weight(source_quality),
        "uniqueness_score": _video_type_weight(video_content_type),
        "speech_rate_variance_score": _speech_rate_variance(word_timings),
        "pause_density_score": _pause_density(word_timings),
        "speaking_pace_score": _speaking_pace_fitness(word_timings),
    }

    # Weighted composite across all dimensions, in _WEIGHTS order.
    weighted_total = 0.0
    for dimension, weight in _WEIGHTS.items():
        weighted_total += score_breakdown[dimension] * weight

    # Clamp defensively; the weights sum to 1.0 so this is a no-op in
    # normal operation.
    return {
        "score": max(0.0, min(1.0, weighted_total)),
        "score_breakdown": score_breakdown,
        "duration_secs": duration_secs,
    }
|