chrysopedia/backend/pipeline/highlight_scorer.py
jlightner 2d7b812c6a test: Implemented pure-function scoring engine with 7 weighted dimensio…
- "backend/pipeline/highlight_scorer.py"
- "backend/pipeline/test_highlight_scorer.py"

GSD-Task: S04/T02
2026-04-04 05:33:04 +00:00

244 lines
7.4 KiB
Python

"""Heuristic scoring engine for highlight candidate detection.
Takes KeyMoment data plus context (source quality, video content type) and
returns a composite score in [0, 1] together with a 7-dimension breakdown.
The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py:
duration_score, content_density_score, technique_relevance_score,
position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score.
Here position_score is derived from the source quality and uniqueness_score
from the video content type.
"""
from __future__ import annotations
import math
import re
from typing import Any
# ── Weights per dimension (must sum to 1.0) ──────────────────────────────────
_WEIGHTS: dict[str, float] = {
"duration_score": 0.25,
"content_density_score": 0.20,
"technique_relevance_score": 0.20,
"plugin_diversity_score": 0.10,
"engagement_proxy_score": 0.10,
"position_score": 0.10, # mapped from source_quality
"uniqueness_score": 0.05, # mapped from video_type
}
assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0"
# ── Individual scoring functions ─────────────────────────────────────────────
def _duration_fitness(duration_secs: float) -> float:
"""Bell-curve around 30-60s sweet spot.
Peak at 30-60s (score 1.0), penalty below 15s and above 120s,
zero above 300s.
"""
if duration_secs <= 0:
return 0.0
if duration_secs >= 300:
return 0.0
# Sweet spot: 30-60s → 1.0
if 30 <= duration_secs <= 60:
return 1.0
# Below sweet spot: linear ramp from 0 at 0s to 1.0 at 30s
# with steeper penalty below 15s
if duration_secs < 30:
if duration_secs < 15:
return duration_secs / 30.0 # 0→0.5 over 0-15s
return 0.5 + (duration_secs - 15) / 30.0 # 0.5→1.0 over 15-30s
# Above sweet spot: gradual decay from 1.0 at 60s to 0.0 at 300s
return max(0.0, 1.0 - (duration_secs - 60) / 240.0)
def _content_type_weight(content_type: str | None) -> float:
"""Score based on KeyMoment content_type.
technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4
"""
mapping = {
"technique": 1.0,
"settings": 0.8,
"workflow": 0.6,
"reasoning": 0.4,
}
return mapping.get(content_type or "", 0.5)
def _specificity_density(summary: str | None) -> float:
"""Score based on specificity signals in the summary.
Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios)
normalized by summary length.
"""
if not summary:
return 0.0
words = summary.split()
word_count = len(words)
if word_count == 0:
return 0.0
# Patterns that indicate specificity
specificity_patterns = [
r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b", # units
r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b", # ratios like 3/4
r"\b\d{2,}\b", # multi-digit numbers
r"\b\d+\.\d+\b", # decimal numbers
]
hits = 0
for pattern in specificity_patterns:
hits += len(re.findall(pattern, summary, re.IGNORECASE))
# Normalize: ~1 specific value per 10 words is high density
density = hits / (word_count / 10.0)
return min(density, 1.0)
def _plugin_richness(plugins: list[str] | None) -> float:
"""Score based on number of plugins mentioned.
min(len(plugins) / 3, 1.0)
"""
if not plugins:
return 0.0
return min(len(plugins) / 3.0, 1.0)
def _transcript_energy(raw_transcript: str | None) -> float:
"""Score based on teaching/engagement phrases in transcript.
Counts teaching phrases ('the trick is', 'notice how', 'because',
'I always', 'the key is', 'what I do') normalized by transcript
word count.
"""
if not raw_transcript:
return 0.0
words = raw_transcript.split()
word_count = len(words)
if word_count == 0:
return 0.0
teaching_phrases = [
"the trick is",
"notice how",
"because",
"i always",
"the key is",
"what i do",
"important thing",
"you want to",
"make sure",
"here's why",
]
text_lower = raw_transcript.lower()
hits = sum(text_lower.count(phrase) for phrase in teaching_phrases)
# Normalize: ~1 phrase per 50 words is high energy
energy = hits / (word_count / 50.0)
return min(energy, 1.0)
def _source_quality_weight(source_quality: str | None) -> float:
"""Score based on TechniquePage source_quality.
structured=1.0, mixed=0.7, unstructured=0.4, None=0.5
"""
mapping = {
"structured": 1.0,
"mixed": 0.7,
"unstructured": 0.4,
}
return mapping.get(source_quality or "", 0.5)
def _video_type_weight(video_content_type: str | None) -> float:
"""Score based on SourceVideo content_type.
tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3
"""
mapping = {
"tutorial": 1.0,
"breakdown": 0.9,
"livestream": 0.5,
"short_form": 0.3,
}
return mapping.get(video_content_type or "", 0.5)
# ── Main scoring function ───────────────────────────────────────────────────
def score_moment(
    *,
    start_time: float,
    end_time: float,
    content_type: str | None = None,
    summary: str | None = None,
    plugins: list[str] | None = None,
    raw_transcript: str | None = None,
    source_quality: str | None = None,
    video_content_type: str | None = None,
) -> dict[str, Any]:
    """Compute the highlight score of a KeyMoment.

    Each of the seven dimensions is scored independently and the composite
    is the weighted sum of those scores using ``_WEIGHTS``, clamped to
    [0.0, 1.0]. All arguments are keyword-only.

    Parameters
    ----------
    start_time : float
        Moment start in seconds.
    end_time : float
        Moment end in seconds. If it precedes ``start_time`` the duration is
        treated as 0.
    content_type : str | None
        KeyMoment content type (technique, settings, workflow, reasoning).
    summary : str | None
        KeyMoment summary text.
    plugins : list[str] | None
        Plugins mentioned in the moment.
    raw_transcript : str | None
        Raw transcript text of the moment.
    source_quality : str | None
        TechniquePage source quality (structured, mixed, unstructured).
    video_content_type : str | None
        SourceVideo content type (tutorial, breakdown, livestream, short_form).

    Returns
    -------
    dict with keys:
        score : float in [0.0, 1.0]
        score_breakdown : dict mapping dimension names to float scores
        duration_secs : float
    """
    elapsed = end_time - start_time
    duration_secs = max(0.0, elapsed)

    # Score every dimension up front, in breakdown order.
    duration_part = _duration_fitness(duration_secs)
    density_part = _specificity_density(summary)
    relevance_part = _content_type_weight(content_type)
    plugin_part = _plugin_richness(plugins)
    engagement_part = _transcript_energy(raw_transcript)
    quality_part = _source_quality_weight(source_quality)
    video_part = _video_type_weight(video_content_type)

    breakdown = {
        "duration_score": duration_part,
        "content_density_score": density_part,
        "technique_relevance_score": relevance_part,
        "plugin_diversity_score": plugin_part,
        "engagement_proxy_score": engagement_part,
        "position_score": quality_part,  # fed from source_quality
        "uniqueness_score": video_part,  # fed from video_content_type
    }

    # Weighted composite over the dimensions listed in _WEIGHTS, then a
    # defensive clamp to [0, 1].
    weighted_total = sum(
        weight * breakdown[name] for name, weight in _WEIGHTS.items()
    )
    clamped = max(0.0, min(1.0, weighted_total))

    return {
        "score": clamped,
        "score_breakdown": breakdown,
        "duration_secs": duration_secs,
    }