"""Heuristic scoring engine for highlight candidate detection. Takes KeyMoment data + context (source quality, video content type) and returns a composite score in [0, 1] with a 10-dimension breakdown. The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py: duration_score, content_density_score, technique_relevance_score, position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score, speech_rate_variance_score, pause_density_score, speaking_pace_score """ from __future__ import annotations import math import re import statistics from typing import Any # ── Weights per dimension (must sum to 1.0) ────────────────────────────────── _WEIGHTS: dict[str, float] = { "duration_score": 0.20, "content_density_score": 0.15, "technique_relevance_score": 0.15, "plugin_diversity_score": 0.08, "engagement_proxy_score": 0.08, "position_score": 0.08, # mapped from source_quality "uniqueness_score": 0.04, # mapped from video_type "speech_rate_variance_score": 0.08, "pause_density_score": 0.07, "speaking_pace_score": 0.07, } assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0" # ── Individual scoring functions ───────────────────────────────────────────── def _duration_fitness(duration_secs: float) -> float: """Bell-curve around 30-60s sweet spot. Peak at 30-60s (score 1.0), penalty below 15s and above 120s, zero above 300s. """ if duration_secs <= 0: return 0.0 if duration_secs >= 300: return 0.0 # Sweet spot: 30-60s → 1.0 if 30 <= duration_secs <= 60: return 1.0 # Below sweet spot: linear ramp from 0 at 0s to 1.0 at 30s # with steeper penalty below 15s if duration_secs < 30: if duration_secs < 15: return duration_secs / 30.0 # 0→0.5 over 0-15s return 0.5 + (duration_secs - 15) / 30.0 # 0.5→1.0 over 15-30s # Above sweet spot: gradual decay from 1.0 at 60s to 0.0 at 300s return max(0.0, 1.0 - (duration_secs - 60) / 240.0) def _content_type_weight(content_type: str | None) -> float: """Score based on KeyMoment content_type. technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4 """ mapping = { "technique": 1.0, "settings": 0.8, "workflow": 0.6, "reasoning": 0.4, } return mapping.get(content_type or "", 0.5) def _specificity_density(summary: str | None) -> float: """Score based on specificity signals in the summary. Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios) normalized by summary length. """ if not summary: return 0.0 words = summary.split() word_count = len(words) if word_count == 0: return 0.0 # Patterns that indicate specificity specificity_patterns = [ r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b", # units r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b", # ratios like 3/4 r"\b\d{2,}\b", # multi-digit numbers r"\b\d+\.\d+\b", # decimal numbers ] hits = 0 for pattern in specificity_patterns: hits += len(re.findall(pattern, summary, re.IGNORECASE)) # Normalize: ~1 specific value per 10 words is high density density = hits / (word_count / 10.0) return min(density, 1.0) def _plugin_richness(plugins: list[str] | None) -> float: """Score based on number of plugins mentioned. min(len(plugins) / 3, 1.0) """ if not plugins: return 0.0 return min(len(plugins) / 3.0, 1.0) def _transcript_energy(raw_transcript: str | None) -> float: """Score based on teaching/engagement phrases in transcript. Counts teaching phrases ('the trick is', 'notice how', 'because', 'I always', 'the key is', 'what I do') normalized by transcript word count. """ if not raw_transcript: return 0.0 words = raw_transcript.split() word_count = len(words) if word_count == 0: return 0.0 teaching_phrases = [ "the trick is", "notice how", "because", "i always", "the key is", "what i do", "important thing", "you want to", "make sure", "here's why", ] text_lower = raw_transcript.lower() hits = sum(text_lower.count(phrase) for phrase in teaching_phrases) # Normalize: ~1 phrase per 50 words is high energy energy = hits / (word_count / 50.0) return min(energy, 1.0) def _source_quality_weight(source_quality: str | None) -> float: """Score based on TechniquePage source_quality. structured=1.0, mixed=0.7, unstructured=0.4, None=0.5 """ mapping = { "structured": 1.0, "mixed": 0.7, "unstructured": 0.4, } return mapping.get(source_quality or "", 0.5) def _video_type_weight(video_content_type: str | None) -> float: """Score based on SourceVideo content_type. tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3 """ mapping = { "tutorial": 1.0, "breakdown": 0.9, "livestream": 0.5, "short_form": 0.3, } return mapping.get(video_content_type or "", 0.5) # ── Audio proxy scoring functions ───────────────────────────────────────────── def extract_word_timings( transcript_data: list[dict[str, Any]], start_time: float, end_time: float, ) -> list[dict[str, Any]]: """Extract word-level timing dicts from transcript segments within a time window. Parameters ---------- transcript_data : list[dict] Parsed transcript JSON — list of segments, each with a ``words`` array. Each word dict must have ``start`` and ``end`` float fields (seconds). start_time : float Window start in seconds (inclusive). end_time : float Window end in seconds (inclusive). Returns ------- list[dict] — word-timing dicts whose ``start`` falls within [start_time, end_time]. """ if not transcript_data: return [] words: list[dict[str, Any]] = [] for segment in transcript_data: seg_words = segment.get("words") if not seg_words: continue for w in seg_words: w_start = w.get("start") if w_start is None: continue if start_time <= w_start <= end_time: words.append(w) return words def _speech_rate_variance(word_timings: list[dict[str, Any]] | None) -> float: """Compute normalized stdev of words-per-second in sliding windows. High variance indicates emphasis shifts (speeding up / slowing down), which correlates with engaging highlights. Uses 5-second sliding windows with 2.5-second step. Returns 0.5 (neutral) when word_timings is None or insufficient data. """ if not word_timings or len(word_timings) < 4: return 0.5 # Determine time span first_start = word_timings[0].get("start", 0.0) last_start = word_timings[-1].get("start", 0.0) span = last_start - first_start if span < 5.0: return 0.5 # Compute WPS in 5s sliding windows with 2.5s step window_size = 5.0 step = 2.5 wps_values: list[float] = [] t = first_start while t + window_size <= last_start + 0.01: count = sum( 1 for w in word_timings if t <= w.get("start", 0.0) < t + window_size ) wps_values.append(count / window_size) t += step if len(wps_values) < 2: return 0.5 mean_wps = statistics.mean(wps_values) if mean_wps < 0.01: return 0.5 stdev = statistics.stdev(wps_values) # Normalize: coefficient of variation, capped at 1.0 # CV of ~0.3-0.5 is typical for varied speech; >0.5 is high variance cv = stdev / mean_wps return min(cv / 0.6, 1.0) def _pause_density(word_timings: list[dict[str, Any]] | None) -> float: """Count strategic pauses normalized by duration. Inter-word gaps >0.5s and inter-segment gaps >1.0s indicate deliberate pauses for emphasis, which correlate with better highlights. Returns 0.5 (neutral) when word_timings is None or insufficient data. """ if not word_timings or len(word_timings) < 2: return 0.5 first_start = word_timings[0].get("start", 0.0) last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0)) duration = last_end - first_start if duration < 1.0: return 0.5 short_pauses = 0 # >0.5s gaps long_pauses = 0 # >1.0s gaps for i in range(1, len(word_timings)): prev_end = word_timings[i - 1].get("end", word_timings[i - 1].get("start", 0.0)) curr_start = word_timings[i].get("start", 0.0) gap = curr_start - prev_end if gap > 1.0: long_pauses += 1 elif gap > 0.5: short_pauses += 1 # Weight long pauses more heavily weighted_pauses = short_pauses + long_pauses * 2.0 # Normalize: ~2-4 weighted pauses per 30s is good density density = weighted_pauses / (duration / 15.0) return min(density, 1.0) def _speaking_pace_fitness(word_timings: list[dict[str, Any]] | None) -> float: """Bell-curve score around 3-5 words-per-second optimal teaching pace. 3-5 WPS is the sweet spot for tutorial content — fast enough to be engaging, slow enough for comprehension. Returns 0.5 (neutral) when word_timings is None or insufficient data. """ if not word_timings or len(word_timings) < 2: return 0.5 first_start = word_timings[0].get("start", 0.0) last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0)) duration = last_end - first_start if duration < 1.0: return 0.5 wps = len(word_timings) / duration # Sweet spot: 3-5 WPS → 1.0 if 3.0 <= wps <= 5.0: return 1.0 # Below sweet spot: linear ramp from 0 at 0 WPS to 1.0 at 3 WPS if wps < 3.0: return max(0.0, wps / 3.0) # Above sweet spot: decay from 1.0 at 5 WPS to 0.0 at 10 WPS if wps > 5.0: return max(0.0, 1.0 - (wps - 5.0) / 5.0) return 0.5 # unreachable, but defensive # ── Main scoring function ─────────────────────────────────────────────────── def score_moment( *, start_time: float, end_time: float, content_type: str | None = None, summary: str | None = None, plugins: list[str] | None = None, raw_transcript: str | None = None, source_quality: str | None = None, video_content_type: str | None = None, word_timings: list[dict[str, Any]] | None = None, ) -> dict[str, Any]: """Score a KeyMoment for highlight potential. Parameters ---------- start_time : float Moment start in seconds. end_time : float Moment end in seconds. content_type : str | None KeyMoment content type (technique, settings, workflow, reasoning). summary : str | None KeyMoment summary text. plugins : list[str] | None Plugins mentioned in the moment. raw_transcript : str | None Raw transcript text of the moment. source_quality : str | None TechniquePage source quality (structured, mixed, unstructured). video_content_type : str | None SourceVideo content type (tutorial, breakdown, livestream, short_form). word_timings : list[dict] | None Word-level timing dicts with ``start`` and ``end`` keys (seconds). When None, audio proxy dimensions score 0.5 (neutral). Returns ------- dict with keys: score : float in [0.0, 1.0] score_breakdown : dict mapping dimension names to float scores duration_secs : float """ duration_secs = max(0.0, end_time - start_time) breakdown = { "duration_score": _duration_fitness(duration_secs), "content_density_score": _specificity_density(summary), "technique_relevance_score": _content_type_weight(content_type), "plugin_diversity_score": _plugin_richness(plugins), "engagement_proxy_score": _transcript_energy(raw_transcript), "position_score": _source_quality_weight(source_quality), "uniqueness_score": _video_type_weight(video_content_type), "speech_rate_variance_score": _speech_rate_variance(word_timings), "pause_density_score": _pause_density(word_timings), "speaking_pace_score": _speaking_pace_fitness(word_timings), } # Weighted composite composite = sum( breakdown[dim] * weight for dim, weight in _WEIGHTS.items() ) # Clamp to [0, 1] for safety composite = max(0.0, min(1.0, composite)) return { "score": composite, "score_breakdown": breakdown, "duration_secs": duration_secs, }