"""Heuristic scoring engine for highlight candidate detection. Takes KeyMoment data + context (source quality, video content type) and returns a composite score in [0, 1] with a 7-dimension breakdown. The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py: duration_score, content_density_score, technique_relevance_score, position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score """ from __future__ import annotations import math import re from typing import Any # ── Weights per dimension (must sum to 1.0) ────────────────────────────────── _WEIGHTS: dict[str, float] = { "duration_score": 0.25, "content_density_score": 0.20, "technique_relevance_score": 0.20, "plugin_diversity_score": 0.10, "engagement_proxy_score": 0.10, "position_score": 0.10, # mapped from source_quality "uniqueness_score": 0.05, # mapped from video_type } assert abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0" # ── Individual scoring functions ───────────────────────────────────────────── def _duration_fitness(duration_secs: float) -> float: """Bell-curve around 30-60s sweet spot. Peak at 30-60s (score 1.0), penalty below 15s and above 120s, zero above 300s. """ if duration_secs <= 0: return 0.0 if duration_secs >= 300: return 0.0 # Sweet spot: 30-60s → 1.0 if 30 <= duration_secs <= 60: return 1.0 # Below sweet spot: linear ramp from 0 at 0s to 1.0 at 30s # with steeper penalty below 15s if duration_secs < 30: if duration_secs < 15: return duration_secs / 30.0 # 0→0.5 over 0-15s return 0.5 + (duration_secs - 15) / 30.0 # 0.5→1.0 over 15-30s # Above sweet spot: gradual decay from 1.0 at 60s to 0.0 at 300s return max(0.0, 1.0 - (duration_secs - 60) / 240.0) def _content_type_weight(content_type: str | None) -> float: """Score based on KeyMoment content_type. technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4 """ mapping = { "technique": 1.0, "settings": 0.8, "workflow": 0.6, "reasoning": 0.4, } return mapping.get(content_type or "", 0.5) def _specificity_density(summary: str | None) -> float: """Score based on specificity signals in the summary. Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios) normalized by summary length. """ if not summary: return 0.0 words = summary.split() word_count = len(words) if word_count == 0: return 0.0 # Patterns that indicate specificity specificity_patterns = [ r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b", # units r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b", # ratios like 3/4 r"\b\d{2,}\b", # multi-digit numbers r"\b\d+\.\d+\b", # decimal numbers ] hits = 0 for pattern in specificity_patterns: hits += len(re.findall(pattern, summary, re.IGNORECASE)) # Normalize: ~1 specific value per 10 words is high density density = hits / (word_count / 10.0) return min(density, 1.0) def _plugin_richness(plugins: list[str] | None) -> float: """Score based on number of plugins mentioned. min(len(plugins) / 3, 1.0) """ if not plugins: return 0.0 return min(len(plugins) / 3.0, 1.0) def _transcript_energy(raw_transcript: str | None) -> float: """Score based on teaching/engagement phrases in transcript. Counts teaching phrases ('the trick is', 'notice how', 'because', 'I always', 'the key is', 'what I do') normalized by transcript word count. """ if not raw_transcript: return 0.0 words = raw_transcript.split() word_count = len(words) if word_count == 0: return 0.0 teaching_phrases = [ "the trick is", "notice how", "because", "i always", "the key is", "what i do", "important thing", "you want to", "make sure", "here's why", ] text_lower = raw_transcript.lower() hits = sum(text_lower.count(phrase) for phrase in teaching_phrases) # Normalize: ~1 phrase per 50 words is high energy energy = hits / (word_count / 50.0) return min(energy, 1.0) def _source_quality_weight(source_quality: str | None) -> float: """Score based on TechniquePage source_quality. structured=1.0, mixed=0.7, unstructured=0.4, None=0.5 """ mapping = { "structured": 1.0, "mixed": 0.7, "unstructured": 0.4, } return mapping.get(source_quality or "", 0.5) def _video_type_weight(video_content_type: str | None) -> float: """Score based on SourceVideo content_type. tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3 """ mapping = { "tutorial": 1.0, "breakdown": 0.9, "livestream": 0.5, "short_form": 0.3, } return mapping.get(video_content_type or "", 0.5) # ── Main scoring function ─────────────────────────────────────────────────── def score_moment( *, start_time: float, end_time: float, content_type: str | None = None, summary: str | None = None, plugins: list[str] | None = None, raw_transcript: str | None = None, source_quality: str | None = None, video_content_type: str | None = None, ) -> dict[str, Any]: """Score a KeyMoment for highlight potential. Parameters ---------- start_time : float Moment start in seconds. end_time : float Moment end in seconds. content_type : str | None KeyMoment content type (technique, settings, workflow, reasoning). summary : str | None KeyMoment summary text. plugins : list[str] | None Plugins mentioned in the moment. raw_transcript : str | None Raw transcript text of the moment. source_quality : str | None TechniquePage source quality (structured, mixed, unstructured). video_content_type : str | None SourceVideo content type (tutorial, breakdown, livestream, short_form). Returns ------- dict with keys: score : float in [0.0, 1.0] score_breakdown : dict mapping dimension names to float scores duration_secs : float """ duration_secs = max(0.0, end_time - start_time) breakdown = { "duration_score": _duration_fitness(duration_secs), "content_density_score": _specificity_density(summary), "technique_relevance_score": _content_type_weight(content_type), "plugin_diversity_score": _plugin_richness(plugins), "engagement_proxy_score": _transcript_energy(raw_transcript), "position_score": _source_quality_weight(source_quality), "uniqueness_score": _video_type_weight(video_content_type), } # Weighted composite composite = sum( breakdown[dim] * weight for dim, weight in _WEIGHTS.items() ) # Clamp to [0, 1] for safety composite = max(0.0, min(1.0, composite)) return { "score": composite, "score_breakdown": breakdown, "duration_secs": duration_secs, }