# chrysopedia/backend/pipeline/highlight_scorer.py

"""Heuristic scoring engine for highlight candidate detection.

Takes KeyMoment data + context (source quality, video content type) and
returns a composite score in [0, 1] with a 10-dimension breakdown.

The breakdown fields align with HighlightScoreBreakdown in highlight_schemas.py:
  duration_score, content_density_score, technique_relevance_score,
  position_score, uniqueness_score, engagement_proxy_score, plugin_diversity_score,
  speech_rate_variance_score, pause_density_score, speaking_pace_score
"""

from __future__ import annotations

import math
import re
import statistics
from typing import Any


# ── Weights per dimension (must sum to 1.0) ──────────────────────────────────

# Relative importance of each breakdown dimension in the composite score.
# Keys must match the keys of the ``breakdown`` dict built in ``score_moment``.
_WEIGHTS: dict[str, float] = {
    "duration_score": 0.20,
    "content_density_score": 0.15,
    "technique_relevance_score": 0.15,
    "plugin_diversity_score": 0.08,
    "engagement_proxy_score": 0.08,
    "position_score": 0.08,          # mapped from source_quality
    "uniqueness_score": 0.04,        # mapped from video_type
    "speech_rate_variance_score": 0.08,
    "pause_density_score": 0.07,
    "speaking_pace_score": 0.07,
}

# Checked with an explicit raise rather than ``assert`` so the invariant is
# still enforced when Python runs with ``-O`` (which strips assert statements).
# ``not (... < tol)`` also rejects a NaN total.
if not abs(sum(_WEIGHTS.values()) - 1.0) < 1e-9:
    raise ValueError("Weights must sum to 1.0")


# ── Individual scoring functions ─────────────────────────────────────────────

def _duration_fitness(duration_secs: float) -> float:
    """Score how well a clip length suits a highlight.

    Piecewise-linear curve with a flat top:

    * ``<= 0`` s or ``>= 300`` s -> 0.0
    * 0-30 s   -> rises linearly towards 1.0 (0.5 at 15 s)
    * 30-60 s  -> 1.0 (sweet spot)
    * 60-300 s -> falls linearly from 1.0 back to 0.0
    """
    # Empty/negative spans and very long clips earn nothing.
    if duration_secs <= 0 or duration_secs >= 300:
        return 0.0

    if duration_secs < 15:
        # Lower half of the ramp: 0 -> 0.5 across 0-15 s.
        return duration_secs / 30.0

    if duration_secs < 30:
        # Upper half of the ramp: 0.5 -> 1.0 across 15-30 s.
        # NOTE: algebraically this is the same slope (d / 30) as the branch
        # above, so the ramp is one straight line from 0 s to 30 s.
        return 0.5 + (duration_secs - 15) / 30.0

    if duration_secs <= 60:
        return 1.0

    # Past the sweet spot: lose 1/240 of the score per extra second.
    return max(0.0, 1.0 - (duration_secs - 60) / 240.0)


def _content_type_weight(content_type: str | None) -> float:
    """Score based on KeyMoment content_type.

    technique=1.0, settings=0.8, workflow=0.6, reasoning=0.4
    """
    mapping = {
        "technique": 1.0,
        "settings": 0.8,
        "workflow": 0.6,
        "reasoning": 0.4,
    }
    return mapping.get(content_type or "", 0.5)


def _specificity_density(summary: str | None) -> float:
    """Score based on specificity signals in the summary.

    Counts specific values (numbers, plugin names, dB, Hz, ms, %, ratios)
    normalized by summary length.
    """
    if not summary:
        return 0.0

    words = summary.split()
    word_count = len(words)
    if word_count == 0:
        return 0.0

    # Patterns that indicate specificity
    specificity_patterns = [
        r"\b\d+\.?\d*\s*(?:dB|Hz|kHz|ms|sec|bpm|%)\b",  # units
        r"\b\d+\.?\d*\s*/\s*\d+\.?\d*\b",                # ratios like 3/4
        r"\b\d{2,}\b",                                     # multi-digit numbers
        r"\b\d+\.\d+\b",                                   # decimal numbers
    ]

    hits = 0
    for pattern in specificity_patterns:
        hits += len(re.findall(pattern, summary, re.IGNORECASE))

    # Normalize: ~1 specific value per 10 words is high density
    density = hits / (word_count / 10.0)
    return min(density, 1.0)


def _plugin_richness(plugins: list[str] | None) -> float:
    """Score based on number of plugins mentioned.

    min(len(plugins) / 3, 1.0)
    """
    if not plugins:
        return 0.0
    return min(len(plugins) / 3.0, 1.0)


def _transcript_energy(raw_transcript: str | None) -> float:
    """Score based on teaching/engagement phrases in transcript.

    Counts teaching phrases ('the trick is', 'notice how', 'because',
    'I always', 'the key is', 'what I do') normalized by transcript
    word count.
    """
    if not raw_transcript:
        return 0.0

    words = raw_transcript.split()
    word_count = len(words)
    if word_count == 0:
        return 0.0

    teaching_phrases = [
        "the trick is",
        "notice how",
        "because",
        "i always",
        "the key is",
        "what i do",
        "important thing",
        "you want to",
        "make sure",
        "here's why",
    ]

    text_lower = raw_transcript.lower()
    hits = sum(text_lower.count(phrase) for phrase in teaching_phrases)

    # Normalize: ~1 phrase per 50 words is high energy
    energy = hits / (word_count / 50.0)
    return min(energy, 1.0)


def _source_quality_weight(source_quality: str | None) -> float:
    """Score based on TechniquePage source_quality.

    structured=1.0, mixed=0.7, unstructured=0.4, None=0.5
    """
    mapping = {
        "structured": 1.0,
        "mixed": 0.7,
        "unstructured": 0.4,
    }
    return mapping.get(source_quality or "", 0.5)


def _video_type_weight(video_content_type: str | None) -> float:
    """Score based on SourceVideo content_type.

    tutorial=1.0, breakdown=0.9, livestream=0.5, short_form=0.3
    """
    mapping = {
        "tutorial": 1.0,
        "breakdown": 0.9,
        "livestream": 0.5,
        "short_form": 0.3,
    }
    return mapping.get(video_content_type or "", 0.5)


# ── Audio proxy scoring functions ─────────────────────────────────────────────

def extract_word_timings(
    transcript_data: list[dict[str, Any]],
    start_time: float,
    end_time: float,
) -> list[dict[str, Any]]:
    """Collect the word-timing dicts that begin inside a time window.

    Parameters
    ----------
    transcript_data : list[dict]
        Transcript segments. Each segment may carry a ``words`` list whose
        items are dicts with a ``start`` value (seconds); segments without
        words are skipped.
    start_time : float
        Window start in seconds (inclusive).
    end_time : float
        Window end in seconds (inclusive).

    Returns
    -------
    list[dict]
        The original word dicts (not copies), in segment order, whose
        ``start`` lies in ``[start_time, end_time]``. Words with a missing or
        None ``start`` are dropped. Empty when ``transcript_data`` is falsy.
    """
    if not transcript_data:
        return []

    selected: list[dict[str, Any]] = []
    for segment in transcript_data:
        # A missing, None or empty "words" entry contributes nothing.
        for word in segment.get("words") or ():
            onset = word.get("start")
            if onset is not None and start_time <= onset <= end_time:
                selected.append(word)
    return selected


def _speech_rate_variance(word_timings: list[dict[str, Any]] | None) -> float:
    """Compute normalized stdev of words-per-second in sliding windows.

    High variance indicates emphasis shifts (speeding up / slowing down),
    which correlates with engaging highlights.

    Uses 5-second sliding windows with 2.5-second step.
    Returns 0.5 (neutral) when word_timings is None or insufficient data.
    """
    if not word_timings or len(word_timings) < 4:
        return 0.5

    # Determine time span
    first_start = word_timings[0].get("start", 0.0)
    last_start = word_timings[-1].get("start", 0.0)
    span = last_start - first_start
    if span < 5.0:
        return 0.5

    # Compute WPS in 5s sliding windows with 2.5s step
    window_size = 5.0
    step = 2.5
    wps_values: list[float] = []

    t = first_start
    while t + window_size <= last_start + 0.01:
        count = sum(
            1 for w in word_timings
            if t <= w.get("start", 0.0) < t + window_size
        )
        wps_values.append(count / window_size)
        t += step

    if len(wps_values) < 2:
        return 0.5

    mean_wps = statistics.mean(wps_values)
    if mean_wps < 0.01:
        return 0.5

    stdev = statistics.stdev(wps_values)
    # Normalize: coefficient of variation, capped at 1.0
    # CV of ~0.3-0.5 is typical for varied speech; >0.5 is high variance
    cv = stdev / mean_wps
    return min(cv / 0.6, 1.0)


def _pause_density(word_timings: list[dict[str, Any]] | None) -> float:
    """Count strategic pauses normalized by duration.

    Inter-word gaps >0.5s and inter-segment gaps >1.0s indicate deliberate
    pauses for emphasis, which correlate with better highlights.

    Returns 0.5 (neutral) when word_timings is None or insufficient data.
    """
    if not word_timings or len(word_timings) < 2:
        return 0.5

    first_start = word_timings[0].get("start", 0.0)
    last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0))
    duration = last_end - first_start
    if duration < 1.0:
        return 0.5

    short_pauses = 0  # >0.5s gaps
    long_pauses = 0   # >1.0s gaps

    for i in range(1, len(word_timings)):
        prev_end = word_timings[i - 1].get("end", word_timings[i - 1].get("start", 0.0))
        curr_start = word_timings[i].get("start", 0.0)
        gap = curr_start - prev_end

        if gap > 1.0:
            long_pauses += 1
        elif gap > 0.5:
            short_pauses += 1

    # Weight long pauses more heavily
    weighted_pauses = short_pauses + long_pauses * 2.0
    # Normalize: ~2-4 weighted pauses per 30s is good density
    density = weighted_pauses / (duration / 15.0)
    return min(density, 1.0)


def _speaking_pace_fitness(word_timings: list[dict[str, Any]] | None) -> float:
    """Bell-curve score around 3-5 words-per-second optimal teaching pace.

    3-5 WPS is the sweet spot for tutorial content — fast enough to be
    engaging, slow enough for comprehension. Returns 0.5 (neutral) when
    word_timings is None or insufficient data.
    """
    if not word_timings or len(word_timings) < 2:
        return 0.5

    first_start = word_timings[0].get("start", 0.0)
    last_end = word_timings[-1].get("end", word_timings[-1].get("start", 0.0))
    duration = last_end - first_start
    if duration < 1.0:
        return 0.5

    wps = len(word_timings) / duration

    # Sweet spot: 3-5 WPS → 1.0
    if 3.0 <= wps <= 5.0:
        return 1.0

    # Below sweet spot: linear ramp from 0 at 0 WPS to 1.0 at 3 WPS
    if wps < 3.0:
        return max(0.0, wps / 3.0)

    # Above sweet spot: decay from 1.0 at 5 WPS to 0.0 at 10 WPS
    if wps > 5.0:
        return max(0.0, 1.0 - (wps - 5.0) / 5.0)

    return 0.5  # unreachable, but defensive


# ── Main scoring function ───────────────────────────────────────────────────

def score_moment(
    *,
    start_time: float,
    end_time: float,
    content_type: str | None = None,
    summary: str | None = None,
    plugins: list[str] | None = None,
    raw_transcript: str | None = None,
    source_quality: str | None = None,
    video_content_type: str | None = None,
    word_timings: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Compute the highlight score of a KeyMoment.

    Each input feeds one scoring dimension; the dimensions are then combined
    into a single composite using the module-level ``_WEIGHTS`` table.

    Parameters
    ----------
    start_time : float
        Moment start in seconds.
    end_time : float
        Moment end in seconds. A value earlier than ``start_time`` is treated
        as a zero-length moment.
    content_type : str | None
        KeyMoment content type (technique, settings, workflow, reasoning).
    summary : str | None
        KeyMoment summary text.
    plugins : list[str] | None
        Plugins mentioned in the moment.
    raw_transcript : str | None
        Raw transcript text of the moment.
    source_quality : str | None
        TechniquePage source quality (structured, mixed, unstructured).
    video_content_type : str | None
        SourceVideo content type (tutorial, breakdown, livestream, short_form).
    word_timings : list[dict] | None
        Word-level timing dicts with ``start`` and ``end`` keys (seconds).
        When None, the three audio proxy dimensions score 0.5 (neutral).

    Returns
    -------
    dict with keys:
        score : float in [0.0, 1.0]
        score_breakdown : dict mapping dimension names to float scores
        duration_secs : float
    """
    duration_secs = max(0.0, end_time - start_time)

    # Metadata- and text-derived dimensions.
    breakdown: dict[str, float] = {}
    breakdown["duration_score"] = _duration_fitness(duration_secs)
    breakdown["content_density_score"] = _specificity_density(summary)
    breakdown["technique_relevance_score"] = _content_type_weight(content_type)
    breakdown["plugin_diversity_score"] = _plugin_richness(plugins)
    breakdown["engagement_proxy_score"] = _transcript_energy(raw_transcript)
    # These two schema fields carry context weights rather than what their
    # names suggest: source quality and video type respectively.
    breakdown["position_score"] = _source_quality_weight(source_quality)
    breakdown["uniqueness_score"] = _video_type_weight(video_content_type)

    # Audio proxy dimensions derived from word-level timing.
    breakdown["speech_rate_variance_score"] = _speech_rate_variance(word_timings)
    breakdown["pause_density_score"] = _pause_density(word_timings)
    breakdown["speaking_pace_score"] = _speaking_pace_fitness(word_timings)

    # Weighted sum, iterated in _WEIGHTS order.
    weighted_total = sum(
        breakdown[name] * _WEIGHTS[name] for name in _WEIGHTS
    )

    # Guard against any dimension straying outside [0, 1].
    clamped = max(0.0, min(1.0, weighted_total))

    return {
        "score": clamped,
        "score_breakdown": breakdown,
        "duration_secs": duration_secs,
    }