From 7cc9497f3cf08c58da7377b002e44110ac0efff0 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sat, 4 Apr 2026 08:11:32 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Wired=20word-timing=20extraction=20into?= =?UTF-8?q?=20stage=5Fhighlight=5Fdetection=20=E2=80=94=2062=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/pipeline/stages.py" - ".gsd/KNOWLEDGE.md" GSD-Task: S05/T02 --- backend/pipeline/stages.py | 54 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/backend/pipeline/stages.py b/backend/pipeline/stages.py index 0add68a..5289679 100644 --- a/backend/pipeline/stages.py +++ b/backend/pipeline/stages.py @@ -12,6 +12,7 @@ from __future__ import annotations import hashlib import json import logging +import os import re import subprocess import time @@ -2449,7 +2450,7 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) -> Returns the video_id for chain compatibility. """ - from pipeline.highlight_scorer import score_moment + from pipeline.highlight_scorer import extract_word_timings, score_moment start = time.monotonic() logger.info("Highlight detection starting for video_id=%s", video_id) @@ -2457,6 +2458,47 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) -> session = _get_sync_session() try: + # ------------------------------------------------------------------ + # Load transcript data once for the entire video (word-level timing) + # ------------------------------------------------------------------ + transcript_data: list | None = None + source_video = session.execute( + select(SourceVideo).where(SourceVideo.id == video_id) + ).scalar_one_or_none() + + if source_video and source_video.transcript_path: + transcript_file = source_video.transcript_path + try: + with open(transcript_file, "r") as fh: + raw = json.load(fh) + # Accept both {"segments": [...]} and bare [...] + if isinstance(raw, dict): + transcript_data = raw.get("segments", raw.get("results", [])) + elif isinstance(raw, list): + transcript_data = raw + else: + transcript_data = None + if transcript_data: + logger.info( + "Loaded transcript for video_id=%s (%d segments)", + video_id, len(transcript_data), + ) + except FileNotFoundError: + logger.warning( + "Transcript file not found for video_id=%s: %s", + video_id, transcript_file, + ) + except (json.JSONDecodeError, OSError) as io_exc: + logger.warning( + "Failed to load transcript for video_id=%s: %s", + video_id, io_exc, + ) + else: + logger.info( + "No transcript_path for video_id=%s — audio proxy signals will be neutral", + video_id, + ) + moments = ( session.execute( select(KeyMoment) @@ -2480,6 +2522,13 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) -> candidate_count = 0 for moment in moments: try: + # Extract word-level timings for this moment's window + word_timings = None + if transcript_data: + word_timings = extract_word_timings( + transcript_data, moment.start_time, moment.end_time, + ) or None # empty list → None for neutral fallback + result = score_moment( start_time=moment.start_time, end_time=moment.end_time, @@ -2489,6 +2538,7 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) -> raw_transcript=moment.raw_transcript, source_quality=None, # filled below if technique_page loaded video_content_type=None, # filled below if source_video loaded + word_timings=word_timings, ) except Exception as score_exc: logger.warning( @@ -2509,7 +2559,7 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) -> duration_secs=result["duration_secs"], ) stmt = stmt.on_conflict_do_update( - constraint="uq_highlight_candidate_moment", + constraint="highlight_candidates_key_moment_id_key", set_={ "score": stmt.excluded.score, "score_breakdown": stmt.excluded.score_breakdown,