feat: Wired word-timing extraction into stage_highlight_detection — 62…

- "backend/pipeline/stages.py"
- ".gsd/KNOWLEDGE.md"

GSD-Task: S05/T02
This commit is contained in:
jlightner 2026-04-04 08:11:32 +00:00
parent 52df9c0dc2
commit 7cc9497f3c

View file

@@ -12,6 +12,7 @@ from __future__ import annotations
import hashlib import hashlib
import json import json
import logging import logging
import os
import re import re
import subprocess import subprocess
import time import time
@@ -2449,7 +2450,7 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) ->
Returns the video_id for chain compatibility. Returns the video_id for chain compatibility.
""" """
-from pipeline.highlight_scorer import score_moment
+from pipeline.highlight_scorer import extract_word_timings, score_moment
start = time.monotonic() start = time.monotonic()
logger.info("Highlight detection starting for video_id=%s", video_id) logger.info("Highlight detection starting for video_id=%s", video_id)
@@ -2457,6 +2458,47 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) ->
session = _get_sync_session() session = _get_sync_session()
try: try:
# ------------------------------------------------------------------
# Load transcript data once for the entire video (word-level timing)
# ------------------------------------------------------------------
transcript_data: list | None = None
source_video = session.execute(
select(SourceVideo).where(SourceVideo.id == video_id)
).scalar_one_or_none()
if source_video and source_video.transcript_path:
transcript_file = source_video.transcript_path
try:
with open(transcript_file, "r") as fh:
raw = json.load(fh)
# Accept both {"segments": [...]} and bare [...]
if isinstance(raw, dict):
transcript_data = raw.get("segments", raw.get("results", []))
elif isinstance(raw, list):
transcript_data = raw
else:
transcript_data = None
if transcript_data:
logger.info(
"Loaded transcript for video_id=%s (%d segments)",
video_id, len(transcript_data),
)
except FileNotFoundError:
logger.warning(
"Transcript file not found for video_id=%s: %s",
video_id, transcript_file,
)
except (json.JSONDecodeError, OSError) as io_exc:
logger.warning(
"Failed to load transcript for video_id=%s: %s",
video_id, io_exc,
)
else:
logger.info(
"No transcript_path for video_id=%s — audio proxy signals will be neutral",
video_id,
)
moments = ( moments = (
session.execute( session.execute(
select(KeyMoment) select(KeyMoment)
@@ -2480,6 +2522,13 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) ->
candidate_count = 0 candidate_count = 0
for moment in moments: for moment in moments:
try: try:
# Extract word-level timings for this moment's window
word_timings = None
if transcript_data:
word_timings = extract_word_timings(
transcript_data, moment.start_time, moment.end_time,
) or None # empty list → None for neutral fallback
result = score_moment( result = score_moment(
start_time=moment.start_time, start_time=moment.start_time,
end_time=moment.end_time, end_time=moment.end_time,
@@ -2489,6 +2538,7 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) ->
raw_transcript=moment.raw_transcript, raw_transcript=moment.raw_transcript,
source_quality=None, # filled below if technique_page loaded source_quality=None, # filled below if technique_page loaded
video_content_type=None, # filled below if source_video loaded video_content_type=None, # filled below if source_video loaded
word_timings=word_timings,
) )
except Exception as score_exc: except Exception as score_exc:
logger.warning( logger.warning(
@@ -2509,7 +2559,7 @@ def stage_highlight_detection(self, video_id: str, run_id: str | None = None) ->
duration_secs=result["duration_secs"], duration_secs=result["duration_secs"],
) )
stmt = stmt.on_conflict_do_update( stmt = stmt.on_conflict_do_update(
-constraint="uq_highlight_candidate_moment",
+constraint="highlight_candidates_key_moment_id_key",
set_={ set_={
"score": stmt.excluded.score, "score": stmt.excluded.score,
"score_breakdown": stmt.excluded.score_breakdown, "score_breakdown": stmt.excluded.score_breakdown,