fix: Moment-to-page linking via moment_indices in stage 5 synthesis

When the LLM splits a category group into multiple technique pages,
moments were blanket-linked to the last page in the loop, leaving all
other pages as orphans with 0 key moments (48 out of 204 pages affected).

Added moment_indices field to SynthesizedPage schema and synthesis prompt
so the LLM explicitly declares which input moments each page covers.
Stage 5 now uses these indices for targeted linking instead of the broken
blanket approach. Tags are also computed per-page from linked moments
only, fixing cross-contamination (e.g. "stereo imaging" tag appearing
on gain staging pages).

Deleted 48 orphan technique pages from the database.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
jlightner 2026-04-01 00:34:37 -05:00
parent 9b2db11095
commit c344b8c670
4 changed files with 304 additions and 45 deletions

View file

@ -47,6 +47,9 @@ class Settings(BaseSettings):
llm_max_tokens_hard_limit: int = 32768 # Hard ceiling for dynamic estimator llm_max_tokens_hard_limit: int = 32768 # Hard ceiling for dynamic estimator
llm_max_tokens: int = 65536 # Fallback when no estimate is provided llm_max_tokens: int = 65536 # Fallback when no estimate is provided
# Stage 5 synthesis chunking — max moments per LLM call before splitting
synthesis_chunk_size: int = 30
# Embedding endpoint # Embedding endpoint
embedding_api_url: str = "http://localhost:11434/v1" embedding_api_url: str = "http://localhost:11434/v1"
embedding_model: str = "nomic-embed-text" embedding_model: str = "nomic-embed-text"

View file

@ -91,6 +91,10 @@ class SynthesizedPage(BaseModel):
default="mixed", default="mixed",
description="One of: structured, mixed, unstructured", description="One of: structured, mixed, unstructured",
) )
moment_indices: list[int] = Field(
default_factory=list,
description="Indices of source moments (from the input list) that this page covers",
)
class SynthesisResult(BaseModel): class SynthesisResult(BaseModel):

View file

@ -643,7 +643,7 @@ def stage4_classification(self, video_id: str, run_id: str | None = None) -> str
classification_data.append({ classification_data.append({
"moment_id": str(moment.id), "moment_id": str(moment.id),
"topic_category": cls.topic_category, "topic_category": cls.topic_category.strip().title(),
"topic_tags": cls.topic_tags, "topic_tags": cls.topic_tags,
}) })
@ -784,6 +784,196 @@ def _capture_pipeline_metadata() -> dict:
# ── Stage 5: Synthesis ─────────────────────────────────────────────────────── # ── Stage 5: Synthesis ───────────────────────────────────────────────────────
def _compute_page_tags(
moment_indices: list[int],
moment_group: list[tuple],
all_tags: set[str],
) -> list[str] | None:
"""Compute tags for a specific page from its linked moment indices.
If moment_indices are available, collects tags only from those moments.
Falls back to all_tags for the category group if no indices provided.
"""
if not moment_indices:
return list(all_tags) if all_tags else None
page_tags: set[str] = set()
for idx in moment_indices:
if 0 <= idx < len(moment_group):
_, cls_info = moment_group[idx]
page_tags.update(cls_info.get("topic_tags", []))
return list(page_tags) if page_tags else None
def _build_moments_text(
    moment_group: list[tuple[KeyMoment, dict]],
    category: str,
) -> tuple[str, set[str]]:
    """Render a category group of moments into the synthesis prompt text.

    Each moment is numbered with its position in the input list so the LLM
    can reference it back via ``moment_indices``. The union of all topic
    tags seen across the group is accumulated as a side product.

    Returns:
        ``(moments_text, all_tags)`` — the blank-line-separated prompt block
        and the tag union for the group.
    """
    collected_tags: set[str] = set()
    rendered: list[str] = []
    for index, (moment, cls_info) in enumerate(moment_group):
        moment_tags = cls_info.get("topic_tags", [])
        collected_tags.update(moment_tags)
        plugin_list = ", ".join(moment.plugins) if moment.plugins else "none"
        tag_list = ", ".join(moment_tags) if moment_tags else "none"
        excerpt = (moment.raw_transcript or "")[:300]
        rendered.append(
            f"[{index}] Title: {moment.title}\n"
            f" Summary: {moment.summary}\n"
            f" Content type: {moment.content_type.value}\n"
            f" Time: {moment.start_time:.1f}s - {moment.end_time:.1f}s\n"
            f" Plugins: {plugin_list}\n"
            f" Category: {category}\n"
            f" Tags: {tag_list}\n"
            f" Transcript excerpt: {excerpt}"
        )
    return "\n\n".join(rendered), collected_tags
def _synthesize_chunk(
    chunk: list[tuple[KeyMoment, dict]],
    category: str,
    creator_name: str,
    system_prompt: str,
    llm: LLMClient,
    model_override: str | None,
    modality: str,
    hard_limit: int,
    video_id: str,
    run_id: str | None,
    chunk_label: str,
) -> SynthesisResult:
    """Run a single synthesis LLM call for a chunk of moments.

    Builds the user prompt from the chunk, estimates an output-token budget,
    invokes the LLM with an audit callback, and parses the raw response into
    a ``SynthesisResult`` via ``_safe_parse_llm_response``.

    Args:
        chunk: ``(KeyMoment, classification_dict)`` tuples to synthesize.
        category: Topic category shared by all moments in the chunk.
        creator_name: Creator name injected into the prompt.
        system_prompt: Stage-5 synthesis system prompt text.
        llm: LLM client to call.
        model_override: Optional model name overriding the stage default.
        modality: Modality string forwarded to the LLM client.
        hard_limit: Ceiling for the dynamic max_tokens estimate.
        video_id: Video being processed (for the logging callback).
        run_id: Optional pipeline run id (for the logging callback).
        chunk_label: Human-readable label for logs/callbacks.

    Returns:
        The parsed SynthesisResult.
    """
    moments_text, _ = _build_moments_text(chunk, category)
    user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_text}\n</moments>"
    # This is the output-token budget (passed as max_tokens below), not an
    # input size — named accordingly.
    max_tokens = estimate_max_tokens(
        system_prompt, user_prompt, stage="stage5_synthesis", hard_limit=hard_limit,
    )
    # Separator added between label and count — "%s%d" rendered them fused
    # (e.g. "chunk:1/330 moments").
    logger.info(
        "Stage 5: Synthesizing %s — %d moments, max_tokens=%d",
        chunk_label, len(chunk), max_tokens,
    )
    raw = llm.complete(
        system_prompt, user_prompt, response_model=SynthesisResult,
        on_complete=_make_llm_callback(
            video_id, "stage5_synthesis",
            system_prompt=system_prompt, user_prompt=user_prompt,
            run_id=run_id, context_label=chunk_label,
        ),
        modality=modality, model_override=model_override, max_tokens=max_tokens,
    )
    return _safe_parse_llm_response(
        raw, SynthesisResult, llm, system_prompt, user_prompt,
        modality=modality, model_override=model_override, max_tokens=max_tokens,
    )
def _slug_base(slug: str) -> str:
"""Extract the slug prefix before the creator name suffix for merge grouping.
E.g. 'wavetable-sound-design-copycatt' 'wavetable-sound-design'
Also normalizes casing.
"""
return slug.lower().strip()
def _merge_pages_by_slug(
    all_pages: list,
    creator_name: str,
    llm: LLMClient,
    model_override: str | None,
    modality: str,
    hard_limit: int,
    video_id: str,
    run_id: str | None,
) -> list:
    """Detect pages with the same slug across chunks and merge them via LLM.

    Pages with unique (normalized) slugs pass through unchanged. Pages
    sharing a slug are serialized to JSON and sent to a dedicated merge
    prompt that combines them into one cohesive page. If the merge call
    returns no pages, the partial pages are kept rather than dropped.

    Args:
        all_pages: SynthesizedPage objects produced by the per-chunk calls.
        creator_name: Creator name injected into the merge prompt.
        llm: LLM client used for the merge calls.
        model_override: Optional model name overriding the stage default.
        modality: Modality string forwarded to the LLM client.
        hard_limit: Ceiling for the dynamic max_tokens estimate.
        video_id: Video being processed (for logging/callbacks).
        run_id: Optional pipeline run id (for logging/callbacks).

    Returns:
        The final list of SynthesizedPage objects.
    """
    # (Removed an unused local import of SynthesizedPage — nothing in this
    # body references the class directly.)

    # Group pages by normalized slug so casing variants merge together.
    by_slug: dict[str, list] = defaultdict(list)
    for page in all_pages:
        by_slug[_slug_base(page.slug)].append(page)
    final_pages = []
    for slug, pages_group in by_slug.items():
        if len(pages_group) == 1:
            # Unique slug — no merge needed
            final_pages.append(pages_group[0])
            continue
        # Multiple pages share this slug — merge via LLM
        logger.info(
            "Stage 5: Merging %d partial pages with slug '%s' for video_id=%s",
            len(pages_group), slug, video_id,
        )
        # Serialize partial pages to JSON for the merge prompt
        pages_json = json.dumps(
            [p.model_dump() for p in pages_group],
            indent=2, ensure_ascii=False,
        )
        merge_system_prompt = _load_prompt("stage5_merge.txt")
        merge_user_prompt = f"<creator>{creator_name}</creator>\n<pages>\n{pages_json}\n</pages>"
        max_tokens = estimate_max_tokens(
            merge_system_prompt, merge_user_prompt,
            stage="stage5_synthesis", hard_limit=hard_limit,
        )
        # Separator added between slug and count — "'%s'%d" rendered them
        # fused in the log line.
        logger.info(
            "Stage 5: Merge call for slug '%s' — %d partial pages, max_tokens=%d",
            slug, len(pages_group), max_tokens,
        )
        raw = llm.complete(
            merge_system_prompt, merge_user_prompt,
            response_model=SynthesisResult,
            on_complete=_make_llm_callback(
                video_id, "stage5_synthesis",
                system_prompt=merge_system_prompt,
                user_prompt=merge_user_prompt,
                run_id=run_id, context_label=f"merge:{slug}",
            ),
            modality=modality, model_override=model_override,
            max_tokens=max_tokens,
        )
        merge_result = _safe_parse_llm_response(
            raw, SynthesisResult, llm,
            merge_system_prompt, merge_user_prompt,
            modality=modality, model_override=model_override,
            max_tokens=max_tokens,
        )
        if merge_result.pages:
            final_pages.extend(merge_result.pages)
            logger.info(
                "Stage 5: Merge produced %d page(s) for slug '%s'",
                len(merge_result.pages), slug,
            )
        else:
            # Merge returned nothing — fall back to keeping the partials
            logger.warning(
                "Stage 5: Merge returned 0 pages for slug '%s', keeping %d partials",
                slug, len(pages_group),
            )
            final_pages.extend(pages_group)
    return final_pages
@celery_app.task(bind=True, max_retries=3, default_retry_delay=30) @celery_app.task(bind=True, max_retries=3, default_retry_delay=30)
def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str: def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
"""Synthesize technique pages from classified key moments. """Synthesize technique pages from classified key moments.
@ -792,7 +982,11 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
each group into a TechniquePage, creates/updates page rows, and links each group into a TechniquePage, creates/updates page rows, and links
KeyMoments to their TechniquePage. KeyMoments to their TechniquePage.
Sets processing_status to 'published'. For large category groups (exceeding synthesis_chunk_size), moments are
split into chronological chunks, synthesized independently, then pages
with matching slugs are merged via a dedicated merge LLM call.
Sets processing_status to 'complete'.
Returns the video_id for chain compatibility. Returns the video_id for chain compatibility.
""" """
@ -801,6 +995,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
_emit_event(video_id, "stage5_synthesis", "start", run_id=run_id) _emit_event(video_id, "stage5_synthesis", "start", run_id=run_id)
settings = get_settings() settings = get_settings()
chunk_size = settings.synthesis_chunk_size
session = _get_sync_session() session = _get_sync_session()
try: try:
# Load video and moments # Load video and moments
@ -833,77 +1028,115 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
cls_by_moment_id = {c["moment_id"]: c for c in classification_data} cls_by_moment_id = {c["moment_id"]: c for c in classification_data}
# Group moments by topic_category (from classification) # Group moments by topic_category (from classification)
# Normalize category casing to prevent near-duplicate groups
# (e.g., "Sound design" vs "Sound Design")
groups: dict[str, list[tuple[KeyMoment, dict]]] = defaultdict(list) groups: dict[str, list[tuple[KeyMoment, dict]]] = defaultdict(list)
for moment in moments: for moment in moments:
cls_info = cls_by_moment_id.get(str(moment.id), {}) cls_info = cls_by_moment_id.get(str(moment.id), {})
category = cls_info.get("topic_category", "Uncategorized") category = cls_info.get("topic_category", "Uncategorized").strip().title()
groups[category].append((moment, cls_info)) groups[category].append((moment, cls_info))
system_prompt = _load_prompt("stage5_synthesis.txt") system_prompt = _load_prompt("stage5_synthesis.txt")
llm = _get_llm_client() llm = _get_llm_client()
model_override, modality = _get_stage_config(5) model_override, modality = _get_stage_config(5)
hard_limit = get_settings().llm_max_tokens_hard_limit hard_limit = settings.llm_max_tokens_hard_limit
logger.info("Stage 5 using model=%s, modality=%s", model_override or "default", modality) logger.info("Stage 5 using model=%s, modality=%s", model_override or "default", modality)
pages_created = 0 pages_created = 0
for category, moment_group in groups.items(): for category, moment_group in groups.items():
# Build moments text for the LLM # Collect all tags across the full group (used for DB writes later)
moments_lines = []
all_tags: set[str] = set() all_tags: set[str] = set()
for i, (m, cls_info) in enumerate(moment_group): for _, cls_info in moment_group:
tags = cls_info.get("topic_tags", []) all_tags.update(cls_info.get("topic_tags", []))
all_tags.update(tags)
moments_lines.append( # ── Chunked synthesis ────────────────────────────────────
f"[{i}] Title: {m.title}\n" if len(moment_group) <= chunk_size:
f" Summary: {m.summary}\n" # Small group — single LLM call (original behavior)
f" Content type: {m.content_type.value}\n" result = _synthesize_chunk(
f" Time: {m.start_time:.1f}s - {m.end_time:.1f}s\n" moment_group, category, creator_name,
f" Plugins: {', '.join(m.plugins) if m.plugins else 'none'}\n" system_prompt, llm, model_override, modality, hard_limit,
f" Category: {category}\n" video_id, run_id, f"category:{category}",
f" Tags: {', '.join(tags) if tags else 'none'}\n" )
f" Transcript excerpt: {(m.raw_transcript or '')[:300]}" synthesized_pages = list(result.pages)
logger.info(
"Stage 5: category '%s'%d moments, %d page(s) from single call",
category, len(moment_group), len(synthesized_pages),
)
else:
# Large group — split into chunks, synthesize each, then merge
num_chunks = (len(moment_group) + chunk_size - 1) // chunk_size
logger.info(
"Stage 5: category '%s' has %d moments — splitting into %d chunks of ≤%d",
category, len(moment_group), num_chunks, chunk_size,
) )
moments_text = "\n\n".join(moments_lines)
user_prompt = f"<creator>{creator_name}</creator>\n<moments>\n{moments_text}\n</moments>" chunk_pages = []
for chunk_idx in range(num_chunks):
chunk_start = chunk_idx * chunk_size
chunk_end = min(chunk_start + chunk_size, len(moment_group))
chunk = moment_group[chunk_start:chunk_end]
chunk_label = f"category:{category} chunk:{chunk_idx + 1}/{num_chunks}"
max_tokens = estimate_max_tokens(system_prompt, user_prompt, stage="stage5_synthesis", hard_limit=hard_limit) result = _synthesize_chunk(
raw = llm.complete(system_prompt, user_prompt, response_model=SynthesisResult, on_complete=_make_llm_callback(video_id, "stage5_synthesis", system_prompt=system_prompt, user_prompt=user_prompt, run_id=run_id, context_label=category), chunk, category, creator_name,
modality=modality, model_override=model_override, max_tokens=max_tokens) system_prompt, llm, model_override, modality, hard_limit,
result = _safe_parse_llm_response(raw, SynthesisResult, llm, system_prompt, user_prompt, video_id, run_id, chunk_label,
modality=modality, model_override=model_override, max_tokens=max_tokens) )
chunk_pages.extend(result.pages)
logger.info(
"Stage 5: %s produced %d page(s)",
chunk_label, len(result.pages),
)
# Merge pages with matching slugs across chunks
logger.info(
"Stage 5: category '%s'%d total pages from %d chunks, checking for merges",
category, len(chunk_pages), num_chunks,
)
synthesized_pages = _merge_pages_by_slug(
chunk_pages, creator_name,
llm, model_override, modality, hard_limit,
video_id, run_id,
)
logger.info(
"Stage 5: category '%s'%d final page(s) after merge",
category, len(synthesized_pages),
)
# ── Persist pages to DB ──────────────────────────────────
# Load prior pages from this video (snapshot taken before pipeline reset) # Load prior pages from this video (snapshot taken before pipeline reset)
prior_page_ids = _load_prior_pages(video_id) prior_page_ids = _load_prior_pages(video_id)
# Create/update TechniquePage rows for page_data in synthesized_pages:
for page_data in result.pages:
existing = None existing = None
# First: check prior pages from this video by creator + category # First: check by slug (most specific match)
if prior_page_ids: if existing is None:
existing = session.execute(
select(TechniquePage).where(TechniquePage.slug == page_data.slug)
).scalar_one_or_none()
# Fallback: check prior pages from this video by creator + category
# Use .first() since multiple pages may share a category
if existing is None and prior_page_ids:
existing = session.execute( existing = session.execute(
select(TechniquePage).where( select(TechniquePage).where(
TechniquePage.id.in_(prior_page_ids), TechniquePage.id.in_(prior_page_ids),
TechniquePage.creator_id == video.creator_id, TechniquePage.creator_id == video.creator_id,
TechniquePage.topic_category == (page_data.topic_category or category), func.lower(TechniquePage.topic_category) == func.lower(page_data.topic_category or category),
) )
).scalar_one_or_none() ).scalars().first()
if existing: if existing:
logger.info( logger.info(
"Stage 5: Matched prior page '%s' (id=%s) by creator+category for video_id=%s", "Stage 5: Matched prior page '%s' (id=%s) by creator+category for video_id=%s",
existing.slug, existing.id, video_id, existing.slug, existing.id, video_id,
) )
# Fallback: check by slug (handles cross-video dedup)
if existing is None:
existing = session.execute(
select(TechniquePage).where(TechniquePage.slug == page_data.slug)
).scalar_one_or_none()
if existing: if existing:
# Snapshot existing content before overwriting # Snapshot existing content before overwriting
try: try:
sq = existing.source_quality
sq_value = sq.value if hasattr(sq, 'value') else sq
snapshot = { snapshot = {
"title": existing.title, "title": existing.title,
"slug": existing.slug, "slug": existing.slug,
@ -913,7 +1146,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
"body_sections": existing.body_sections, "body_sections": existing.body_sections,
"signal_chains": existing.signal_chains, "signal_chains": existing.signal_chains,
"plugins": existing.plugins, "plugins": existing.plugins,
"source_quality": existing.source_quality.value if existing.source_quality else None, "source_quality": sq_value,
} }
version_count = session.execute( version_count = session.execute(
select(func.count()).where( select(func.count()).where(
@ -946,7 +1179,8 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
existing.body_sections = page_data.body_sections existing.body_sections = page_data.body_sections
existing.signal_chains = page_data.signal_chains existing.signal_chains = page_data.signal_chains
existing.plugins = page_data.plugins if page_data.plugins else None existing.plugins = page_data.plugins if page_data.plugins else None
existing.topic_tags = list(all_tags) if all_tags else None page_tags = _compute_page_tags(page_moment_indices, moment_group, all_tags)
existing.topic_tags = page_tags
existing.source_quality = page_data.source_quality existing.source_quality = page_data.source_quality
page = existing page = existing
else: else:
@ -955,7 +1189,7 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
title=page_data.title, title=page_data.title,
slug=page_data.slug, slug=page_data.slug,
topic_category=page_data.topic_category or category, topic_category=page_data.topic_category or category,
topic_tags=list(all_tags) if all_tags else None, topic_tags=_compute_page_tags(page_moment_indices, moment_group, all_tags),
summary=page_data.summary, summary=page_data.summary,
body_sections=page_data.body_sections, body_sections=page_data.body_sections,
signal_chains=page_data.signal_chains, signal_chains=page_data.signal_chains,
@ -967,9 +1201,25 @@ def stage5_synthesis(self, video_id: str, run_id: str | None = None) -> str:
pages_created += 1 pages_created += 1
# Link moments to the technique page # Link moments to the technique page using moment_indices
for m, _ in moment_group: page_moment_indices = getattr(page_data, "moment_indices", None) or []
m.technique_page_id = page.id
if page_moment_indices:
# LLM specified which moments belong to this page
for idx in page_moment_indices:
if 0 <= idx < len(moment_group):
moment_group[idx][0].technique_page_id = page.id
elif len(synthesized_pages) == 1:
# Single page — link all moments (safe fallback)
for m, _ in moment_group:
m.technique_page_id = page.id
else:
# Multiple pages but no moment_indices — log warning
logger.warning(
"Stage 5: page '%s' has no moment_indices and is one of %d pages "
"for category '%s'. Moments will not be linked to this page.",
page_data.slug, len(synthesized_pages), category,
)
# Update processing_status # Update processing_status
video.processing_status = ProcessingStatus.complete video.processing_status = ProcessingStatus.complete

View file

@ -73,7 +73,7 @@ The creator name is provided in a <creator> tag. Key moments are provided inside
## Output format ## Output format
Return a JSON object with a single key "pages" containing a list of synthesized pages. Most inputs produce a single page, but if the moments clearly cover two distinctly separate techniques (e.g., moments about both "kick design" and "hi-hat design" that happen to share a topic_category), split them into separate pages. Return a JSON object with a single key "pages" containing a list of synthesized pages. Most inputs produce a single page, but if the moments clearly cover two distinctly separate techniques (e.g., moments about both "kick design" and "hi-hat design" that happen to share a topic_category), split them into separate pages. When splitting, you MUST assign each moment to exactly one page via the moment_indices field — every input moment index must appear in exactly one page's moment_indices array.
```json ```json
{ {
@ -100,7 +100,8 @@ Return a JSON object with a single key "pages" containing a list of synthesized
} }
], ],
"plugins": ["Vital", "Kilohearts Transient Shaper", "FabFilter Pro-Q 3", "iZotope Trash 2"], "plugins": ["Vital", "Kilohearts Transient Shaper", "FabFilter Pro-Q 3", "iZotope Trash 2"],
"source_quality": "structured" "source_quality": "structured",
"moment_indices": [0, 1, 2, 3, 4]
} }
] ]
} }
@ -117,6 +118,7 @@ Return a JSON object with a single key "pages" containing a list of synthesized
- **signal_chains**: Array of signal chain objects. Each has a "name" (what this chain is for) and "steps" (ordered list of stages with plugin names, settings, and roles). Only include when explicitly demonstrated by the creator. Empty array if not applicable. - **signal_chains**: Array of signal chain objects. Each has a "name" (what this chain is for) and "steps" (ordered list of stages with plugin names, settings, and roles). Only include when explicitly demonstrated by the creator. Empty array if not applicable.
- **plugins**: Deduplicated array of all plugins, instruments, and specific tools mentioned across the moments. Use "<Manufacturer> <PluginName>" format consistently (e.g., "FabFilter Pro-Q 3" not "Pro-Q", "Xfer Serum" not just "Serum", "Valhalla VintageVerb" not "Valhalla reverb", "Kilohearts Disperser" not "Disperser"). Always include the manufacturer name for disambiguation. - **plugins**: Deduplicated array of all plugins, instruments, and specific tools mentioned across the moments. Use "<Manufacturer> <PluginName>" format consistently (e.g., "FabFilter Pro-Q 3" not "Pro-Q", "Xfer Serum" not just "Serum", "Valhalla VintageVerb" not "Valhalla reverb", "Kilohearts Disperser" not "Disperser"). Always include the manufacturer name for disambiguation.
- **source_quality**: One of "structured", "mixed", "unstructured". - **source_quality**: One of "structured", "mixed", "unstructured".
- **moment_indices**: Array of integer indices from the input moments list that this page covers. Every moment index must appear in exactly one page. If you produce a single page, include all indices. If you split into multiple pages, partition the indices so each moment is assigned to the page it most closely relates to. This field is required.
## Critical rules ## Critical rules