When the LLM split a category group into multiple technique pages, moments were blanket-linked to the last page in the loop, leaving all other pages as orphans with 0 key moments (48 out of 204 pages affected). Added a moment_indices field to the SynthesizedPage schema and to the synthesis prompt so the LLM explicitly declares which input moments each page covers. Stage 5 now uses these indices for targeted linking instead of the broken blanket approach. Tags are also computed per page from linked moments only, fixing cross-contamination (e.g. a "stereo imaging" tag appearing on gain staging pages). Deleted 48 orphan technique pages from the database. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
103 lines
4.3 KiB
Python
103 lines
4.3 KiB
Python
"""Pydantic schemas for pipeline stage inputs and outputs.

Stage 2 — Segmentation: groups transcript segments by topic.
Stage 3 — Extraction: extracts key moments from segments.
Stage 4 — Classification: classifies moments by category/tags.
Stage 5 — Synthesis: generates technique pages from classified moments.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
# ── Stage 2: Segmentation ───────────────────────────────────────────────────
|
|
|
|
class TopicSegment(BaseModel):
    """One run of consecutive transcript segments that share a topic.

    The run is addressed by the indices of its first and last transcript
    segments; ``end_index`` is inclusive. All four fields are required.
    """

    # Bounds of the run, expressed as transcript segment indices.
    start_index: int = Field(
        ...,
        description="First transcript segment index in this group",
    )
    end_index: int = Field(
        ...,
        description="Last transcript segment index in this group (inclusive)",
    )

    # Human-readable description of what the run is about.
    topic_label: str = Field(
        ...,
        description="Short label describing the topic",
    )
    summary: str = Field(
        ...,
        description="Brief summary of what is discussed",
    )
|
|
|
|
|
|
class SegmentationResult(BaseModel):
    """Complete stage 2 (segmentation) output: the list of topic groups."""

    # Required; no default is provided.
    segments: list[TopicSegment]
|
|
|
|
|
|
# ── Stage 3: Extraction ─────────────────────────────────────────────────────
|
|
|
|
class ExtractedMoment(BaseModel):
    """One key moment pulled out of a topic segment group.

    ``title``, ``summary``, ``start_time``, ``end_time`` and ``content_type``
    are required; ``plugins`` and ``raw_transcript`` fall back to empty
    values when omitted.
    """

    # What the moment is about.
    title: str = Field(
        ...,
        description="Concise title for the moment",
    )
    summary: str = Field(
        ...,
        description="Detailed summary of the technique/concept",
    )

    # Time span of the moment, in seconds.
    start_time: float = Field(
        ...,
        description="Start time in seconds",
    )
    end_time: float = Field(
        ...,
        description="End time in seconds",
    )

    # Free-form string; the expected values are listed in the description
    # but are not enforced by the type.
    content_type: str = Field(
        ...,
        description="One of: technique, settings, reasoning, workflow",
    )

    # Optional extras: a fresh empty list / empty string when not supplied.
    plugins: list[str] = Field(
        default_factory=list,
        description="Plugins/tools mentioned",
    )
    raw_transcript: str = Field(
        default="",
        description="Raw transcript text for this moment",
    )
|
|
|
|
|
|
class ExtractionResult(BaseModel):
    """Complete stage 3 (extraction) output: the list of extracted moments."""

    # Required; no default is provided.
    moments: list[ExtractedMoment]
|
|
|
|
|
|
# ── Stage 4: Classification ─────────────────────────────────────────────────
|
|
|
|
class ClassifiedMoment(BaseModel):
    """Classification metadata attached to one extracted moment.

    The moment itself is not embedded; it is referenced by its position in
    ``ExtractionResult.moments`` via ``moment_index``.
    """

    # Back-reference to the moment being classified.
    moment_index: int = Field(
        ...,
        description="Index into ExtractionResult.moments",
    )

    # Category is required; tags default to a fresh empty list.
    topic_category: str = Field(
        ...,
        description="High-level topic category",
    )
    topic_tags: list[str] = Field(
        default_factory=list,
        description="Specific topic tags",
    )

    # None means no override was supplied.
    content_type_override: str | None = Field(
        default=None,
        description="Override for content_type if classification disagrees with extraction",
    )
|
|
|
|
|
|
class ClassificationResult(BaseModel):
    """Complete stage 4 (classification) output: one entry per classified moment."""

    # Required; no default is provided.
    classifications: list[ClassifiedMoment]
|
|
|
|
|
|
# ── Stage 5: Synthesis ───────────────────────────────────────────────────────
|
|
|
|
class SynthesizedPage(BaseModel):
    """A single technique page produced by synthesis from classified moments.

    ``title``, ``slug``, ``topic_category`` and ``summary`` are required.
    Every other field has a default, so a page that omits it still
    validates.
    """

    # Identity of the page.
    title: str = Field(
        ...,
        description="Page title",
    )
    slug: str = Field(
        ...,
        description="URL-safe slug",
    )

    # Topic metadata; tags default to a fresh empty list.
    topic_category: str = Field(
        ...,
        description="Primary topic category",
    )
    topic_tags: list[str] = Field(
        default_factory=list,
        description="Associated tags",
    )

    # Page content. body_sections is an untyped dict, so key and value
    # types are not validated here.
    summary: str = Field(
        ...,
        description="Page summary / overview paragraph",
    )
    body_sections: dict = Field(
        default_factory=dict,
        description="Structured body content as section_name -> content mapping",
    )
    signal_chains: list[dict] = Field(
        default_factory=list,
        description="Signal chain descriptions (for audio/music production contexts)",
    )
    plugins: list[str] = Field(
        default_factory=list,
        description="Plugins/tools referenced",
    )

    # Free-form string defaulting to "mixed"; the expected values are
    # listed in the description but are not enforced by the type.
    source_quality: str = Field(
        default="mixed",
        description="One of: structured, mixed, unstructured",
    )

    # Which input moments this page covers, as positions in the input list.
    # Defaults to a fresh empty list when the page declares none.
    moment_indices: list[int] = Field(
        default_factory=list,
        description="Indices of source moments (from the input list) that this page covers",
    )
|
|
|
|
|
|
class SynthesisResult(BaseModel):
    """Complete stage 5 (synthesis) output: the list of synthesized pages."""

    # Required; no default is provided.
    pages: list[SynthesizedPage]
|