"""Pydantic schemas for pipeline stage inputs and outputs.

Stage 2 — Segmentation: groups transcript segments by topic.
Stage 3 — Extraction: extracts key moments from segments.
Stage 4 — Classification: classifies moments by category/tags.
Stage 5 — Synthesis: generates technique pages from classified moments.
"""

from __future__ import annotations

from pydantic import BaseModel, Field


# ── Stage 2: Segmentation ───────────────────────────────────────────────────


class TopicSegment(BaseModel):
    """A contiguous group of transcript segments sharing a topic."""

    # All four fields are required: none of them declares a default.
    # The index pair is documented as an inclusive range; no ordering check
    # (start_index <= end_index) is performed by this model.
    start_index: int = Field(
        description="First transcript segment index in this group",
    )
    end_index: int = Field(
        description="Last transcript segment index in this group (inclusive)",
    )
    topic_label: str = Field(
        description="Short label describing the topic",
    )
    summary: str = Field(
        description="Brief summary of what is discussed",
    )


class SegmentationResult(BaseModel):
    """Full output of stage 2 (segmentation)."""

    # Required; an empty list is accepted, a missing key is not.
    segments: list[TopicSegment]


# ── Stage 3: Extraction ─────────────────────────────────────────────────────


class ExtractedMoment(BaseModel):
    """A single key moment extracted from a topic segment group."""

    # Required fields.
    title: str = Field(
        description="Concise title for the moment",
    )
    summary: str = Field(
        description="Detailed summary of the technique/concept",
    )
    start_time: float = Field(
        description="Start time in seconds",
    )
    end_time: float = Field(
        description="End time in seconds",
    )
    # Typed as a plain str: the values listed in the description are advisory
    # and are not enforced by validation.
    content_type: str = Field(
        description="One of: technique, settings, reasoning, workflow",
    )

    # Optional fields. default_factory builds a fresh list per instance.
    plugins: list[str] = Field(
        default_factory=list,
        description="Plugins/tools mentioned",
    )
    raw_transcript: str = Field(
        default="",
        description="Raw transcript text for this moment",
    )


class ExtractionResult(BaseModel):
    """Full output of stage 3 (extraction)."""

    # Required; ClassifiedMoment.moment_index is described as indexing into
    # this list.
    moments: list[ExtractedMoment]


# ── Stage 4: Classification ─────────────────────────────────────────────────


class ClassifiedMoment(BaseModel):
    """Classification metadata for a single extracted moment."""

    # Required fields. moment_index is not bounds-checked by this model.
    moment_index: int = Field(
        description="Index into ExtractionResult.moments",
    )
    topic_category: str = Field(
        description="High-level topic category",
    )

    # Optional fields.
    topic_tags: list[str] = Field(
        default_factory=list,
        description="Specific topic tags",
    )
    # None (the default) means no override was supplied.
    content_type_override: str | None = Field(
        default=None,
        description="Override for content_type if classification disagrees with extraction",
    )


class ClassificationResult(BaseModel):
    """Full output of stage 4 (classification)."""

    # Required list of per-moment classification records.
    classifications: list[ClassifiedMoment]


# ── Stage 5: Synthesis ───────────────────────────────────────────────────────


class SynthesizedPage(BaseModel):
    """A technique page synthesized from classified moments."""

    # Required fields.
    title: str = Field(
        description="Page title",
    )
    slug: str = Field(
        description="URL-safe slug",
    )
    topic_category: str = Field(
        description="Primary topic category",
    )
    summary: str = Field(
        description="Page summary / overview paragraph",
    )

    # Optional fields; each container default is created per instance.
    topic_tags: list[str] = Field(
        default_factory=list,
        description="Associated tags",
    )
    # body_sections and signal_chains use bare dict types, so the shape of
    # their contents is not validated by this model.
    body_sections: dict = Field(
        default_factory=dict,
        description="Structured body content as section_name -> content mapping",
    )
    signal_chains: list[dict] = Field(
        default_factory=list,
        description="Signal chain descriptions (for audio/music production contexts)",
    )
    plugins: list[str] = Field(
        default_factory=list,
        description="Plugins/tools referenced",
    )
    # Plain str with a default; the values listed in the description are not
    # enforced by validation.
    source_quality: str = Field(
        default="mixed",
        description="One of: structured, mixed, unstructured",
    )


class SynthesisResult(BaseModel):
    """Full output of stage 5 (synthesis)."""

    # Required list of synthesized pages.
    pages: list[SynthesizedPage]