- "backend/pipeline/schemas.py" - "backend/pipeline/citation_utils.py" - "backend/pipeline/test_citation_utils.py" GSD-Task: S01/T01
125 lines
5.1 KiB
Python
125 lines
5.1 KiB
Python
"""Pydantic schemas for pipeline stage inputs and outputs.

Stage 2 — Segmentation: groups transcript segments by topic.
Stage 3 — Extraction: extracts key moments from segments.
Stage 4 — Classification: classifies moments by category/tags.
Stage 5 — Synthesis: generates technique pages from classified moments.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
# ── Stage 2: Segmentation ───────────────────────────────────────────────────
|
|
|
|
class TopicSegment(BaseModel):
    """A contiguous group of transcript segments sharing a topic."""

    # Index bounds of the group. Both are required; the descriptions mark
    # the range as first..last with the end index inclusive. No ordering
    # between the two is validated by this schema.
    start_index: int = Field(
        description="First transcript segment index in this group",
    )
    end_index: int = Field(
        description="Last transcript segment index in this group (inclusive)",
    )

    # Free-text labelling of the group; both fields are required.
    topic_label: str = Field(
        description="Short label describing the topic",
    )
    summary: str = Field(
        description="Brief summary of what is discussed",
    )
|
|
|
|
|
|
class SegmentationResult(BaseModel):
    """Full output of stage 2 (segmentation)."""

    # Required field: no default is declared, so the key must be supplied
    # (an empty list still satisfies the annotation).
    segments: list[TopicSegment]
|
|
|
|
|
|
# ── Stage 3: Extraction ─────────────────────────────────────────────────────
|
|
|
|
class ExtractedMoment(BaseModel):
    """A single key moment extracted from a topic segment group."""

    # Required descriptive text.
    title: str = Field(
        description="Concise title for the moment",
    )
    summary: str = Field(
        description="Detailed summary of the technique/concept",
    )

    # Required time span, in seconds per the descriptions. This schema does
    # not check that end_time follows start_time.
    start_time: float = Field(
        description="Start time in seconds",
    )
    end_time: float = Field(
        description="End time in seconds",
    )

    # Typed as a plain str: the values listed in the description are
    # guidance only and are not enforced by the annotation.
    content_type: str = Field(
        description="One of: technique, settings, reasoning, workflow",
    )

    # Optional fields: a fresh empty list / empty string when omitted.
    plugins: list[str] = Field(
        description="Plugins/tools mentioned",
        default_factory=list,
    )
    raw_transcript: str = Field(
        description="Raw transcript text for this moment",
        default="",
    )
|
|
|
|
|
|
class ExtractionResult(BaseModel):
    """Full output of stage 3 (extraction)."""

    # Required field: no default is declared, so the key must be supplied
    # (an empty list still satisfies the annotation).
    moments: list[ExtractedMoment]
|
|
|
|
|
|
# ── Stage 4: Classification ─────────────────────────────────────────────────
|
|
|
|
class ClassifiedMoment(BaseModel):
    """Classification metadata for a single extracted moment."""

    # Required back-reference; per its description it indexes into
    # ExtractionResult.moments. Bounds are not checked by this schema.
    moment_index: int = Field(
        description="Index into ExtractionResult.moments",
    )

    # Required category plus optional finer-grained tags (empty list when
    # omitted).
    topic_category: str = Field(
        description="High-level topic category",
    )
    topic_tags: list[str] = Field(
        description="Specific topic tags",
        default_factory=list,
    )

    # None (the default) means no override was supplied.
    content_type_override: str | None = Field(
        description="Override for content_type if classification disagrees with extraction",
        default=None,
    )
|
|
|
|
|
|
class ClassificationResult(BaseModel):
    """Full output of stage 4 (classification)."""

    # Required field: no default is declared, so the key must be supplied
    # (an empty list still satisfies the annotation).
    classifications: list[ClassifiedMoment]
|
|
|
|
|
|
# ── Stage 5: Synthesis ───────────────────────────────────────────────────────
|
|
|
|
class BodySubSection(BaseModel):
    """An H3-level subsection within a body section."""

    # Both fields are required. The content is stored as plain text; any
    # [N] citation markers mentioned in the description are not parsed or
    # validated by this schema.
    heading: str = Field(
        description="H3 subsection heading",
    )
    content: str = Field(
        description="Subsection body text (may contain [N] citation markers)",
    )
|
|
|
|
|
|
class BodySection(BaseModel):
    """An H2-level section of a technique page body."""

    # Required heading and body text. The content is stored as plain text;
    # any [N] citation markers mentioned in the description are not parsed
    # or validated by this schema.
    heading: str = Field(
        description="H2 section heading",
    )
    content: str = Field(
        description="Section body text (may contain [N] citation markers)",
    )

    # Optional nesting: a fresh empty list when no subsections are given.
    subsections: list[BodySubSection] = Field(
        description="Optional H3-level subsections",
        default_factory=list,
    )
|
|
|
|
|
|
class SynthesizedPage(BaseModel):
    """A technique page synthesized from classified moments."""

    # ── Identity and categorisation ──
    # title, slug, topic_category and summary are required; the tag list
    # falls back to a fresh empty list. The slug is a plain str and its
    # URL-safety is not checked by this schema.
    title: str = Field(
        description="Page title",
    )
    slug: str = Field(
        description="URL-safe slug",
    )
    topic_category: str = Field(
        description="Primary topic category",
    )
    topic_tags: list[str] = Field(
        description="Associated tags",
        default_factory=list,
    )
    summary: str = Field(
        description="Page summary / overview paragraph",
    )

    # ── Body ──
    # The format marker travels with the body so consumers can tell which
    # layout body_sections uses; it defaults to "v2".
    body_sections: list[BodySection] = Field(
        description="Structured body content as H2 sections with optional H3 subsections",
        default_factory=list,
    )
    body_sections_format: str = Field(
        description="Schema version for body_sections ('v2' = list[BodySection])",
        default="v2",
    )

    # ── Supporting metadata ──
    # signal_chains entries are untyped dicts: their keys and values are
    # not constrained by this schema.
    signal_chains: list[dict] = Field(
        description="Signal chain descriptions (for audio/music production contexts)",
        default_factory=list,
    )
    plugins: list[str] = Field(
        description="Plugins/tools referenced",
        default_factory=list,
    )
    # Typed as a plain str: the values listed in the description are
    # guidance only and are not enforced by the annotation.
    source_quality: str = Field(
        description="One of: structured, mixed, unstructured",
        default="mixed",
    )
    # Back-references to the moments this page was built from; bounds are
    # not checked by this schema.
    moment_indices: list[int] = Field(
        description="Indices of source moments (from the input list) that this page covers",
        default_factory=list,
    )
|
|
|
|
|
|
class SynthesisResult(BaseModel):
    """Full output of stage 5 (synthesis)."""

    # Required field: no default is declared, so the key must be supplied
    # (an empty list still satisfies the annotation).
    pages: list[SynthesizedPage]
|