chrysopedia/backend/pipeline/schemas.py
jlightner 15dcab201a test: Added BodySection/BodySubSection schema models, changed Synthesiz…
- "backend/pipeline/schemas.py"
- "backend/pipeline/citation_utils.py"
- "backend/pipeline/test_citation_utils.py"

GSD-Task: S01/T01
2026-04-03 00:50:30 +00:00

125 lines
5.1 KiB
Python

"""Pydantic schemas for pipeline stage inputs and outputs.
Stage 2 — Segmentation: groups transcript segments by topic.
Stage 3 — Extraction: extracts key moments from segments.
Stage 4 — Classification: classifies moments by category/tags.
Stage 5 — Synthesis: generates technique pages from classified moments.
"""
from __future__ import annotations
from pydantic import BaseModel, Field
# ── Stage 2: Segmentation ───────────────────────────────────────────────────
class TopicSegment(BaseModel):
    """A contiguous group of transcript segments sharing a topic."""

    # The two indices bound the group; end_index is inclusive per its
    # description. This model does not check that start_index <= end_index.
    start_index: int = Field(
        description="First transcript segment index in this group",
    )
    end_index: int = Field(
        description="Last transcript segment index in this group (inclusive)",
    )

    # Required free-text fields describing the group.
    topic_label: str = Field(
        description="Short label describing the topic",
    )
    summary: str = Field(
        description="Brief summary of what is discussed",
    )
class SegmentationResult(BaseModel):
    """Full output of stage 2 (segmentation)."""

    # Declared without a default, so the list must always be supplied
    # (an empty list is allowed, but has to be passed explicitly).
    segments: list[TopicSegment]
# ── Stage 3: Extraction ─────────────────────────────────────────────────────
class ExtractedMoment(BaseModel):
    """A single key moment extracted from a topic segment group."""

    # Required descriptive text.
    title: str = Field(
        description="Concise title for the moment",
    )
    summary: str = Field(
        description="Detailed summary of the technique/concept",
    )

    # Time span of the moment, in seconds per the field descriptions.
    # This model does not check that end_time >= start_time.
    start_time: float = Field(
        description="Start time in seconds",
    )
    end_time: float = Field(
        description="End time in seconds",
    )

    # Plain str: the values named in the description are not enforced here.
    content_type: str = Field(
        description="One of: technique, settings, reasoning, workflow",
    )

    # Optional extras; each falls back to an empty value when omitted.
    plugins: list[str] = Field(
        description="Plugins/tools mentioned",
        default_factory=list,
    )
    raw_transcript: str = Field(
        description="Raw transcript text for this moment",
        default="",
    )
class ExtractionResult(BaseModel):
    """Full output of stage 3 (extraction)."""

    # Declared without a default, so the list must always be supplied.
    # ClassifiedMoment.moment_index is described as an index into this list.
    moments: list[ExtractedMoment]
# ── Stage 4: Classification ─────────────────────────────────────────────────
class ClassifiedMoment(BaseModel):
    """Classification metadata for a single extracted moment."""

    # Position of the classified moment within the stage 3 output list.
    moment_index: int = Field(
        description="Index into ExtractionResult.moments",
    )

    topic_category: str = Field(
        description="High-level topic category",
    )
    topic_tags: list[str] = Field(
        description="Specific topic tags",
        default_factory=list,
    )

    # Defaults to None; presumably None means the extracted content_type
    # stands unchanged (confirm against the stage 4 consumer).
    content_type_override: str | None = Field(
        description="Override for content_type if classification disagrees with extraction",
        default=None,
    )
class ClassificationResult(BaseModel):
    """Full output of stage 4 (classification)."""

    # Declared without a default, so the list must always be supplied.
    # Each entry names the moment it classifies via its moment_index field.
    classifications: list[ClassifiedMoment]
# ── Stage 5: Synthesis ───────────────────────────────────────────────────────
class BodySubSection(BaseModel):
    """An H3-level subsection within a body section."""

    # Both fields are required. The content is plain text in which "[N]"
    # citation markers may appear; the markers are not parsed by this model.
    heading: str = Field(
        description="H3 subsection heading",
    )
    content: str = Field(
        description="Subsection body text (may contain [N] citation markers)",
    )
class BodySection(BaseModel):
    """An H2-level section of a technique page body."""

    # Required heading and text. "[N]" citation markers may appear in the
    # content; they are not parsed by this model.
    heading: str = Field(
        description="H2 section heading",
    )
    content: str = Field(
        description="Section body text (may contain [N] citation markers)",
    )

    # Nesting stops one level down: a section holds BodySubSection items,
    # which themselves carry no further children. Empty when omitted.
    subsections: list[BodySubSection] = Field(
        description="Optional H3-level subsections",
        default_factory=list,
    )
class SynthesizedPage(BaseModel):
    """A technique page synthesized from classified moments."""

    # ── Identity and classification ──
    title: str = Field(
        description="Page title",
    )
    slug: str = Field(
        description="URL-safe slug",
    )
    topic_category: str = Field(
        description="Primary topic category",
    )
    topic_tags: list[str] = Field(
        description="Associated tags",
        default_factory=list,
    )

    # ── Content ──
    summary: str = Field(
        description="Page summary / overview paragraph",
    )
    body_sections: list[BodySection] = Field(
        description="Structured body content as H2 sections with optional H3 subsections",
        default_factory=list,
    )
    # Version tag carried alongside body_sections; "v2" unless overridden.
    body_sections_format: str = Field(
        description="Schema version for body_sections ('v2' = list[BodySection])",
        default="v2",
    )

    # ── Supporting metadata ──
    # Entries are untyped dicts: their keys and values are not constrained here.
    signal_chains: list[dict] = Field(
        description="Signal chain descriptions (for audio/music production contexts)",
        default_factory=list,
    )
    plugins: list[str] = Field(
        description="Plugins/tools referenced",
        default_factory=list,
    )
    # Plain str: the values named in the description are not enforced here.
    source_quality: str = Field(
        description="One of: structured, mixed, unstructured",
        default="mixed",
    )
    moment_indices: list[int] = Field(
        description="Indices of source moments (from the input list) that this page covers",
        default_factory=list,
    )
class SynthesisResult(BaseModel):
    """Full output of stage 5 (synthesis)."""

    # Declared without a default, so the list must always be supplied
    # (an empty list is allowed, but has to be passed explicitly).
    pages: list[SynthesizedPage]