chrysopedia/backend/models.py
jlightner 4b0914b12b fix: restore complete project tree from ub01 canonical state
Auto-mode commit 7aa33cd accidentally deleted 78 files (14,814 lines) during M005
execution. Subsequent commits rebuilt some frontend files but backend/, alembic/,
tests/, whisper/, docker configs, and prompts were never restored in this repo.

This commit restores the full project tree by syncing from ub01's working directory,
which has all M001-M007 features running in production containers.

Restored: backend/ (config, models, routers, database, redis, search_service, worker),
alembic/ (6 migrations), docker/ (Dockerfiles, nginx, compose), prompts/ (4 stages),
tests/, whisper/, README.md, .env.example, chrysopedia-spec.md
2026-03-31 02:10:41 +00:00

419 lines
16 KiB
Python

"""SQLAlchemy ORM models for the Chrysopedia knowledge base.
Seven entities matching chrysopedia-spec.md §6.1:
Creator, SourceVideo, TranscriptSegment, KeyMoment,
TechniquePage, RelatedTechniqueLink, Tag
"""
from __future__ import annotations
import enum
import uuid
from datetime import datetime, timezone
from sqlalchemy import (
Enum,
Float,
ForeignKey,
Integer,
String,
Text,
UniqueConstraint,
func,
)
from sqlalchemy.dialects.postgresql import ARRAY, JSONB, UUID
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm import relationship as sa_relationship
from database import Base
# ── Enums ────────────────────────────────────────────────────────────────────
class ContentType(str, enum.Enum):
    """Kind of content a source video contains.

    Used as the type of ``SourceVideo.content_type``. The ``str`` mixin makes
    every member compare equal to its plain string value, and each member's
    name is the same as its value.
    """

    tutorial = "tutorial"
    livestream = "livestream"
    breakdown = "breakdown"
    short_form = "short_form"
class ProcessingStatus(str, enum.Enum):
    """Where a source video currently stands in the processing pipeline.

    Used as the type of ``SourceVideo.processing_status``, whose default is
    ``pending``. Members are plain strings (``str`` mixin) with name == value.
    """

    # NOTE(review): the declaration order presumably mirrors the order in which
    # the pipeline advances a video — confirm against the pipeline code.
    pending = "pending"
    transcribed = "transcribed"
    extracted = "extracted"
    reviewed = "reviewed"
    published = "published"
class KeyMomentContentType(str, enum.Enum):
    """Classification of what a key moment is about.

    Used as the type of ``KeyMoment.content_type``. Members are plain strings
    (``str`` mixin) and each member's name equals its value.
    """

    technique = "technique"
    settings = "settings"
    reasoning = "reasoning"
    workflow = "workflow"
class ReviewStatus(str, enum.Enum):
    """Outcome of human review for a key moment.

    Used as the type of ``KeyMoment.review_status``, whose default is
    ``pending``. Members are plain strings (``str`` mixin) with name == value.
    """

    pending = "pending"
    approved = "approved"
    edited = "edited"
    rejected = "rejected"
class SourceQuality(str, enum.Enum):
    """Derived quality rating of the source material behind a technique page.

    Used as the type of the nullable ``TechniquePage.source_quality`` column.
    Members are plain strings (``str`` mixin) with name == value.
    """

    structured = "structured"
    mixed = "mixed"
    unstructured = "unstructured"
class PageReviewStatus(str, enum.Enum):
    """Review lifecycle state of a technique page.

    Used as the type of ``TechniquePage.review_status``, whose default is
    ``draft``. Members are plain strings (``str`` mixin) with name == value.
    """

    draft = "draft"
    reviewed = "reviewed"
    published = "published"
class RelationshipType(str, enum.Enum):
    """Kind of link that can connect two technique pages.

    Used as the type of ``RelatedTechniqueLink.relationship``. Members are
    plain strings (``str`` mixin) and each member's name equals its value.
    """

    same_technique_other_creator = "same_technique_other_creator"
    same_creator_adjacent = "same_creator_adjacent"
    general_cross_reference = "general_cross_reference"
# ── Helpers ──────────────────────────────────────────────────────────────────
def _uuid_pk() -> Mapped[uuid.UUID]:
    """Build a fresh UUID primary-key column for a model.

    Every call returns a new column object, so each model gets its own ``id``.
    The column carries both a client-side default (``uuid.uuid4``) and a
    server-side default (``gen_random_uuid()``).
    """
    pk_type = UUID(as_uuid=True)
    return mapped_column(
        pk_type,
        primary_key=True,
        server_default=func.gen_random_uuid(),
        default=uuid.uuid4,
    )
def _now() -> datetime:
"""Return current UTC time as a naive datetime (no tzinfo).
PostgreSQL TIMESTAMP WITHOUT TIME ZONE columns require naive datetimes.
asyncpg rejects timezone-aware datetimes for such columns.
"""
return datetime.now(timezone.utc).replace(tzinfo=None)
# ── Models ───────────────────────────────────────────────────────────────────
class Creator(Base):
    """A content creator; parent of source videos and technique pages."""

    __tablename__ = "creators"

    # ── columns ──
    id: Mapped[uuid.UUID] = _uuid_pk()
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    slug: Mapped[str] = mapped_column(String(255), nullable=False, unique=True)
    genres: Mapped[list[str] | None] = mapped_column(
        ARRAY(String),
        nullable=True,
    )
    folder_name: Mapped[str] = mapped_column(String(255), nullable=False)
    view_count: Mapped[int] = mapped_column(
        Integer,
        server_default="0",
        default=0,
    )

    # Timestamps: ``_now`` supplies the ORM-side value, ``now()`` the DB-side one.
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
    )
    updated_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
        onupdate=_now,
    )

    # ── relationships ──
    # NOTE(review): the child FKs declare ON DELETE CASCADE, but no ORM-level
    # cascade / passive_deletes is configured here — confirm that deleting a
    # Creator through the ORM behaves as intended.
    videos: Mapped[list[SourceVideo]] = sa_relationship(
        back_populates="creator",
    )
    technique_pages: Mapped[list[TechniquePage]] = sa_relationship(
        back_populates="creator",
    )
class SourceVideo(Base):
    """A single video file belonging to a creator, tracked through the pipeline.

    Parent of the video's transcript segments and key moments. Rows are removed
    by the database when the owning creator is deleted (ON DELETE CASCADE).
    """

    __tablename__ = "source_videos"

    id: Mapped[uuid.UUID] = _uuid_pk()
    creator_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("creators.id", ondelete="CASCADE"), nullable=False
    )
    filename: Mapped[str] = mapped_column(String(500), nullable=False)
    file_path: Mapped[str] = mapped_column(String(1000), nullable=False)
    # The column is nullable, so the annotation must admit None; previously it
    # was ``Mapped[int]``, which hid the None case from type checkers.
    duration_seconds: Mapped[int | None] = mapped_column(Integer, nullable=True)
    content_type: Mapped[ContentType] = mapped_column(
        Enum(ContentType, name="content_type", create_constraint=True),
        nullable=False,
    )
    transcript_path: Mapped[str | None] = mapped_column(String(1000), nullable=True)
    # Indexed for lookup by hash.
    # NOTE(review): 64 chars presumably holds a hex digest — confirm the algorithm.
    content_hash: Mapped[str | None] = mapped_column(
        String(64), nullable=True, index=True
    )
    processing_status: Mapped[ProcessingStatus] = mapped_column(
        Enum(ProcessingStatus, name="processing_status", create_constraint=True),
        default=ProcessingStatus.pending,
        server_default="pending",
    )
    created_at: Mapped[datetime] = mapped_column(
        default=_now, server_default=func.now()
    )
    updated_at: Mapped[datetime] = mapped_column(
        default=_now, server_default=func.now(), onupdate=_now
    )

    # relationships
    creator: Mapped[Creator] = sa_relationship(back_populates="videos")
    segments: Mapped[list[TranscriptSegment]] = sa_relationship(
        back_populates="source_video"
    )
    key_moments: Mapped[list[KeyMoment]] = sa_relationship(
        back_populates="source_video"
    )
class TranscriptSegment(Base):
    """One indexed piece of a source video's transcript, with its time span."""

    __tablename__ = "transcript_segments"

    id: Mapped[uuid.UUID] = _uuid_pk()
    source_video_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("source_videos.id", ondelete="CASCADE"),
        nullable=False,
    )

    # Span boundaries. NOTE(review): presumably seconds from the start of the
    # video — confirm against the transcription code.
    start_time: Mapped[float] = mapped_column(Float, nullable=False)
    end_time: Mapped[float] = mapped_column(Float, nullable=False)

    text: Mapped[str] = mapped_column(Text, nullable=False)
    segment_index: Mapped[int] = mapped_column(Integer, nullable=False)
    topic_label: Mapped[str | None] = mapped_column(
        String(255),
        nullable=True,
    )

    # ── relationships ──
    source_video: Mapped[SourceVideo] = sa_relationship(
        back_populates="segments",
    )
class KeyMoment(Base):
    """A titled, summarized time span within a source video.

    Each moment belongs to one source video and may optionally be attached to
    a technique page. The page FK uses ON DELETE SET NULL and is nullable.
    """

    __tablename__ = "key_moments"

    id: Mapped[uuid.UUID] = _uuid_pk()

    # ── parent references ──
    source_video_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("source_videos.id", ondelete="CASCADE"),
        nullable=False,
    )
    technique_page_id: Mapped[uuid.UUID | None] = mapped_column(
        ForeignKey("technique_pages.id", ondelete="SET NULL"),
        nullable=True,
    )

    # ── content ──
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    summary: Mapped[str] = mapped_column(Text, nullable=False)
    start_time: Mapped[float] = mapped_column(Float, nullable=False)
    end_time: Mapped[float] = mapped_column(Float, nullable=False)
    content_type: Mapped[KeyMomentContentType] = mapped_column(
        Enum(
            KeyMomentContentType,
            name="key_moment_content_type",
            create_constraint=True,
        ),
        nullable=False,
    )
    plugins: Mapped[list[str] | None] = mapped_column(
        ARRAY(String),
        nullable=True,
    )
    review_status: Mapped[ReviewStatus] = mapped_column(
        Enum(ReviewStatus, name="review_status", create_constraint=True),
        server_default="pending",
        default=ReviewStatus.pending,
    )
    raw_transcript: Mapped[str | None] = mapped_column(Text, nullable=True)

    # ── timestamps ──
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
    )
    updated_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
        onupdate=_now,
    )

    # ── relationships ──
    source_video: Mapped[SourceVideo] = sa_relationship(
        back_populates="key_moments",
    )
    technique_page: Mapped[TechniquePage | None] = sa_relationship(
        foreign_keys=[technique_page_id],
        back_populates="key_moments",
    )
class TechniquePage(Base):
    """A technique write-up owned by a creator.

    Collects the key moments attached to it, keeps version snapshots ordered
    by ``version_number``, and exposes links to other pages in both directions
    (``outgoing_links`` / ``incoming_links``).
    """

    __tablename__ = "technique_pages"

    id: Mapped[uuid.UUID] = _uuid_pk()
    creator_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("creators.id", ondelete="CASCADE"),
        nullable=False,
    )

    # ── identity and classification ──
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    slug: Mapped[str] = mapped_column(String(500), nullable=False, unique=True)
    topic_category: Mapped[str] = mapped_column(String(255), nullable=False)
    topic_tags: Mapped[list[str] | None] = mapped_column(
        ARRAY(String),
        nullable=True,
    )

    # ── page content ──
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    body_sections: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
    signal_chains: Mapped[list | None] = mapped_column(JSONB, nullable=True)
    plugins: Mapped[list[str] | None] = mapped_column(
        ARRAY(String),
        nullable=True,
    )
    source_quality: Mapped[SourceQuality | None] = mapped_column(
        Enum(SourceQuality, name="source_quality", create_constraint=True),
        nullable=True,
    )

    # ── counters and status ──
    view_count: Mapped[int] = mapped_column(
        Integer,
        server_default="0",
        default=0,
    )
    review_status: Mapped[PageReviewStatus] = mapped_column(
        Enum(PageReviewStatus, name="page_review_status", create_constraint=True),
        server_default="draft",
        default=PageReviewStatus.draft,
    )

    # ── timestamps ──
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
    )
    updated_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
        onupdate=_now,
    )

    # ── relationships ──
    creator: Mapped[Creator] = sa_relationship(
        back_populates="technique_pages",
    )
    key_moments: Mapped[list[KeyMoment]] = sa_relationship(
        foreign_keys=[KeyMoment.technique_page_id],
        back_populates="technique_page",
    )
    versions: Mapped[list[TechniquePageVersion]] = sa_relationship(
        order_by="TechniquePageVersion.version_number",
        back_populates="technique_page",
    )
    # RelatedTechniqueLink has two FKs to this table, so each side names the
    # one it joins on.
    outgoing_links: Mapped[list[RelatedTechniqueLink]] = sa_relationship(
        back_populates="source_page",
        foreign_keys="RelatedTechniqueLink.source_page_id",
    )
    incoming_links: Mapped[list[RelatedTechniqueLink]] = sa_relationship(
        back_populates="target_page",
        foreign_keys="RelatedTechniqueLink.target_page_id",
    )
class RelatedTechniqueLink(Base):
    """Directed, typed link from one technique page to another.

    The (source_page_id, target_page_id, relationship) triple is unique.
    Inside this class body the name ``relationship`` is the column below, so
    the ORM relationship function is used through its ``sa_relationship`` alias.
    """

    __tablename__ = "related_technique_links"
    __table_args__ = (
        UniqueConstraint(
            "source_page_id",
            "target_page_id",
            "relationship",
            name="uq_technique_link",
        ),
    )

    id: Mapped[uuid.UUID] = _uuid_pk()

    # Both ends point at technique_pages and are removed with the page.
    source_page_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("technique_pages.id", ondelete="CASCADE"),
        nullable=False,
    )
    target_page_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("technique_pages.id", ondelete="CASCADE"),
        nullable=False,
    )
    relationship: Mapped[RelationshipType] = mapped_column(
        Enum(RelationshipType, name="relationship_type", create_constraint=True),
        nullable=False,
    )

    # ── relationships ──
    source_page: Mapped[TechniquePage] = sa_relationship(
        back_populates="outgoing_links",
        foreign_keys=[source_page_id],
    )
    target_page: Mapped[TechniquePage] = sa_relationship(
        back_populates="incoming_links",
        foreign_keys=[target_page_id],
    )
class TechniquePageVersion(Base):
    """Snapshot of a TechniquePage taken before a pipeline re-synthesis overwrites it.

    Rows are removed by the database together with their technique page
    (ON DELETE CASCADE).
    """

    __tablename__ = "technique_page_versions"

    id: Mapped[uuid.UUID] = _uuid_pk()
    technique_page_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("technique_pages.id", ondelete="CASCADE"),
        nullable=False,
    )
    # NOTE(review): no uniqueness constraint on (technique_page_id,
    # version_number) is declared here — confirm it is enforced elsewhere.
    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
    content_snapshot: Mapped[dict] = mapped_column(JSONB, nullable=False)
    pipeline_metadata: Mapped[dict | None] = mapped_column(
        JSONB,
        nullable=True,
    )
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
    )

    # ── relationships ──
    technique_page: Mapped[TechniquePage] = sa_relationship(
        back_populates="versions",
    )
class Tag(Base):
    """A uniquely named tag with a category and optional alias names."""

    __tablename__ = "tags"

    id: Mapped[uuid.UUID] = _uuid_pk()
    name: Mapped[str] = mapped_column(String(255), nullable=False, unique=True)
    category: Mapped[str] = mapped_column(String(255), nullable=False)
    aliases: Mapped[list[str] | None] = mapped_column(
        ARRAY(String),
        nullable=True,
    )
# ── Content Report Enums ─────────────────────────────────────────────────────
class ReportType(str, enum.Enum):
    """Category a user picks when submitting a content report.

    Used as the type of ``ContentReport.report_type``. Members are plain
    strings (``str`` mixin) and each member's name equals its value.
    """

    inaccurate = "inaccurate"
    missing_info = "missing_info"
    wrong_attribution = "wrong_attribution"
    formatting = "formatting"
    other = "other"
class ReportStatus(str, enum.Enum):
    """Triage state of a content report.

    Used as the type of ``ContentReport.status``, whose default is ``open``.
    Members are plain strings (``str`` mixin) with name == value.
    """

    # ``open`` rebinds the builtin name only inside this class namespace.
    open = "open"
    acknowledged = "acknowledged"
    resolved = "resolved"
    dismissed = "dismissed"
# ── Content Report ───────────────────────────────────────────────────────────
class ContentReport(Base):
    """A user-submitted report about a problem with some content.

    The target is identified generically by ``content_type`` plus
    ``content_id`` (technique_page, key_moment, creator, or general), so no
    database-level foreign key is declared on ``content_id``.
    """

    __tablename__ = "content_reports"

    id: Mapped[uuid.UUID] = _uuid_pk()

    # ── what is being reported ──
    content_type: Mapped[str] = mapped_column(
        String(50),
        nullable=False,
        doc="Entity type: technique_page, key_moment, creator, general",
    )
    content_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
        doc="FK to the reported entity (null for general reports)",
    )
    content_title: Mapped[str | None] = mapped_column(
        String(500),
        nullable=True,
        doc="Snapshot of entity title at report time",
    )

    # ── the report itself ──
    report_type: Mapped[ReportType] = mapped_column(
        Enum(ReportType, name="report_type", create_constraint=True),
        nullable=False,
    )
    description: Mapped[str] = mapped_column(Text, nullable=False)

    # ── triage ──
    status: Mapped[ReportStatus] = mapped_column(
        Enum(ReportStatus, name="report_status", create_constraint=True),
        server_default="open",
        default=ReportStatus.open,
    )
    admin_notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    page_url: Mapped[str | None] = mapped_column(
        String(1000),
        nullable=True,
        doc="URL the user was on when reporting",
    )

    # ── timestamps ──
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
    )
    # No default here: the column stays NULL unless a value is assigned.
    resolved_at: Mapped[datetime | None] = mapped_column(nullable=True)
# ── Pipeline Event ───────────────────────────────────────────────────────────
class PipelineEvent(Base):
    """Structured log record for one pipeline execution event.

    Stores per-stage start/complete/error/llm_call events together with token
    usage, plus optional response payloads and full prompt/response text for
    debugging.
    """

    __tablename__ = "pipeline_events"

    id: Mapped[uuid.UUID] = _uuid_pk()

    # Indexed, but declared without a ForeignKey.
    # NOTE(review): presumably refers to source_videos.id — confirm.
    video_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        nullable=False,
        index=True,
    )

    # ── event identity ──
    stage: Mapped[str] = mapped_column(
        String(50),
        nullable=False,
        doc="stage2_segmentation, stage3_extraction, etc.",
    )
    event_type: Mapped[str] = mapped_column(
        String(30),
        nullable=False,
        doc="start, complete, error, llm_call",
    )

    # ── LLM usage metrics (all optional) ──
    prompt_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    completion_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    model: Mapped[str | None] = mapped_column(String(100), nullable=True)
    duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)

    payload: Mapped[dict | None] = mapped_column(
        JSONB,
        nullable=True,
        doc="LLM response content, error details, stage metadata",
    )
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        default=_now,
    )

    # ── debug mode: full LLM I/O capture columns ──
    system_prompt_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_prompt_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    response_text: Mapped[str | None] = mapped_column(Text, nullable=True)