From 10cd1753331231e82d9d742a88af4306c7962bbf Mon Sep 17 00:00:00 2001 From: jlightner Date: Sat, 4 Apr 2026 08:24:44 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Added=20personality=5Fprofile=20JSONB?= =?UTF-8?q?=20column=20to=20Creator=20model=20with=20migr=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/models.py" - "backend/schemas.py" - "backend/routers/creators.py" - "alembic/versions/023_add_personality_profile.py" GSD-Task: S06/T01 --- .gsd/DECISIONS.md | 1 + .gsd/milestones/M022/M022-ROADMAP.md | 2 +- .../milestones/M022/slices/S05/S05-SUMMARY.md | 84 +++++++ .gsd/milestones/M022/slices/S05/S05-UAT.md | 49 ++++ .../M022/slices/S05/tasks/T02-VERIFY.json | 36 +++ .gsd/milestones/M022/slices/S06/S06-PLAN.md | 224 +++++++++++++++++- .../M022/slices/S06/S06-RESEARCH.md | 162 +++++++++++++ .../M022/slices/S06/tasks/T01-PLAN.md | 65 +++++ .../M022/slices/S06/tasks/T01-SUMMARY.md | 83 +++++++ .../M022/slices/S06/tasks/T02-PLAN.md | 119 ++++++++++ .../M022/slices/S06/tasks/T03-PLAN.md | 107 +++++++++ .../versions/023_add_personality_profile.py | 21 ++ backend/models.py | 1 + backend/routers/creators.py | 1 + backend/schemas.py | 1 + 15 files changed, 954 insertions(+), 2 deletions(-) create mode 100644 .gsd/milestones/M022/slices/S05/S05-SUMMARY.md create mode 100644 .gsd/milestones/M022/slices/S05/S05-UAT.md create mode 100644 .gsd/milestones/M022/slices/S05/tasks/T02-VERIFY.json create mode 100644 .gsd/milestones/M022/slices/S06/S06-RESEARCH.md create mode 100644 .gsd/milestones/M022/slices/S06/tasks/T01-PLAN.md create mode 100644 .gsd/milestones/M022/slices/S06/tasks/T01-SUMMARY.md create mode 100644 .gsd/milestones/M022/slices/S06/tasks/T02-PLAN.md create mode 100644 .gsd/milestones/M022/slices/S06/tasks/T03-PLAN.md create mode 100644 alembic/versions/023_add_personality_profile.py diff --git a/.gsd/DECISIONS.md b/.gsd/DECISIONS.md index 85be067..d3df628 100644 --- a/.gsd/DECISIONS.md +++ 
b/.gsd/DECISIONS.md @@ -46,3 +46,4 @@ | D038 | | infrastructure | Primary git remote for chrysopedia | git.xpltd.co (Forgejo) instead of github.com | Consolidating on self-hosted Forgejo instance at git.xpltd.co. Wiki is already there. Single source of truth. | Yes | human | | D039 | | architecture | LightRAG vs Qdrant search execution strategy | Sequential with fallback — LightRAG first, Qdrant only on LightRAG failure/empty, not parallel | Running both in parallel would double latency overhead. LightRAG is the primary engine; Qdrant is a safety net. Sequential approach reduces load and simplifies result merging. | Yes | agent | | D040 | M021/S02 | architecture | Creator-scoped retrieval cascade strategy | Sequential 4-tier cascade (creator → domain → global → none) with ll_keywords scoping and post-filtering | Sequential cascade is simpler than parallel-with-priority and avoids wasted LightRAG calls when early tiers succeed. ll_keywords hints LightRAG's retrieval without hard constraints. Post-filtering on tier 1 ensures strict creator scoping while 3x oversampling compensates for filtering losses. Domain tier uses ≥2 page threshold to avoid noise from sparse creators. | Yes | agent | +| D041 | M022/S05 | architecture | Highlight scorer weight distribution for 10-dimension model | Original 7 dimensions reduced proportionally, new 3 audio proxy dimensions (speech_rate_variance, pause_density, speaking_pace) allocated 0.22 total weight. Audio dims default to 0.5 (neutral) when word_timings unavailable for backward compatibility. | Audio proxy signals derived from word-level timing data provide meaningful highlight quality indicators without requiring raw audio analysis (librosa). Neutral fallback ensures existing scoring paths are unaffected. 
| Yes | agent | diff --git a/.gsd/milestones/M022/M022-ROADMAP.md b/.gsd/milestones/M022/M022-ROADMAP.md index 9bb1ddb..4a7f63e 100644 --- a/.gsd/milestones/M022/M022-ROADMAP.md +++ b/.gsd/milestones/M022/M022-ROADMAP.md @@ -10,6 +10,6 @@ Creator-facing tools take shape: shorts queue, follow system, chat widget (UI on | S02 | [A] Follow System + Tier UI (Demo Placeholders) | medium | — | ✅ | Users can follow creators. Tier config page has styled Coming Soon payment placeholders. | | S03 | [A] Chat Widget Shell (UI Only) | low | — | ✅ | Chat bubble on creator profile pages with conversation UI, typing indicator, suggested questions | | S04 | [B] Multi-Turn Conversation Memory | medium | — | ✅ | Multi-turn conversations maintain context across messages using Redis-backed history | -| S05 | [B] Highlight Detection v2 (Audio Signals) | medium | — | ⬜ | Highlight detection uses audio energy analysis (librosa) alongside transcript signals for improved scoring | +| S05 | [B] Highlight Detection v2 (Audio Signals) | medium | — | ✅ | Highlight detection uses audio energy analysis (librosa) alongside transcript signals for improved scoring | | S06 | [B] Personality Profile Extraction | high | — | ⬜ | Personality profiles extracted for 3+ creators showing distinct vocabulary, tone, and style markers | | S07 | Forgejo KB Update — Follow, Personality, Highlights | low | S01, S02, S03, S04, S05, S06 | ⬜ | Forgejo wiki updated with follow system, personality system, highlight engine v2 | diff --git a/.gsd/milestones/M022/slices/S05/S05-SUMMARY.md b/.gsd/milestones/M022/slices/S05/S05-SUMMARY.md new file mode 100644 index 0000000..9d0c0f5 --- /dev/null +++ b/.gsd/milestones/M022/slices/S05/S05-SUMMARY.md @@ -0,0 +1,84 @@ +--- +id: S05 +parent: M022 +milestone: M022 +provides: + - 10-dimension highlight scoring with audio proxy signals + - extract_word_timings() utility for any future word-timing analysis +requires: + [] +affects: + - S07 +key_files: + - 
backend/pipeline/highlight_scorer.py + - backend/pipeline/highlight_schemas.py + - backend/pipeline/test_highlight_scorer.py + - backend/pipeline/stages.py +key_decisions: + - D041: Highlight scorer weight distribution — 3 audio proxy dimensions get 0.22 total weight, neutral fallback when word_timings unavailable + - transcript_path in DB stores absolute paths — use directly + - Accept both {segments:[...]} and bare [...] transcript JSON formats +patterns_established: + - Word-level timing extraction as audio-analysis proxy — avoids librosa/ffmpeg dependency while providing meaningful speech-pattern signals + - Neutral-fallback scoring: new dimensions default to 0.5 when input data unavailable, preserving backward compatibility without conditional weight redistribution +observability_surfaces: + - score_breakdown JSONB in highlight_candidates contains all 10 dimension scores — queryable for debugging and tuning +drill_down_paths: + - .gsd/milestones/M022/slices/S05/tasks/T01-SUMMARY.md + - .gsd/milestones/M022/slices/S05/tasks/T02-SUMMARY.md +duration: "" +verification_result: passed +completed_at: 2026-04-04T08:15:09.580Z +blocker_discovered: false +--- + +# S05: [B] Highlight Detection v2 (Audio Signals) + +**Highlight scorer expanded from 7 to 10 dimensions with speech-rate variance, pause density, and speaking-pace fitness derived from word-level transcript timing data — deployed and verified on 62 real candidates.** + +## What Happened + +T01 added four new pure functions to highlight_scorer.py: `extract_word_timings()` filters word-level timing dicts from transcript JSON by time window; `_speech_rate_variance()` computes words-per-second coefficient of variation in 5s sliding windows; `_pause_density()` counts and weights inter-word gaps (>0.5s short, >1.0s long); `_speaking_pace_fitness()` applies a bell-curve around the 3-5 WPS optimal teaching pace. 
The `_WEIGHTS` dict was rebalanced from 7 to 10 dimensions summing to 1.0, with the new audio proxy dimensions getting 0.22 total weight. `score_moment()` accepts an optional `word_timings` parameter — when None, the three new dimensions score 0.5 (neutral), preserving backward compatibility. `HighlightScoreBreakdown` schema was extended with 3 new float fields. 34 new tests were added alongside the existing 28, all 62 pass. + +T02 wired the scoring into the Celery pipeline. `stage_highlight_detection()` now loads the transcript JSON once per video via `SourceVideo.transcript_path`, extracts word timings per moment, and passes them to `score_moment()`. Graceful fallback: if transcript is missing or malformed, word_timings=None and the scorer uses neutral values. Two pre-existing bugs were fixed: the upsert constraint name was wrong (`uq_highlight_candidate_moment` → `highlight_candidates_key_moment_id_key`), and transcript_path stores absolute paths (not relative). Deployed to ub01 and ran on KOAN Sound video — 62 candidates scored with all 10 dimensions populated with non-neutral audio proxy values. + +## Verification + +1. `python -m pytest backend/pipeline/test_highlight_scorer.py -v` — 62/62 passed in 0.09s. Covers all new functions, edge cases, backward compatibility, and weight normalization. +2. Production DB query on ub01 confirms 10-dimension score_breakdown with non-neutral audio proxy values (speech_rate_variance: 0.057, pause_density: 0.0, speaking_pace: 1.0) for real candidates. +3. chrysopedia-worker container is Up and healthy on ub01. + +## Requirements Advanced + +None. + +## Requirements Validated + +None. + +## New Requirements Surfaced + +None. + +## Requirements Invalidated or Re-scoped + +None. + +## Deviations + +transcript_path stores absolute paths in DB, not relative — used directly without path joining. Fixed pre-existing constraint name bug that was blocking upserts. 
+ +## Known Limitations + +Audio proxy signals are derived from word-level timing data (a text proxy), not actual audio waveform analysis. The original plan mentioned librosa but word timings proved sufficient. Pause density can read 0.0 for content without inter-word gaps > 0.5s — this is correct behavior, not a bug. + +## Follow-ups + +None. + +## Files Created/Modified + +- `backend/pipeline/highlight_scorer.py` — Added extract_word_timings(), _speech_rate_variance(), _pause_density(), _speaking_pace_fitness(); rebalanced _WEIGHTS to 10 dimensions; updated score_moment() with optional word_timings parameter +- `backend/pipeline/highlight_schemas.py` — Added speech_rate_variance_score, pause_density_score, speaking_pace_score fields to HighlightScoreBreakdown +- `backend/pipeline/test_highlight_scorer.py` — Added 34 new tests for word timing extraction, speech rate variance, pause density, speaking pace fitness, and backward compatibility +- `backend/pipeline/stages.py` — Updated stage_highlight_detection() to load transcript JSON, extract word timings per moment, pass to scorer; fixed constraint name bug diff --git a/.gsd/milestones/M022/slices/S05/S05-UAT.md b/.gsd/milestones/M022/slices/S05/S05-UAT.md new file mode 100644 index 0000000..0fa1c74 --- /dev/null +++ b/.gsd/milestones/M022/slices/S05/S05-UAT.md @@ -0,0 +1,49 @@ +# S05: [B] Highlight Detection v2 (Audio Signals) — UAT + +**Milestone:** M022 +**Written:** 2026-04-04T08:15:09.580Z + +## UAT: Highlight Detection v2 (Audio Signals) + +### Preconditions +- Access to ub01 via SSH +- chrysopedia-worker container running and healthy +- At least one video with transcript data in the database +- Python environment with project dependencies installed locally + +### Test 1: Unit Tests Pass (All 62) +1. Run `python -m pytest backend/pipeline/test_highlight_scorer.py -v` +2. **Expected:** 62 tests pass, 0 failures +3. 
**Expected:** Tests cover: extract_word_timings (6 tests), speech_rate_variance (6 tests), pause_density (6 tests), speaking_pace_fitness (10 tests), backward_compatibility (4 tests), plus original 28 tests + +### Test 2: Backward Compatibility — No word_timings +1. In a Python REPL, call `score_moment(moment_dict)` WITHOUT word_timings parameter +2. **Expected:** Returns a valid score between 0.0 and 1.0 +3. **Expected:** score_breakdown contains all 10 keys, with speech_rate_variance_score=0.5, pause_density_score=0.5, speaking_pace_score=0.5 + +### Test 3: Scoring With Word Timings +1. Call `score_moment(moment_dict, word_timings=sample_timings)` with real timing data +2. **Expected:** Audio proxy dimensions have non-neutral values (≠ 0.5) +3. **Expected:** Total score differs from the no-timings version + +### Test 4: Production DB — 10-Dimension Breakdowns +1. SSH to ub01, query: `SELECT score_breakdown FROM highlight_candidates ORDER BY updated_at DESC LIMIT 5` +2. **Expected:** Each row has exactly 10 keys in score_breakdown +3. **Expected:** Keys include speech_rate_variance_score, pause_density_score, speaking_pace_score +4. **Expected:** At least some rows have non-0.5 values for the new audio dimensions + +### Test 5: Graceful Fallback — Missing Transcript +1. Ensure a video exists with transcript_path=NULL in source_videos +2. Trigger `stage_highlight_detection` for that video +3. **Expected:** Task completes without error +4. **Expected:** Highlight candidates created with audio proxy dimensions at 0.5 (neutral) + +### Test 6: Worker Health After Deployment +1. Run `ssh ub01 "docker ps --filter name=chrysopedia-worker --format '{{.Status}}'"` +2. **Expected:** Shows "Up ... 
(healthy)" + +### Edge Cases +- **Empty transcript segments:** Transcript JSON with segments but no words arrays → scorer uses neutral 0.5 +- **Single word in window:** Only one word timing in the moment's time range → all three audio dimensions return 0.5 +- **Very fast speech (>10 WPS):** speaking_pace_fitness returns 0.0 +- **No pauses in segment:** All words tightly packed → pause_density returns 0.0 (correct — no strategic pauses) diff --git a/.gsd/milestones/M022/slices/S05/tasks/T02-VERIFY.json b/.gsd/milestones/M022/slices/S05/tasks/T02-VERIFY.json new file mode 100644 index 0000000..6271642 --- /dev/null +++ b/.gsd/milestones/M022/slices/S05/tasks/T02-VERIFY.json @@ -0,0 +1,36 @@ +{ + "schemaVersion": 1, + "taskId": "T02", + "unitId": "M022/S05/T02", + "timestamp": 1775290292221, + "passed": false, + "discoverySource": "task-plan", + "checks": [ + { + "command": "ssh ub01 'cd /vmPool/r/repos/xpltdco/chrysopedia", + "exitCode": 2, + "durationMs": 8, + "verdict": "fail" + }, + { + "command": "docker compose build chrysopedia-worker", + "exitCode": 0, + "durationMs": 73539, + "verdict": "pass" + }, + { + "command": "docker compose up -d chrysopedia-worker", + "exitCode": 1, + "durationMs": 11790, + "verdict": "fail" + }, + { + "command": "sleep 5", + "exitCode": 0, + "durationMs": 5023, + "verdict": "pass" + } + ], + "retryAttempt": 1, + "maxRetries": 2 +} diff --git a/.gsd/milestones/M022/slices/S06/S06-PLAN.md b/.gsd/milestones/M022/slices/S06/S06-PLAN.md index c2498dd..859e58b 100644 --- a/.gsd/milestones/M022/slices/S06/S06-PLAN.md +++ b/.gsd/milestones/M022/slices/S06/S06-PLAN.md @@ -1,6 +1,228 @@ # S06: [B] Personality Profile Extraction -**Goal:** Build personality extraction pipeline stage from creator transcripts +**Goal:** Personality profiles extracted from creator transcripts via LLM, stored as JSONB on Creator model, exposed via API, and rendered on the frontend creator detail page. 
**Demo:** After this: Personality profiles extracted for 3+ creators showing distinct vocabulary, tone, and style markers ## Tasks +- [x] **T01: Added personality_profile JSONB column to Creator model with migration, schema field, and API passthrough** — ## Description + +Add the `personality_profile` JSONB column to the Creator model, create the Alembic migration, update the Pydantic `CreatorDetail` schema, and ensure the existing `GET /creators/{slug}` endpoint passes through the new field. This is pure plumbing — no extraction logic. + +## Steps + +1. Add `personality_profile: Mapped[dict | None] = mapped_column(JSONB, nullable=True)` to `Creator` model in `backend/models.py`, after the existing `social_links` JSONB column. +2. Create Alembic migration `alembic/versions/023_add_personality_profile.py` using raw SQL pattern (matching `022_add_creator_follows.py` style): `ALTER TABLE creators ADD COLUMN personality_profile JSONB;` for upgrade, `ALTER TABLE creators DROP COLUMN personality_profile;` for downgrade. +3. Add `personality_profile: dict | None = None` field to `CreatorDetail` schema in `backend/schemas.py`. +4. Update the `get_creator` endpoint in `backend/routers/creators.py` to include `personality_profile=creator.personality_profile` in the `CreatorDetail(...)` constructor call. +5. Verify the model imports cleanly, the migration applies, and the schema validates. 
+ +## Must-Haves + +- [ ] `Creator` model has `personality_profile` JSONB column +- [ ] Alembic migration 023 exists and applies cleanly +- [ ] `CreatorDetail` schema includes `personality_profile` field +- [ ] `GET /creators/{slug}` response includes `personality_profile` (null when not set) + +## Verification + +- `cd backend && python -c "from models import Creator; assert hasattr(Creator, 'personality_profile'); print('OK')"` +- `cd backend && python -c "from schemas import CreatorDetail; assert 'personality_profile' in CreatorDetail.model_fields; print('OK')"` +- Migration file exists: `test -f alembic/versions/023_add_personality_profile.py` +- `grep -q 'personality_profile' backend/routers/creators.py` + +## Inputs + +- `backend/models.py` — existing Creator model to extend +- `backend/schemas.py` — existing CreatorDetail schema to extend +- `backend/routers/creators.py` — existing get_creator endpoint to update +- `alembic/versions/022_add_creator_follows.py` — migration pattern reference + +## Expected Output + +- `backend/models.py` — Creator model with personality_profile column +- `backend/schemas.py` — CreatorDetail with personality_profile field +- `backend/routers/creators.py` — get_creator passes personality_profile through +- `alembic/versions/023_add_personality_profile.py` — new migration + - Estimate: 30m + - Files: backend/models.py, backend/schemas.py, backend/routers/creators.py, alembic/versions/023_add_personality_profile.py + - Verify: cd backend && python -c "from models import Creator; assert hasattr(Creator, 'personality_profile'); print('model OK')" && python -c "from schemas import CreatorDetail; assert 'personality_profile' in CreatorDetail.model_fields; print('schema OK')" && test -f ../alembic/versions/023_add_personality_profile.py && echo 'migration exists' +- [ ] **T02: Implement personality extraction Celery task, prompt template, and admin trigger** — ## Description + +Build the core extraction pipeline: a prompt template that 
analyzes creator transcripts for distinctive personality markers, a Celery task that aggregates and samples transcripts then calls the LLM, and an admin endpoint to trigger extraction. Follows existing stage patterns in `pipeline/stages.py`. + +## Failure Modes + +| Dependency | On error | On timeout | On malformed response | +|------------|----------|-----------|----------------------| +| LLM API (`_get_llm_client`) | Celery retry (max_retries=2, 60s delay) | Same retry logic | Pydantic validation fails → retry with error context in prompt | +| PostgreSQL (sync session) | Task fails, logged via `_emit_event` error | Connection pool timeout → task fails | N/A | +| Redis (classification data for sampling) | Fall back to random sampling without topic diversity | Same fallback | Same fallback | + +## Negative Tests + +- **Malformed inputs**: Creator with zero key moments (no transcripts) → task returns early with log warning, no profile stored +- **Error paths**: LLM returns invalid JSON → Pydantic validation catches, retry with cleaner prompt instruction +- **Boundary conditions**: Creator with <500 chars total transcript → still attempt extraction but note low sample size in profile metadata + +## Steps + +1. Create `prompts/personality_extraction.txt` with a system prompt that: + - Receives transcript excerpts from a single creator + - Analyzes vocabulary patterns (signature phrases, jargon level, filler words, distinctive terms) + - Analyzes tone (formality, energy, humor, teaching style, descriptors) + - Analyzes style markers (explanation approach, analogies, sound words, self-references, audience engagement) + - Produces a one-paragraph summary capturing what makes this creator distinctive + - Returns structured JSON matching the profile schema from research doc + - Explicitly instructs: focus on what makes this creator DISTINCT, not universal traits + +2. 
Add transcript sampling function `_sample_creator_transcripts(moments, creator_id, max_chars=40000)` in `backend/pipeline/stages.py`: + - Small (<20K chars total): use all text + - Medium (20K-60K): first 300 chars from each moment, up to budget + - Large (>60K): random sample seeded by creator_id UUID, try to cover diverse topic_categories from Redis classification data (key `chrysopedia:classification:{video_id}`), cap at max_chars + - Return tuple of (sampled_text: str, sample_size: int) + +3. Add Celery task `extract_personality_profile(self, creator_id: str) -> str` in `backend/pipeline/stages.py`: + - Use `@celery_app.task(bind=True, max_retries=2, default_retry_delay=60)` + - Load creator row via `_get_sync_session()` + - Load all KeyMoments with non-null `raw_transcript` for this creator (join through SourceVideo) + - If no moments with transcripts, log warning and return early + - Call `_sample_creator_transcripts()` to get sampled text + - Load prompt via `_load_prompt('personality_extraction.txt')` + - Build user prompt with creator name and sampled transcripts + - Call `_get_llm_client().complete()` with `response_model=None` (parse JSON manually since profile schema is nested) + - Parse LLM response as JSON, validate structure with a Pydantic model `PersonalityProfile` + - Add metadata: `extracted_at`, `transcript_sample_size`, `model_used` + - Store validated dict on `Creator.personality_profile`, commit + - Emit pipeline events (start/complete/error) via `_emit_event` using creator_id as video_id param (reuse existing event infrastructure) + +4. Define `PersonalityProfile` Pydantic model in `backend/schemas.py` for validation (not API response — used internally by the task to validate LLM output). Include all fields from the research doc schema. + +5. 
Add admin endpoint `POST /admin/creators/{slug}/extract-profile` in `backend/routers/admin.py`: + - Look up creator by slug, 404 if not found + - Queue `extract_personality_profile.delay(str(creator.id))` + - Return `{"status": "queued", "creator_id": str(creator.id)}` + +## Must-Haves + +- [ ] Prompt template requests structured JSON with vocabulary, tone, style_markers, and summary +- [ ] Transcript sampling respects three size tiers with deterministic seeding +- [ ] Celery task handles zero-transcript creators gracefully (no crash, no partial write) +- [ ] LLM response validated via Pydantic before storage +- [ ] Pipeline events emitted for observability +- [ ] Admin endpoint queues task and returns immediately + +## Verification + +- `test -f prompts/personality_extraction.txt` — prompt exists +- `cd backend && python -c "from pipeline.stages import extract_personality_profile; print('task importable')"` — task importable +- `cd backend && python -c "from schemas import PersonalityProfile; print('validator importable')"` — validator exists +- `grep -q 'extract-profile' backend/routers/admin.py` — endpoint wired + +## Observability Impact + +- Signals added: `_emit_event` calls with stage='personality_extraction' for start/complete/error +- How a future agent inspects this: query `pipeline_events` table for stage='personality_extraction', check Creator.personality_profile column +- Failure state exposed: error event with creator_id, transcript_sample_size, LLM error message + +## Inputs + +- `backend/models.py` — Creator model with personality_profile column (from T01) +- `backend/schemas.py` — CreatorDetail schema (from T01) +- `backend/pipeline/stages.py` — existing stage patterns, _get_llm_client, _get_sync_session, _emit_event, _load_prompt +- `backend/pipeline/llm_client.py` — LLMClient.complete() signature +- `backend/routers/admin.py` — existing admin router to extend + +## Expected Output + +- `prompts/personality_extraction.txt` — new prompt template +- 
`backend/pipeline/stages.py` — extract_personality_profile task + _sample_creator_transcripts helper +- `backend/schemas.py` — PersonalityProfile validation model +- `backend/routers/admin.py` — extract-profile endpoint + - Estimate: 1h30m + - Files: prompts/personality_extraction.txt, backend/pipeline/stages.py, backend/schemas.py, backend/routers/admin.py + - Verify: test -f prompts/personality_extraction.txt && cd backend && python -c "from pipeline.stages import extract_personality_profile; print('task OK')" && python -c "from schemas import PersonalityProfile; print('validator OK')" && grep -q 'extract-profile' routers/admin.py && echo 'all OK' +- [ ] **T03: Add personality profile display to CreatorDetail frontend page** — ## Description + +Add a collapsible personality profile section to the CreatorDetail page. Update the TypeScript API type, create a PersonalityProfile component, and wire it into the page layout below the bio/social links section. + +## Steps + +1. Update `CreatorDetailResponse` interface in `frontend/src/api/creators.ts` to add: + ```typescript + personality_profile: { + vocabulary: { + signature_phrases: string[]; + technical_jargon_level: string; + filler_words: string[]; + distinctive_terms: string[]; + }; + tone: { + formality: string; + energy: string; + humor_frequency: string; + teaching_style: string; + descriptors: string[]; + }; + style_markers: { + explanation_approach: string; + uses_analogies: boolean; + uses_sound_words: boolean; + self_references_frequency: string; + audience_engagement: string; + }; + summary: string; + extracted_at: string; + transcript_sample_size: number; + model_used: string; + } | null; + ``` + +2. 
Create `frontend/src/components/PersonalityProfile.tsx`: + - Accept the personality_profile object as prop (or null — render nothing if null) + - Collapsible section with heading "Personality Profile" using the CSS grid-template-rows 0fr/1fr animation pattern (per KNOWLEDGE.md) + - Three sub-cards: + - **Teaching Style**: formality, energy, teaching_style, humor_frequency as descriptive text; tone descriptors as pill badges + - **Vocabulary**: signature_phrases and distinctive_terms as pill badges; technical_jargon_level and filler_words as text + - **Style**: explanation_approach, audience_engagement as descriptive text; boolean markers (uses_analogies, uses_sound_words) as checkmark/cross indicators + - One-paragraph summary at the top of the section + - Use existing CSS patterns: pill badges (reuse tag styling), card containers, dark theme colors from CSS custom properties + - Default to collapsed state; toggle on click + +3. Import and render `PersonalityProfile` in `frontend/src/pages/CreatorDetail.tsx`: + - Place below the bio/social links section, before the techniques list + - Pass `creator.personality_profile` as prop + - Component handles null gracefully (renders nothing) + +4. Verify frontend builds without errors. 
+ +## Must-Haves + +- [ ] `CreatorDetailResponse` type includes personality_profile field +- [ ] `PersonalityProfile` component renders vocabulary, tone, and style sections +- [ ] Component handles null profile (renders nothing, no crash) +- [ ] Collapsible with smooth animation +- [ ] Uses existing CSS patterns (pills, cards, dark theme) +- [ ] Frontend builds successfully + +## Verification + +- `cd frontend && npx tsc --noEmit` — TypeScript compiles +- `cd frontend && npm run build` — production build succeeds +- `test -f frontend/src/components/PersonalityProfile.tsx` — component exists +- `grep -q 'PersonalityProfile' frontend/src/pages/CreatorDetail.tsx` — component wired in + +## Inputs + +- `frontend/src/api/creators.ts` — existing CreatorDetailResponse type +- `frontend/src/pages/CreatorDetail.tsx` — existing creator detail page +- `frontend/src/components/TagList.tsx` — pill badge pattern reference +- `frontend/src/index.css` — CSS custom properties reference + +## Expected Output + +- `frontend/src/api/creators.ts` — updated type with personality_profile +- `frontend/src/components/PersonalityProfile.tsx` — new component +- `frontend/src/pages/CreatorDetail.tsx` — personality section wired in + - Estimate: 1h + - Files: frontend/src/api/creators.ts, frontend/src/components/PersonalityProfile.tsx, frontend/src/pages/CreatorDetail.tsx + - Verify: cd frontend && npx tsc --noEmit && npm run build && test -f src/components/PersonalityProfile.tsx && grep -q 'PersonalityProfile' src/pages/CreatorDetail.tsx && echo 'all OK' diff --git a/.gsd/milestones/M022/slices/S06/S06-RESEARCH.md b/.gsd/milestones/M022/slices/S06/S06-RESEARCH.md new file mode 100644 index 0000000..173b227 --- /dev/null +++ b/.gsd/milestones/M022/slices/S06/S06-RESEARCH.md @@ -0,0 +1,162 @@ +# S06 Research — Personality Profile Extraction + +## Summary + +Personality profile extraction is a new feature with no existing code. 
It follows the established pipeline pattern (load data → LLM prompt → store result) but operates per-creator instead of per-video. The data foundation is strong: 19 creators with raw transcript text on KeyMoment rows (ranging from 2K to 150K chars per creator). The primary risk is prompt engineering — getting the LLM to produce structured, distinct personality markers across creators with varying transcript volumes. + +## Recommendation + +**Approach:** Add a new Celery task `extract_personality_profile` that: +1. Aggregates a creator's raw transcripts (sampled if > context window) +2. Sends to LLM with a personality analysis prompt requesting structured JSON +3. Stores the result as JSONB on a new `personality_profile` column on the `creators` table +4. Expose via existing `GET /creators/{slug}` endpoint (add to `CreatorDetail` schema) +5. Display on CreatorDetail page in a new section + +**Storage:** JSONB column on `creators` table, not a separate table. One profile per creator, overwritten on re-extraction. Simple, no joins, no versioning needed initially. If versioning becomes important later, the TechniquePageVersion pattern exists as precedent. + +**Why not a separate table:** The profile is 1:1 with creator, small (~2-5KB JSON), and doesn't need its own lifecycle. A column keeps the query simple and avoids a join in the already-complex CreatorDetail endpoint. 
+ +## Implementation Landscape + +### Existing Patterns to Reuse + +| Pattern | Where | How it applies | +|---------|-------|----------------| +| Celery task with LLM call | `stages.py` — all stage functions | Same `_get_llm_client().complete()` call pattern | +| Sync SQLAlchemy in Celery | `stages.py` — `_get_sync_session()` | Profile task uses same sync engine | +| Prompt from file | `_load_prompt()` in stages.py | New `personality_extraction.txt` prompt file | +| JSONB storage | `Creator.social_links`, `TechniquePage.body_sections` | Same JSONB pattern for profile data | +| Alembic raw SQL migrations | `022_add_creator_follows.py` | New migration adds `personality_profile JSONB` column | +| Schema extension | `CreatorDetail` in `schemas.py` | Add `personality_profile: dict \| None` field | +| `_emit_event` for pipeline observability | All stage tasks | Reuse for profile extraction events | + +### Key Files + +| File | Role | +|------|------| +| `backend/models.py` | Add `personality_profile` JSONB column to `Creator` | +| `backend/schemas.py` | Add field to `CreatorDetail` response schema | +| `backend/pipeline/stages.py` | New `extract_personality_profile` Celery task | +| `backend/routers/creators.py` | Include profile in `get_creator` response (already reads Creator row) | +| `prompts/personality_extraction.txt` | New prompt template | +| `alembic/versions/023_add_personality_profile.py` | Migration | +| `frontend/src/pages/CreatorDetail.tsx` | New personality profile display section | +| `frontend/src/api/creators.ts` | Type update for personality data | + +### Profile Schema (JSONB structure) + +```json +{ + "vocabulary": { + "signature_phrases": ["string"], + "technical_jargon_level": "low|medium|high", + "filler_words": ["string"], + "distinctive_terms": ["string"] + }, + "tone": { + "formality": "casual|conversational|formal", + "energy": "low|medium|high", + "humor_frequency": "rare|occasional|frequent", + "teaching_style": 
"directive|exploratory|narrative", + "descriptors": ["string"] + }, + "style_markers": { + "explanation_approach": "step-by-step|conceptual|example-driven", + "uses_analogies": true, + "uses_sound_words": true, + "self_references_frequency": "rare|occasional|frequent", + "audience_engagement": "direct|indirect|minimal" + }, + "summary": "One paragraph personality summary", + "extracted_at": "ISO timestamp", + "transcript_sample_size": 1234, + "model_used": "model-name" +} +``` + +### Transcript Sampling Strategy + +Creators range from 2K to 150K chars of transcript text. LLM context windows are typically 8K-32K tokens. Strategy: + +- **Small creators (<20K chars):** Use all transcript text +- **Medium creators (20K-60K chars):** Sample evenly — take first 300 chars from each moment, up to context budget +- **Large creators (>60K chars):** Random sample of moments covering diverse topics (use `topic_category` from classification data to ensure coverage), cap at ~40K chars + +The sampling function should be deterministic (seeded by creator_id) so re-runs produce the same profile unless new content is added. + +### LLM Prompt Design + +The prompt needs to: +1. Receive aggregated transcript excerpts attributed to a single creator +2. Analyze vocabulary patterns, tone markers, teaching style +3. Return structured JSON matching the schema above +4. Be explicit about what makes this creator *distinct* — not generic traits + +Key constraint: the prompt should ask the LLM to focus on **distinctive** traits, not universal ones. Every creator "explains things" — the value is in how KOAN Sound's clinical precision differs from COPYCATT's energetic colloquialisms. + +### Celery Task Design + +```python +@celery_app.task(bind=True, max_retries=2, default_retry_delay=60) +def extract_personality_profile(self, creator_id: str) -> str: + # 1. Load creator + their key moments with raw_transcript + # 2. Sample transcripts per strategy above + # 3. Load prompt, call LLM + # 4. 
Parse JSON response, validate structure + # 5. Store on Creator.personality_profile + # 6. Emit pipeline event + return creator_id +``` + +No pipeline integration needed — this runs standalone, triggered manually or via admin endpoint. It's not part of the per-video pipeline chain. + +### Admin Trigger + +Add a simple endpoint: `POST /admin/creators/{slug}/extract-profile` that queues the Celery task. The admin UI can add an "Extract Profile" button on the creator detail page (or we add a batch endpoint to process all creators). + +### Frontend Display + +New collapsible section on CreatorDetail page below bio/social links: +- **Teaching Style** card with tone descriptors, teaching approach +- **Vocabulary** card with signature phrases, distinctive terms +- **Style** card with explanation approach markers + +Use existing CSS patterns (pill badges for signature phrases, descriptive text for summaries). + +### Verification Strategy + +1. Run extraction on 3+ creators with substantial transcript data (KOAN Sound, COPYCATT, Chee — all >60K chars) +2. Verify profiles contain distinct, non-generic content +3. Verify JSON structure matches schema +4. Verify API returns profile data in CreatorDetail response +5. Verify frontend renders profile section +6. Manual spot-check: profiles should feel recognizably different + +### Risks & Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| LLM produces generic profiles (not distinctive) | High — defeats purpose | Prompt engineering: explicitly ask for differentiators, provide contrast examples | +| Large transcript sampling loses representative content | Medium | Topic-diverse sampling using classification data | +| Context window overflow | Medium | Hard cap on transcript size, chunking if needed | +| Profile JSON doesn't match expected schema | Low | Pydantic validation in task, retry with error feedback | + +### Natural Task Seams + +1. 
**T01: DB model + migration + schema** — Add `personality_profile` JSONB to Creator, Alembic migration, update Pydantic schema. Fast, mechanical. +2. **T02: Prompt + Celery task + admin trigger** — New prompt file, extraction task, admin endpoint. The core logic. +3. **T03: Frontend display** — New section on CreatorDetail, type updates. Independent of backend verification. +4. **T04: Run extraction + verify** — Execute on 3+ creators, verify distinctness, end-to-end check. + +T01 unblocks T02 and T03 (parallel). T04 depends on T02. + +### What to Build First + +T01 (schema) — it's the foundation. Then T02 (extraction logic) is the riskiest piece and should go next. T03 (frontend) can parallel with T02 since it can use mock data initially. + +### Don't Hand-Roll + +- Use `_get_llm_client().complete()` with `response_model` for structured output — don't manually parse JSON +- Use existing `_get_sync_session()` and `_emit_event` patterns — don't create new DB/event infrastructure +- Use existing Pydantic model validation for the profile schema — don't hand-validate JSON structure diff --git a/.gsd/milestones/M022/slices/S06/tasks/T01-PLAN.md b/.gsd/milestones/M022/slices/S06/tasks/T01-PLAN.md new file mode 100644 index 0000000..ad6a32c --- /dev/null +++ b/.gsd/milestones/M022/slices/S06/tasks/T01-PLAN.md @@ -0,0 +1,65 @@ +--- +estimated_steps: 28 +estimated_files: 4 +skills_used: [] +--- + +# T01: Add personality_profile column, migration, schema, and API passthrough + +## Description + +Add the `personality_profile` JSONB column to the Creator model, create the Alembic migration, update the Pydantic `CreatorDetail` schema, and ensure the existing `GET /creators/{slug}` endpoint passes through the new field. This is pure plumbing — no extraction logic. + +## Steps + +1. Add `personality_profile: Mapped[dict | None] = mapped_column(JSONB, nullable=True)` to `Creator` model in `backend/models.py`, after the existing `social_links` JSONB column. +2. 
Create Alembic migration `alembic/versions/023_add_personality_profile.py` using raw SQL pattern (matching `022_add_creator_follows.py` style): `ALTER TABLE creators ADD COLUMN personality_profile JSONB;` for upgrade, `ALTER TABLE creators DROP COLUMN personality_profile;` for downgrade. +3. Add `personality_profile: dict | None = None` field to `CreatorDetail` schema in `backend/schemas.py`. +4. Update the `get_creator` endpoint in `backend/routers/creators.py` to include `personality_profile=creator.personality_profile` in the `CreatorDetail(...)` constructor call. +5. Verify the model imports cleanly, the migration applies, and the schema validates. + +## Must-Haves + +- [ ] `Creator` model has `personality_profile` JSONB column +- [ ] Alembic migration 023 exists and applies cleanly +- [ ] `CreatorDetail` schema includes `personality_profile` field +- [ ] `GET /creators/{slug}` response includes `personality_profile` (null when not set) + +## Verification + +- `cd backend && python -c "from models import Creator; assert hasattr(Creator, 'personality_profile'); print('OK')"` +- `cd backend && python -c "from schemas import CreatorDetail; assert 'personality_profile' in CreatorDetail.model_fields; print('OK')"` +- Migration file exists: `test -f alembic/versions/023_add_personality_profile.py` +- `grep -q 'personality_profile' backend/routers/creators.py` + +## Inputs + +- `backend/models.py` — existing Creator model to extend +- `backend/schemas.py` — existing CreatorDetail schema to extend +- `backend/routers/creators.py` — existing get_creator endpoint to update +- `alembic/versions/022_add_creator_follows.py` — migration pattern reference + +## Expected Output + +- `backend/models.py` — Creator model with personality_profile column +- `backend/schemas.py` — CreatorDetail with personality_profile field +- `backend/routers/creators.py` — get_creator passes personality_profile through +- `alembic/versions/023_add_personality_profile.py` — new migration + +## 
Inputs + +- `backend/models.py` +- `backend/schemas.py` +- `backend/routers/creators.py` +- `alembic/versions/022_add_creator_follows.py` + +## Expected Output + +- `backend/models.py` +- `backend/schemas.py` +- `backend/routers/creators.py` +- `alembic/versions/023_add_personality_profile.py` + +## Verification + +cd backend && python -c "from models import Creator; assert hasattr(Creator, 'personality_profile'); print('model OK')" && python -c "from schemas import CreatorDetail; assert 'personality_profile' in CreatorDetail.model_fields; print('schema OK')" && test -f ../alembic/versions/023_add_personality_profile.py && echo 'migration exists' diff --git a/.gsd/milestones/M022/slices/S06/tasks/T01-SUMMARY.md b/.gsd/milestones/M022/slices/S06/tasks/T01-SUMMARY.md new file mode 100644 index 0000000..0fa68ec --- /dev/null +++ b/.gsd/milestones/M022/slices/S06/tasks/T01-SUMMARY.md @@ -0,0 +1,83 @@ +--- +id: T01 +parent: S06 +milestone: M022 +provides: [] +requires: [] +affects: [] +key_files: ["backend/models.py", "backend/schemas.py", "backend/routers/creators.py", "alembic/versions/023_add_personality_profile.py"] +key_decisions: [] +patterns_established: [] +drill_down_paths: [] +observability_surfaces: [] +duration: "" +verification_result: "All four verification checks pass: model has attribute, schema has field, migration file exists, router references personality_profile." 
+completed_at: 2026-04-04T08:24:41.345Z +blocker_discovered: false +--- + +# T01: Added personality_profile JSONB column to Creator model with migration, schema field, and API passthrough + +> Added personality_profile JSONB column to Creator model with migration, schema field, and API passthrough + +## What Happened +--- +id: T01 +parent: S06 +milestone: M022 +key_files: + - backend/models.py + - backend/schemas.py + - backend/routers/creators.py + - alembic/versions/023_add_personality_profile.py +key_decisions: + - (none) +duration: "" +verification_result: passed +completed_at: 2026-04-04T08:24:41.345Z +blocker_discovered: false +--- + +# T01: Added personality_profile JSONB column to Creator model with migration, schema field, and API passthrough + +**Added personality_profile JSONB column to Creator model with migration, schema field, and API passthrough** + +## What Happened + +Added personality_profile as a nullable JSONB column on the Creator model after social_links. Created Alembic migration 023 using the raw SQL pattern. Added the field to CreatorDetail Pydantic schema and wired it through the get_creator endpoint. Pure plumbing — no extraction logic. + +## Verification + +All four verification checks pass: model has attribute, schema has field, migration file exists, router references personality_profile. + +## Verification Evidence + +| # | Command | Exit Code | Verdict | Duration | +|---|---------|-----------|---------|----------| +| 1 | `python -c "from models import Creator; assert hasattr(Creator, 'personality_profile')"` | 0 | ✅ pass | 500ms | +| 2 | `python -c "from schemas import CreatorDetail; assert 'personality_profile' in CreatorDetail.model_fields"` | 0 | ✅ pass | 500ms | +| 3 | `test -f alembic/versions/023_add_personality_profile.py` | 0 | ✅ pass | 50ms | +| 4 | `grep -q 'personality_profile' backend/routers/creators.py` | 0 | ✅ pass | 50ms | + + +## Deviations + +None. + +## Known Issues + +None. 
+ +## Files Created/Modified + +- `backend/models.py` +- `backend/schemas.py` +- `backend/routers/creators.py` +- `alembic/versions/023_add_personality_profile.py` + + +## Deviations +None. + +## Known Issues +None. diff --git a/.gsd/milestones/M022/slices/S06/tasks/T02-PLAN.md b/.gsd/milestones/M022/slices/S06/tasks/T02-PLAN.md new file mode 100644 index 0000000..4d45671 --- /dev/null +++ b/.gsd/milestones/M022/slices/S06/tasks/T02-PLAN.md @@ -0,0 +1,119 @@ +--- +estimated_steps: 71 +estimated_files: 4 +skills_used: [] +--- + +# T02: Implement personality extraction Celery task, prompt template, and admin trigger + +## Description + +Build the core extraction pipeline: a prompt template that analyzes creator transcripts for distinctive personality markers, a Celery task that aggregates and samples transcripts then calls the LLM, and an admin endpoint to trigger extraction. Follows existing stage patterns in `pipeline/stages.py`. + +## Failure Modes + +| Dependency | On error | On timeout | On malformed response | +|------------|----------|-----------|----------------------| +| LLM API (`_get_llm_client`) | Celery retry (max_retries=2, 60s delay) | Same retry logic | Pydantic validation fails → retry with error context in prompt | +| PostgreSQL (sync session) | Task fails, logged via `_emit_event` error | Connection pool timeout → task fails | N/A | +| Redis (classification data for sampling) | Fall back to random sampling without topic diversity | Same fallback | Same fallback | + +## Negative Tests + +- **Malformed inputs**: Creator with zero key moments (no transcripts) → task returns early with log warning, no profile stored +- **Error paths**: LLM returns invalid JSON → Pydantic validation catches, retry with cleaner prompt instruction +- **Boundary conditions**: Creator with <500 chars total transcript → still attempt extraction but note low sample size in profile metadata + +## Steps + +1. 
Create `prompts/personality_extraction.txt` with a system prompt that: + - Receives transcript excerpts from a single creator + - Analyzes vocabulary patterns (signature phrases, jargon level, filler words, distinctive terms) + - Analyzes tone (formality, energy, humor, teaching style, descriptors) + - Analyzes style markers (explanation approach, analogies, sound words, self-references, audience engagement) + - Produces a one-paragraph summary capturing what makes this creator distinctive + - Returns structured JSON matching the profile schema from research doc + - Explicitly instructs: focus on what makes this creator DISTINCT, not universal traits + +2. Add transcript sampling function `_sample_creator_transcripts(moments, creator_id, max_chars=40000)` in `backend/pipeline/stages.py`: + - Small (<20K chars total): use all text + - Medium (20K-60K): first 300 chars from each moment, up to budget + - Large (>60K): random sample seeded by creator_id UUID, try to cover diverse topic_categories from Redis classification data (key `chrysopedia:classification:{video_id}`), cap at max_chars + - Return tuple of (sampled_text: str, sample_size: int) + +3. 
Add Celery task `extract_personality_profile(self, creator_id: str) -> str` in `backend/pipeline/stages.py`: + - Use `@celery_app.task(bind=True, max_retries=2, default_retry_delay=60)` + - Load creator row via `_get_sync_session()` + - Load all KeyMoments with non-null `raw_transcript` for this creator (join through SourceVideo) + - If no moments with transcripts, log warning and return early + - Call `_sample_creator_transcripts()` to get sampled text + - Load prompt via `_load_prompt('personality_extraction.txt')` + - Build user prompt with creator name and sampled transcripts + - Call `_get_llm_client().complete()` with `response_model=None` (parse JSON manually since profile schema is nested) + - Parse LLM response as JSON, validate structure with a Pydantic model `PersonalityProfile` + - Add metadata: `extracted_at`, `transcript_sample_size`, `model_used` + - Store validated dict on `Creator.personality_profile`, commit + - Emit pipeline events (start/complete/error) via `_emit_event` using creator_id as video_id param (reuse existing event infrastructure) + +4. Define `PersonalityProfile` Pydantic model in `backend/schemas.py` for validation (not API response — used internally by the task to validate LLM output). Include all fields from the research doc schema. + +5. 
Add admin endpoint `POST /admin/creators/{slug}/extract-profile` in `backend/routers/admin.py`: + - Look up creator by slug, 404 if not found + - Queue `extract_personality_profile.delay(str(creator.id))` + - Return `{"status": "queued", "creator_id": str(creator.id)}` + +## Must-Haves + +- [ ] Prompt template requests structured JSON with vocabulary, tone, style_markers, and summary +- [ ] Transcript sampling respects three size tiers with deterministic seeding +- [ ] Celery task handles zero-transcript creators gracefully (no crash, no partial write) +- [ ] LLM response validated via Pydantic before storage +- [ ] Pipeline events emitted for observability +- [ ] Admin endpoint queues task and returns immediately + +## Verification + +- `test -f prompts/personality_extraction.txt` — prompt exists +- `cd backend && python -c "from pipeline.stages import extract_personality_profile; print('task importable')"` — task importable +- `cd backend && python -c "from schemas import PersonalityProfile; print('validator importable')"` — validator exists +- `grep -q 'extract-profile' backend/routers/admin.py` — endpoint wired + +## Observability Impact + +- Signals added: `_emit_event` calls with stage='personality_extraction' for start/complete/error +- How a future agent inspects this: query `pipeline_events` table for stage='personality_extraction', check Creator.personality_profile column +- Failure state exposed: error event with creator_id, transcript_sample_size, LLM error message + +## Inputs + +- `backend/models.py` — Creator model with personality_profile column (from T01) +- `backend/schemas.py` — CreatorDetail schema (from T01) +- `backend/pipeline/stages.py` — existing stage patterns, _get_llm_client, _get_sync_session, _emit_event, _load_prompt +- `backend/pipeline/llm_client.py` — LLMClient.complete() signature +- `backend/routers/admin.py` — existing admin router to extend + +## Expected Output + +- `prompts/personality_extraction.txt` — new prompt template +- 
`backend/pipeline/stages.py` — extract_personality_profile task + _sample_creator_transcripts helper +- `backend/schemas.py` — PersonalityProfile validation model +- `backend/routers/admin.py` — extract-profile endpoint + +## Inputs + +- `backend/models.py` +- `backend/schemas.py` +- `backend/pipeline/stages.py` +- `backend/pipeline/llm_client.py` +- `backend/routers/admin.py` + +## Expected Output + +- `prompts/personality_extraction.txt` +- `backend/pipeline/stages.py` +- `backend/schemas.py` +- `backend/routers/admin.py` + +## Verification + +test -f prompts/personality_extraction.txt && cd backend && python -c "from pipeline.stages import extract_personality_profile; print('task OK')" && python -c "from schemas import PersonalityProfile; print('validator OK')" && grep -q 'extract-profile' routers/admin.py && echo 'all OK' diff --git a/.gsd/milestones/M022/slices/S06/tasks/T03-PLAN.md b/.gsd/milestones/M022/slices/S06/tasks/T03-PLAN.md new file mode 100644 index 0000000..ec31993 --- /dev/null +++ b/.gsd/milestones/M022/slices/S06/tasks/T03-PLAN.md @@ -0,0 +1,107 @@ +--- +estimated_steps: 68 +estimated_files: 3 +skills_used: [] +--- + +# T03: Add personality profile display to CreatorDetail frontend page + +## Description + +Add a collapsible personality profile section to the CreatorDetail page. Update the TypeScript API type, create a PersonalityProfile component, and wire it into the page layout below the bio/social links section. + +## Steps + +1. 
Update `CreatorDetailResponse` interface in `frontend/src/api/creators.ts` to add: + ```typescript + personality_profile: { + vocabulary: { + signature_phrases: string[]; + technical_jargon_level: string; + filler_words: string[]; + distinctive_terms: string[]; + }; + tone: { + formality: string; + energy: string; + humor_frequency: string; + teaching_style: string; + descriptors: string[]; + }; + style_markers: { + explanation_approach: string; + uses_analogies: boolean; + uses_sound_words: boolean; + self_references_frequency: string; + audience_engagement: string; + }; + summary: string; + extracted_at: string; + transcript_sample_size: number; + model_used: string; + } | null; + ``` + +2. Create `frontend/src/components/PersonalityProfile.tsx`: + - Accept the personality_profile object as prop (or null — render nothing if null) + - Collapsible section with heading "Personality Profile" using the CSS grid-template-rows 0fr/1fr animation pattern (per KNOWLEDGE.md) + - Three sub-cards: + - **Teaching Style**: formality, energy, teaching_style, humor_frequency as descriptive text; tone descriptors as pill badges + - **Vocabulary**: signature_phrases and distinctive_terms as pill badges; technical_jargon_level and filler_words as text + - **Style**: explanation_approach, audience_engagement as descriptive text; boolean markers (uses_analogies, uses_sound_words) as checkmark/cross indicators + - One-paragraph summary at the top of the section + - Use existing CSS patterns: pill badges (reuse tag styling), card containers, dark theme colors from CSS custom properties + - Default to collapsed state; toggle on click + +3. Import and render `PersonalityProfile` in `frontend/src/pages/CreatorDetail.tsx`: + - Place below the bio/social links section, before the techniques list + - Pass `creator.personality_profile` as prop + - Component handles null gracefully (renders nothing) + +4. Verify frontend builds without errors. 
+ +## Must-Haves + +- [ ] `CreatorDetailResponse` type includes personality_profile field +- [ ] `PersonalityProfile` component renders vocabulary, tone, and style sections +- [ ] Component handles null profile (renders nothing, no crash) +- [ ] Collapsible with smooth animation +- [ ] Uses existing CSS patterns (pills, cards, dark theme) +- [ ] Frontend builds successfully + +## Verification + +- `cd frontend && npx tsc --noEmit` — TypeScript compiles +- `cd frontend && npm run build` — production build succeeds +- `test -f frontend/src/components/PersonalityProfile.tsx` — component exists +- `grep -q 'PersonalityProfile' frontend/src/pages/CreatorDetail.tsx` — component wired in + +## Inputs + +- `frontend/src/api/creators.ts` — existing CreatorDetailResponse type +- `frontend/src/pages/CreatorDetail.tsx` — existing creator detail page +- `frontend/src/components/TagList.tsx` — pill badge pattern reference +- `frontend/src/index.css` — CSS custom properties reference + +## Expected Output + +- `frontend/src/api/creators.ts` — updated type with personality_profile +- `frontend/src/components/PersonalityProfile.tsx` — new component +- `frontend/src/pages/CreatorDetail.tsx` — personality section wired in + +## Inputs + +- `frontend/src/api/creators.ts` +- `frontend/src/pages/CreatorDetail.tsx` +- `frontend/src/components/TagList.tsx` +- `frontend/src/index.css` + +## Expected Output + +- `frontend/src/api/creators.ts` +- `frontend/src/components/PersonalityProfile.tsx` +- `frontend/src/pages/CreatorDetail.tsx` + +## Verification + +cd frontend && npx tsc --noEmit && npm run build && test -f src/components/PersonalityProfile.tsx && grep -q 'PersonalityProfile' src/pages/CreatorDetail.tsx && echo 'all OK' diff --git a/alembic/versions/023_add_personality_profile.py b/alembic/versions/023_add_personality_profile.py new file mode 100644 index 0000000..8189df1 --- /dev/null +++ b/alembic/versions/023_add_personality_profile.py @@ -0,0 +1,21 @@ +"""Add personality_profile 
JSONB column to creators. + +Revision ID: 023_add_personality_profile +Revises: 022_add_creator_follows +""" + +from alembic import op + + +revision = "023_add_personality_profile" +down_revision = "022_add_creator_follows" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute("ALTER TABLE creators ADD COLUMN IF NOT EXISTS personality_profile JSONB") + + +def downgrade() -> None: + op.execute("ALTER TABLE creators DROP COLUMN IF EXISTS personality_profile") diff --git a/backend/models.py b/backend/models.py index cd9a7f1..c070d25 100644 --- a/backend/models.py +++ b/backend/models.py @@ -130,6 +130,7 @@ class Creator(Base): avatar_fetched_at: Mapped[datetime | None] = mapped_column(nullable=True) bio: Mapped[str | None] = mapped_column(Text, nullable=True) social_links: Mapped[dict | None] = mapped_column(JSONB, nullable=True) + personality_profile: Mapped[dict | None] = mapped_column(JSONB, nullable=True) featured: Mapped[bool] = mapped_column(default=False, server_default="false") view_count: Mapped[int] = mapped_column(Integer, default=0, server_default="0") hidden: Mapped[bool] = mapped_column(default=False, server_default="false") diff --git a/backend/routers/creators.py b/backend/routers/creators.py index 2d2cd0a..c18b2cb 100644 --- a/backend/routers/creators.py +++ b/backend/routers/creators.py @@ -186,6 +186,7 @@ async def get_creator( **creator_data.model_dump(), bio=creator.bio, social_links=creator.social_links, + personality_profile=creator.personality_profile, featured=creator.featured, video_count=video_count, technique_count=len(techniques), diff --git a/backend/schemas.py b/backend/schemas.py index 5a7ad14..7c480e6 100644 --- a/backend/schemas.py +++ b/backend/schemas.py @@ -63,6 +63,7 @@ class CreatorDetail(CreatorRead): technique_count: int = 0 moment_count: int = 0 follower_count: int = 0 + personality_profile: dict | None = None techniques: list[CreatorTechniqueItem] = [] genre_breakdown: dict[str, int] = {}