promptlooper/backend/routers/runs.py
John Lightner b3fb8e3063 MAESTRO: Implement runs router with full CRUD, filtering, scoring, and leaderboard
- List runs with filtering by experiment, status, and score range plus pagination
- Get run detail with eager-loaded stage results and scores
- Ad-hoc single run creation with Celery/sync dispatch
- Human scoring endpoint (POST /{id}/score)
- Leaderboard endpoint with configurable weighted scoring from experiment scoring_config
- Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas
- 25 tests in test_runs.py, all passing (503 total tests passing)
2026-04-07 03:24:56 -05:00

281 lines
9.1 KiB
Python

"""Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard.
Provides filtering by experiment, status, and score range. The leaderboard
endpoint returns top N runs ranked by weighted score.
"""
import uuid
from fastapi import APIRouter, Depends, HTTPException, Query, status
from sqlalchemy.orm import Session, joinedload
from auth import get_current_user
from engine.cache import compute_config_hash
from engine.tasks import dispatch_run
from main import get_db
from models import Experiment, Run, RunStatus, Score, StageResult, User
from schemas import (
AdHocRunCreate,
LeaderboardEntry,
LeaderboardResponse,
RunDetailResponse,
RunListResponse,
RunResponse,
ScoreInput,
ScoreResponse,
StageResultResponse,
)
# Router on which the run endpoints in this module are registered via decorators.
router = APIRouter()
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run:
    """Load the run with the given id, or abort the request with HTTP 404.

    Args:
        db: Database session used for the lookup.
        run_id: Id of the run to fetch.

    Returns:
        The first ``Run`` row whose ``id`` equals ``run_id``.

    Raises:
        HTTPException: 404 with detail "Run not found" when no row matches.
    """
    found = db.query(Run).filter(Run.id == run_id).first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")
# ---------------------------------------------------------------------------
# List runs with filtering
# ---------------------------------------------------------------------------
@router.get("/", response_model=RunListResponse)
def list_runs(
    experiment_id: uuid.UUID | None = Query(None),
    run_status: RunStatus | None = Query(None, alias="status"),
    min_score: float | None = Query(None, ge=0.0, le=1.0),
    max_score: float | None = Query(None, ge=0.0, le=1.0),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunListResponse:
    """List runs with optional filtering by experiment, status, and score range.

    Args:
        experiment_id: When given, only runs of this experiment are returned.
        run_status: When given (query parameter ``status``), only runs in this
            status are returned.
        min_score: Inclusive lower bound on a run's average score.
        max_score: Inclusive upper bound on a run's average score.
        limit: Maximum number of runs in the page (1-500).
        offset: Number of matching runs to skip before the page starts.
        db: Database session.
        _user: Authenticated user; resolved only to enforce authentication.

    Returns:
        A ``RunListResponse`` holding the requested page in ``items`` and the
        number of runs matching the filters (before pagination) in ``total``.
    """
    query = db.query(Run)
    if experiment_id is not None:
        query = query.filter(Run.experiment_id == experiment_id)
    if run_status is not None:
        query = query.filter(Run.status == run_status)

    # Score range filtering: filter runs whose average score falls within range.
    # The inner join means runs without any score are excluded whenever a score
    # bound is supplied.
    if min_score is not None or max_score is not None:
        from sqlalchemy import func

        score_subquery = (
            db.query(Score.run_id, func.avg(Score.value).label("avg_score"))
            .group_by(Score.run_id)
            .subquery()
        )
        query = query.join(score_subquery, Run.id == score_subquery.c.run_id)
        if min_score is not None:
            query = query.filter(score_subquery.c.avg_score >= min_score)
        if max_score is not None:
            query = query.filter(score_subquery.c.avg_score <= max_score)

    total = query.count()

    # started_at is not unique (and is NULL for runs that have not started), so
    # Run.id is added as a tiebreaker: without a total order, OFFSET/LIMIT pages
    # may repeat or skip rows between requests.
    runs = (
        query.order_by(Run.started_at.desc().nullslast(), Run.id)
        .offset(offset)
        .limit(limit)
        .all()
    )
    return RunListResponse(
        items=[RunResponse.model_validate(r) for r in runs],
        total=total,
    )
# ---------------------------------------------------------------------------
# Get run detail
# ---------------------------------------------------------------------------
@router.get("/{run_id}", response_model=RunDetailResponse)
def get_run(
    run_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunDetailResponse:
    """Return a single run together with its stage results and scores.

    Args:
        run_id: Id of the run to fetch.
        db: Database session.
        _user: Authenticated user; resolved only to enforce authentication.

    Returns:
        A ``RunDetailResponse`` built from the run row and its related
        ``stage_results`` and ``scores``.

    Raises:
        HTTPException: 404 with detail "Run not found" when no run matches.
    """
    # Load both related collections in the same query as the run itself.
    eager_options = (joinedload(Run.stage_results), joinedload(Run.scores))
    run = db.query(Run).options(*eager_options).filter(Run.id == run_id).first()
    if run is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")

    # cost_estimate is passed through float() when present; None stays None.
    cost = None if run.cost_estimate is None else float(run.cost_estimate)
    stage_payload = [StageResultResponse.model_validate(sr) for sr in run.stage_results]
    score_payload = [ScoreResponse.model_validate(s) for s in run.scores]

    return RunDetailResponse(
        id=run.id,
        experiment_id=run.experiment_id,
        config_hash=run.config_hash,
        config=run.config,
        status=run.status,
        started_at=run.started_at,
        completed_at=run.completed_at,
        duration_ms=run.duration_ms,
        tokens_in=run.tokens_in,
        tokens_out=run.tokens_out,
        cost_estimate=cost,
        stage_results=stage_payload,
        scores=score_payload,
    )
# ---------------------------------------------------------------------------
# Ad-hoc single run
# ---------------------------------------------------------------------------
@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED)
def create_run(
    body: AdHocRunCreate,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunResponse:
    """Create a pending ad-hoc run for an experiment and dispatch it.

    Args:
        body: Request payload carrying ``experiment_id`` and the run ``config``.
        db: Database session.
        _user: Authenticated user; resolved only to enforce authentication.

    Returns:
        The newly persisted run as a ``RunResponse``.

    Raises:
        HTTPException: 404 with detail "Experiment not found" when the
            referenced experiment does not exist.
    """
    # Reject runs that point at a non-existent experiment.
    parent = db.query(Experiment).filter(Experiment.id == body.experiment_id).first()
    if parent is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # Missing config keys fall back to empty values when computing the hash.
    cfg = body.config
    digest = compute_config_hash(
        prompt=cfg.get("prompt", ""),
        model=cfg.get("model", ""),
        params=cfg.get("params", {}),
        input_data=cfg.get("input_data", ""),
    )

    new_run = Run(
        experiment_id=body.experiment_id,
        config=cfg,
        config_hash=digest,
        status=RunStatus.pending,
    )
    db.add(new_run)
    db.commit()
    db.refresh(new_run)

    # The row is committed before dispatch_run receives the run id
    # (presumably so the executor can load it; confirm in engine.tasks).
    dispatch_run(str(new_run.id))
    return RunResponse.model_validate(new_run)
# ---------------------------------------------------------------------------
# Human scoring
# ---------------------------------------------------------------------------
@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED)
def score_run(
    run_id: uuid.UUID,
    body: ScoreInput,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> ScoreResponse:
    """Record a new score (e.g. a human rating) against an existing run.

    Args:
        run_id: Id of the run being scored.
        body: Score payload providing ``scorer_name``, ``value`` and ``metadata``.
        db: Database session.
        _user: Authenticated user; resolved only to enforce authentication.

    Returns:
        The persisted score as a ``ScoreResponse``.

    Raises:
        HTTPException: 404 with detail "Run not found" when the run does not exist.
    """
    target_run = _get_run_or_404(db, run_id)

    # The request's ``metadata`` field is stored on the ``scorer_metadata`` column.
    new_score = Score(
        run_id=target_run.id,
        scorer_name=body.scorer_name,
        value=body.value,
        scorer_metadata=body.metadata,
    )
    db.add(new_score)
    db.commit()
    db.refresh(new_score)
    return ScoreResponse.model_validate(new_score)
# ---------------------------------------------------------------------------
# Leaderboard
# ---------------------------------------------------------------------------
def _extract_weights(scoring_config: object) -> dict[str, float]:
    """Return the usable per-scorer weights from an experiment's scoring config.

    Only a ``{"weights": {scorer_name: number}}`` shape is honoured. A missing
    or non-dict config, a missing or non-dict ``weights`` entry, and individual
    weights that are not plain numbers (booleans included) are ignored rather
    than allowed to raise while ranking.
    """
    if not isinstance(scoring_config, dict):
        return {}
    raw_weights = scoring_config.get("weights")
    if not isinstance(raw_weights, dict):
        return {}
    return {
        name: weight
        for name, weight in raw_weights.items()
        if isinstance(weight, (int, float)) and not isinstance(weight, bool)
    }


def _weighted_score(score_map: dict[str, float], weights: dict[str, float]) -> float:
    """Combine a run's scores into one number using ``weights``.

    Scorers absent from ``weights`` contribute nothing to the weighted mean.
    When the weights matching this run's scorers do not sum to a positive
    value (including when ``weights`` is empty), the plain mean of all scores
    is returned instead. ``score_map`` must be non-empty.
    """
    total_weight = sum(weights.get(name, 0.0) for name in score_map)
    if total_weight > 0:
        weighted_sum = sum(
            value * weights[name]
            for name, value in score_map.items()
            if name in weights
        )
        return weighted_sum / total_weight
    # No matching weights — fall back to equal weighting
    return sum(score_map.values()) / len(score_map)


@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse)
def leaderboard(
    experiment_id: uuid.UUID,
    top_n: int = Query(10, ge=1, le=100),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> LeaderboardResponse:
    """Top N runs ranked by weighted score for an experiment.

    Weighted score uses the experiment's scoring_config weights if available,
    otherwise uses equal weighting across all scorers. Only completed runs
    that have at least one score are ranked.

    Args:
        experiment_id: Experiment whose runs are ranked.
        top_n: Maximum number of entries to return (1-100).
        db: Database session.
        _user: Authenticated user; resolved only to enforce authentication.

    Returns:
        A ``LeaderboardResponse`` with entries sorted by ``weighted_score``
        in descending order.

    Raises:
        HTTPException: 404 with detail "Experiment not found" when the
            experiment does not exist.
    """
    # Verify experiment exists
    experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # Malformed weight configuration degrades to equal weighting instead of
    # failing the request.
    weights = _extract_weights(experiment.scoring_config)

    # Get all completed runs for this experiment with their scores
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed)
        .all()
    )

    entries: list[LeaderboardEntry] = []
    for run in runs:
        if not run.scores:
            continue
        # With several scores from the same scorer, the one iterated last wins.
        # NOTE(review): that is "the latest" only if Run.scores is ordered by
        # creation time — confirm the relationship's ordering in models.
        score_map: dict[str, float] = {s.scorer_name: s.value for s in run.scores}
        entries.append(LeaderboardEntry(
            run_id=run.id,
            config_hash=run.config_hash,
            config=run.config,
            status=run.status,
            weighted_score=_weighted_score(score_map, weights),
            scores=score_map,
            duration_ms=run.duration_ms,
            tokens_in=run.tokens_in,
            tokens_out=run.tokens_out,
        ))

    # Sort by weighted_score descending, take top N
    entries.sort(key=lambda e: e.weighted_score, reverse=True)
    entries = entries[:top_n]

    # NOTE(review): ``total`` is the number of entries returned (after the
    # top_n cut), not the number of ranked runs — kept as-is for callers.
    return LeaderboardResponse(
        experiment_id=experiment_id,
        entries=entries,
        total=len(entries),
    )