- List runs with filtering by experiment, status, and score range plus pagination
- Get run detail with eager-loaded stage results and scores
- Ad-hoc single run creation with Celery/sync dispatch
- Human scoring endpoint (POST /{id}/score)
- Leaderboard endpoint with configurable weighted scoring from experiment scoring_config
- Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas
- 25 tests in test_runs.py, all passing (503 total tests passing)
281 lines
9.1 KiB
Python
281 lines
9.1 KiB
Python
"""Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard.

Provides filtering by experiment, status, and score range. The leaderboard
endpoint returns top N runs ranked by weighted score.
"""
|
|
|
|
import uuid
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
|
from sqlalchemy.orm import Session, joinedload
|
|
|
|
from auth import get_current_user
|
|
from engine.cache import compute_config_hash
|
|
from engine.tasks import dispatch_run
|
|
from main import get_db
|
|
from models import Experiment, Run, RunStatus, Score, StageResult, User
|
|
from schemas import (
|
|
AdHocRunCreate,
|
|
LeaderboardEntry,
|
|
LeaderboardResponse,
|
|
RunDetailResponse,
|
|
RunListResponse,
|
|
RunResponse,
|
|
ScoreInput,
|
|
ScoreResponse,
|
|
StageResultResponse,
|
|
)
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run:
    """Return the Run whose id equals *run_id*, or abort the request with a 404.

    Args:
        db: Session used to issue the lookup query.
        run_id: Identifier compared against ``Run.id``.

    Returns:
        The first Run row matching the identifier.

    Raises:
        HTTPException: With status 404 and detail "Run not found" when the
            query yields no row.
    """
    found = db.query(Run).filter(Run.id == run_id).first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# List runs with filtering
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.get("/", response_model=RunListResponse)
def list_runs(
    experiment_id: uuid.UUID | None = Query(None),
    run_status: RunStatus | None = Query(None, alias="status"),
    min_score: float | None = Query(None, ge=0.0, le=1.0),
    max_score: float | None = Query(None, ge=0.0, le=1.0),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunListResponse:
    """List runs with optional filtering by experiment, status, and score range."""
    # NOTE: the docstring above is left unchanged on purpose; FastAPI publishes
    # it as the operation description, so extra documentation lives in comments.
    #
    # Parameters (all query-string values):
    #   experiment_id - keep only runs whose Run.experiment_id equals it.
    #   run_status    - keep only runs with this status; exposed as "status".
    #   min_score /   - inclusive bounds on the run's average Score.value,
    #   max_score       each constrained to [0.0, 1.0].
    #   limit, offset - page size (1..500, default 50) and rows to skip.
    #   _user         - result of get_current_user; not used in the body
    #                   (presumably there to require authentication - confirm).
    #
    # Returns a RunListResponse whose `items` is the requested page and whose
    # `total` is the number of rows matching the filters before pagination.
    query = db.query(Run)

    if experiment_id is not None:
        query = query.filter(Run.experiment_id == experiment_id)

    if run_status is not None:
        query = query.filter(Run.status == run_status)

    # Score range filtering: compare each run's average score to the bounds.
    if min_score is not None or max_score is not None:
        from sqlalchemy import func

        score_subquery = (
            db.query(Score.run_id, func.avg(Score.value).label("avg_score"))
            .group_by(Score.run_id)
            .subquery()
        )
        # Inner join: a run without any Score row has no average, so it is
        # dropped from the result as soon as either bound is supplied.
        query = query.join(score_subquery, Run.id == score_subquery.c.run_id)

        if min_score is not None:
            query = query.filter(score_subquery.c.avg_score >= min_score)
        if max_score is not None:
            query = query.filter(score_subquery.c.avg_score <= max_score)

    total = query.count()

    # started_at is not unique and may be NULL, so on its own it leaves the
    # relative order of tied rows unspecified; with OFFSET/LIMIT that can make
    # a row appear on two pages or on none. Run.id breaks ties so that every
    # page is cut from the same total ordering.
    runs = (
        query.order_by(Run.started_at.desc().nullslast(), Run.id)
        .offset(offset)
        .limit(limit)
        .all()
    )

    return RunListResponse(
        items=[RunResponse.model_validate(r) for r in runs],
        total=total,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Get run detail
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.get("/{run_id}", response_model=RunDetailResponse)
def get_run(
    run_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunDetailResponse:
    """Get run detail with stage results and scores."""
    # Responds with 404 "Run not found" when no Run has the given id.
    # Both collections are requested with joinedload so they arrive with the
    # run instead of being fetched on first attribute access.
    eager_options = (joinedload(Run.stage_results), joinedload(Run.scores))
    run = db.query(Run).options(*eager_options).filter(Run.id == run_id).first()
    if run is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")

    # cost_estimate is passed through float() only when it is set; an unset
    # estimate stays None in the response.
    if run.cost_estimate is None:
        cost_estimate = None
    else:
        cost_estimate = float(run.cost_estimate)

    stage_payload = [StageResultResponse.model_validate(stage) for stage in run.stage_results]
    score_payload = [ScoreResponse.model_validate(score) for score in run.scores]

    return RunDetailResponse(
        id=run.id,
        experiment_id=run.experiment_id,
        config_hash=run.config_hash,
        config=run.config,
        status=run.status,
        started_at=run.started_at,
        completed_at=run.completed_at,
        duration_ms=run.duration_ms,
        tokens_in=run.tokens_in,
        tokens_out=run.tokens_out,
        cost_estimate=cost_estimate,
        stage_results=stage_payload,
        scores=score_payload,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Ad-hoc single run
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED)
def create_run(
    body: AdHocRunCreate,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunResponse:
    """Create and dispatch an ad-hoc single run."""
    # Responds with 404 "Experiment not found" when body.experiment_id does not
    # match an existing Experiment; otherwise returns the stored run (HTTP 201).
    parent = db.query(Experiment).filter(Experiment.id == body.experiment_id).first()
    if parent is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # The hash is derived from four entries of the submitted config; a missing
    # entry contributes an empty string (or an empty dict for "params").
    cfg = body.config
    digest = compute_config_hash(
        prompt=cfg.get("prompt", ""),
        model=cfg.get("model", ""),
        params=cfg.get("params", {}),
        input_data=cfg.get("input_data", ""),
    )

    new_run = Run(
        experiment_id=body.experiment_id,
        config=cfg,
        config_hash=digest,
        status=RunStatus.pending,
    )
    db.add(new_run)
    db.commit()
    # Reload the row after the commit so the generated id is available below.
    db.refresh(new_run)

    # The run is committed before dispatch; dispatch_run receives only the run
    # id (as a string). How and when it executes is decided by dispatch_run.
    dispatch_run(str(new_run.id))

    return RunResponse.model_validate(new_run)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Human scoring
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED)
def score_run(
    run_id: uuid.UUID,
    body: ScoreInput,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> ScoreResponse:
    """Add a human rating/score to a run."""
    # The helper raises a 404 if the run does not exist, so nothing is written
    # for an unknown run_id.
    target_run = _get_run_or_404(db, run_id)

    # The request's `metadata` field is stored under Score.scorer_metadata.
    new_score = Score(
        run_id=target_run.id,
        scorer_name=body.scorer_name,
        value=body.value,
        scorer_metadata=body.metadata,
    )
    db.add(new_score)
    db.commit()
    # Reload the row after the commit before serialising it.
    db.refresh(new_score)

    return ScoreResponse.model_validate(new_score)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Leaderboard
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _scoring_weights(scoring_config: object) -> dict[str, float]:
    """Return the "weights" mapping of a scoring_config, or {} if unusable.

    An empty dict is returned when *scoring_config* is not a dict, or when its
    "weights" entry is absent or is not itself a dict. Callers can therefore
    always call ``.get`` on the result.
    """
    if not isinstance(scoring_config, dict):
        return {}
    raw_weights = scoring_config.get("weights")
    return raw_weights if isinstance(raw_weights, dict) else {}


def _weighted_score(score_map: dict[str, float], weights: dict[str, float]) -> float:
    """Return the weighted mean of *score_map*, falling back to the plain mean.

    Scorers without an entry in *weights* contribute nothing to the weighted
    mean. When the weights of the scorers present do not add up to a positive
    number (including when *weights* is empty), every score counts equally.
    *score_map* must be non-empty.
    """
    total_weight = sum(weights.get(name, 0.0) for name in score_map)
    if total_weight > 0:
        weighted_sum = sum(
            value * weights[name]
            for name, value in score_map.items()
            if name in weights
        )
        return weighted_sum / total_weight
    # No matching weights - fall back to equal weighting.
    return sum(score_map.values()) / len(score_map)


@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse)
def leaderboard(
    experiment_id: uuid.UUID,
    top_n: int = Query(10, ge=1, le=100),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> LeaderboardResponse:
    """Top N runs ranked by weighted score for an experiment.

    Weighted score uses the experiment's scoring_config weights if available,
    otherwise uses equal weighting across all scorers.
    """
    # Responds with 404 "Experiment not found" for an unknown experiment_id.
    experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # A malformed "weights" entry (anything but a dict) is treated as "no
    # weights configured" instead of failing the request.
    weights = _scoring_weights(experiment.scoring_config)

    # Only completed runs of this experiment are ranked; their scores are
    # loaded together with the runs.
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed)
        .all()
    )

    entries: list[LeaderboardEntry] = []
    for run in runs:
        # A run without any score cannot be ranked.
        if not run.scores:
            continue

        # One value per scorer_name: a later element of run.scores overwrites
        # an earlier one with the same name. NOTE(review): this is "the latest"
        # score only if the relationship yields scores oldest-first - confirm.
        score_map: dict[str, float] = {s.scorer_name: s.value for s in run.scores}

        entries.append(
            LeaderboardEntry(
                run_id=run.id,
                config_hash=run.config_hash,
                config=run.config,
                status=run.status,
                weighted_score=_weighted_score(score_map, weights),
                scores=score_map,
                duration_ms=run.duration_ms,
                tokens_in=run.tokens_in,
                tokens_out=run.tokens_out,
            )
        )

    # Highest weighted score first; the sort is stable, so tied runs keep the
    # order in which the query returned them. Then keep the first top_n.
    entries.sort(key=lambda e: e.weighted_score, reverse=True)
    entries = entries[:top_n]

    # `total` counts the entries actually returned (after the top_n cut).
    return LeaderboardResponse(
        experiment_id=experiment_id,
        entries=entries,
        total=len(entries),
    )
|