"""Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard. Provides filtering by experiment, status, and score range. The leaderboard endpoint returns top N runs ranked by weighted score. """ import uuid from fastapi import APIRouter, Depends, HTTPException, Query, status from sqlalchemy.orm import Session, joinedload from auth import get_current_user from engine.cache import compute_config_hash from engine.tasks import dispatch_run from main import get_db from models import Experiment, Run, RunStatus, Score, StageResult, User from schemas import ( AdHocRunCreate, LeaderboardEntry, LeaderboardResponse, RunDetailResponse, RunListResponse, RunResponse, ScoreInput, ScoreResponse, StageResultResponse, ) router = APIRouter() # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run: run = db.query(Run).filter(Run.id == run_id).first() if run is None: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found") return run # --------------------------------------------------------------------------- # List runs with filtering # --------------------------------------------------------------------------- @router.get("/", response_model=RunListResponse) def list_runs( experiment_id: uuid.UUID | None = Query(None), run_status: RunStatus | None = Query(None, alias="status"), min_score: float | None = Query(None, ge=0.0, le=1.0), max_score: float | None = Query(None, ge=0.0, le=1.0), limit: int = Query(50, ge=1, le=500), offset: int = Query(0, ge=0), db: Session = Depends(get_db), _user: User = Depends(get_current_user), ) -> RunListResponse: """List runs with optional filtering by experiment, status, and score range.""" query = db.query(Run) if experiment_id is not None: query = query.filter(Run.experiment_id == experiment_id) if run_status is not None: query = 
query.filter(Run.status == run_status) # Score range filtering: filter runs whose average score falls within range if min_score is not None or max_score is not None: from sqlalchemy import func score_subquery = ( db.query(Score.run_id, func.avg(Score.value).label("avg_score")) .group_by(Score.run_id) .subquery() ) query = query.join(score_subquery, Run.id == score_subquery.c.run_id) if min_score is not None: query = query.filter(score_subquery.c.avg_score >= min_score) if max_score is not None: query = query.filter(score_subquery.c.avg_score <= max_score) total = query.count() runs = query.order_by(Run.started_at.desc().nullslast()).offset(offset).limit(limit).all() return RunListResponse( items=[RunResponse.model_validate(r) for r in runs], total=total, ) # --------------------------------------------------------------------------- # Get run detail # --------------------------------------------------------------------------- @router.get("/{run_id}", response_model=RunDetailResponse) def get_run( run_id: uuid.UUID, db: Session = Depends(get_db), _user: User = Depends(get_current_user), ) -> RunDetailResponse: """Get run detail with stage results and scores.""" run = ( db.query(Run) .options(joinedload(Run.stage_results), joinedload(Run.scores)) .filter(Run.id == run_id) .first() ) if run is None: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found") return RunDetailResponse( id=run.id, experiment_id=run.experiment_id, config_hash=run.config_hash, config=run.config, status=run.status, started_at=run.started_at, completed_at=run.completed_at, duration_ms=run.duration_ms, tokens_in=run.tokens_in, tokens_out=run.tokens_out, cost_estimate=float(run.cost_estimate) if run.cost_estimate is not None else None, stage_results=[StageResultResponse.model_validate(sr) for sr in run.stage_results], scores=[ScoreResponse.model_validate(s) for s in run.scores], ) # --------------------------------------------------------------------------- # Ad-hoc 
# ---------------------------------------------------------------------------
# Ad-hoc single run
# ---------------------------------------------------------------------------


@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED)
def create_run(
    body: AdHocRunCreate,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunResponse:
    """Create and dispatch an ad-hoc single run.

    Args:
        body: Request payload carrying ``experiment_id`` and ``config``.
        db: Database session.
        _user: Authenticated user; required for access, otherwise unused.

    Returns:
        The newly created run, in ``pending`` status.

    Raises:
        HTTPException: 404 when the referenced experiment does not exist.
    """
    # Verify experiment exists
    experiment = db.query(Experiment).filter(Experiment.id == body.experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # Missing config keys fall back to empty values before hashing.
    config_hash = compute_config_hash(
        prompt=body.config.get("prompt", ""),
        model=body.config.get("model", ""),
        params=body.config.get("params", {}),
        input_data=body.config.get("input_data", ""),
    )

    run = Run(
        experiment_id=body.experiment_id,
        config=body.config,
        config_hash=config_hash,
        status=RunStatus.pending,
    )
    db.add(run)
    db.commit()
    db.refresh(run)

    # Dispatch execution asynchronously.
    # NOTE(review): the row is committed before dispatch, so if dispatch_run
    # raises the run stays in ``pending`` — confirm this is acceptable.
    dispatch_run(str(run.id))

    return RunResponse.model_validate(run)


# ---------------------------------------------------------------------------
# Human scoring
# ---------------------------------------------------------------------------


@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED)
def score_run(
    run_id: uuid.UUID,
    body: ScoreInput,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> ScoreResponse:
    """Add a human rating/score to a run.

    Args:
        run_id: Identifier of the run being scored.
        body: Score payload (``scorer_name``, ``value``, ``metadata``).
        db: Database session.
        _user: Authenticated user; required for access, otherwise unused.

    Returns:
        The persisted score.

    Raises:
        HTTPException: 404 when the run does not exist.
    """
    run = _get_run_or_404(db, run_id)

    score = Score(
        run_id=run.id,
        scorer_name=body.scorer_name,
        value=body.value,
        scorer_metadata=body.metadata,
    )
    db.add(score)
    db.commit()
    db.refresh(score)

    return ScoreResponse.model_validate(score)


# ---------------------------------------------------------------------------
# Leaderboard
# ---------------------------------------------------------------------------


def _weighted_score(score_map: dict[str, float], weights: dict[str, float]) -> float:
    """Combine per-scorer values into a single score.

    Scorers absent from ``weights`` contribute weight 0. When ``weights`` is
    empty, or the weights of the scorers present do not sum to a positive
    number, the plain mean of all values is returned instead.

    Args:
        score_map: Scorer name -> value; must be non-empty.
        weights: Scorer name -> weight; may be empty.

    Returns:
        The weighted average, or the unweighted mean as a fallback.
    """
    if weights:
        total_weight = sum(weights.get(name, 0.0) for name in score_map)
        if total_weight > 0:
            weighted_sum = sum(
                value * weights[name]
                for name, value in score_map.items()
                if name in weights
            )
            return weighted_sum / total_weight
    # No weights configured, or none match — equal weighting.
    return sum(score_map.values()) / len(score_map)


@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse)
def leaderboard(
    experiment_id: uuid.UUID,
    top_n: int = Query(10, ge=1, le=100),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> LeaderboardResponse:
    """Top N runs ranked by weighted score for an experiment.

    Weighted score uses the experiment's scoring_config weights if available,
    otherwise uses equal weighting across all scorers. Only completed runs
    that have at least one score are ranked.

    Args:
        experiment_id: Experiment whose runs are ranked.
        top_n: Maximum number of entries to return.
        db: Database session.
        _user: Authenticated user; required for access, otherwise unused.

    Returns:
        The best ``top_n`` entries in descending score order. ``total`` is the
        number of ranked runs before truncation to ``top_n``.

    Raises:
        HTTPException: 404 when the experiment does not exist.
    """
    # Verify experiment exists
    experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # Get scoring weights from experiment config. Anything other than a dict
    # under "weights" is ignored, so a malformed config degrades to equal
    # weighting instead of failing the request.
    weights: dict[str, float] = {}
    if experiment.scoring_config and isinstance(experiment.scoring_config, dict):
        configured = experiment.scoring_config.get("weights", {})
        if isinstance(configured, dict):
            weights = configured

    # Get all completed runs for this experiment with their scores
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed)
        .all()
    )

    entries: list[LeaderboardEntry] = []
    for run in runs:
        if not run.scores:
            continue

        # With several scores under one scorer_name, the last one iterated wins.
        # NOTE(review): that is "the latest" only if Run.scores is ordered
        # chronologically — confirm against the relationship definition.
        score_map: dict[str, float] = {s.scorer_name: s.value for s in run.scores}

        entries.append(
            LeaderboardEntry(
                run_id=run.id,
                config_hash=run.config_hash,
                config=run.config,
                status=run.status,
                weighted_score=_weighted_score(score_map, weights),
                scores=score_map,
                duration_ms=run.duration_ms,
                tokens_in=run.tokens_in,
                tokens_out=run.tokens_out,
            )
        )

    # Sort by weighted_score descending, then take the top N. The count is
    # captured before slicing so ``total`` reports every ranked run, not just
    # the size of the returned page.
    entries.sort(key=lambda e: e.weighted_score, reverse=True)
    ranked_total = len(entries)

    return LeaderboardResponse(
        experiment_id=experiment_id,
        entries=entries[:top_n],
        total=ranked_total,
    )