# promptlooper/backend/routers/runs.py

"""Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard.

Provides filtering by experiment, status, and score range. The leaderboard
endpoint returns top N runs ranked by weighted score.
"""

import uuid

from fastapi import APIRouter, Depends, HTTPException, Query, status
from sqlalchemy.orm import Session, joinedload

from auth import get_current_user
from engine.cache import compute_config_hash
from engine.tasks import dispatch_run
from main import get_db
from models import Experiment, Run, RunStatus, Score, StageResult, User
from schemas import (
    AdHocRunCreate,
    LeaderboardEntry,
    LeaderboardResponse,
    RunDetailResponse,
    RunListResponse,
    RunResponse,
    ScoreInput,
    ScoreResponse,
    StageResultResponse,
)

router = APIRouter()


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run:
    """Load a run by id, aborting the request with a 404 when it is absent.

    Args:
        db: SQLAlchemy session used for the lookup.
        run_id: Primary key of the run to fetch.

    Returns:
        The ``Run`` row whose id equals ``run_id``.

    Raises:
        HTTPException: 404 ("Run not found") when no such row exists.
    """
    match = db.query(Run).filter(Run.id == run_id).first()
    if match is not None:
        return match
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")


# ---------------------------------------------------------------------------
# List runs with filtering
# ---------------------------------------------------------------------------


@router.get("/", response_model=RunListResponse)
def list_runs(
    experiment_id: uuid.UUID | None = Query(None),
    run_status: RunStatus | None = Query(None, alias="status"),
    min_score: float | None = Query(None, ge=0.0, le=1.0),
    max_score: float | None = Query(None, ge=0.0, le=1.0),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunListResponse:
    """List runs with optional filtering by experiment, status, and score range.

    Args:
        experiment_id: When given, only runs of this experiment are returned.
        run_status: When given (query parameter ``status``), only runs in this
            status are returned.
        min_score: Inclusive lower bound on a run's average score.
        max_score: Inclusive upper bound on a run's average score.
        limit: Page size.
        offset: Number of rows to skip before the page starts.
        db: Request-scoped database session.
        _user: Authenticated user; required only to enforce authentication.

    Returns:
        A page of runs plus ``total``, the number of runs matching the filters
        before pagination is applied.

    Note:
        Supplying either score bound inner-joins against the per-run average
        score, so runs that have no scores at all are excluded from the result.
    """
    query = db.query(Run)

    if experiment_id is not None:
        query = query.filter(Run.experiment_id == experiment_id)

    if run_status is not None:
        query = query.filter(Run.status == run_status)

    # Score range filtering: filter runs whose average score falls within range
    if min_score is not None or max_score is not None:
        from sqlalchemy import func

        score_subquery = (
            db.query(Score.run_id, func.avg(Score.value).label("avg_score"))
            .group_by(Score.run_id)
            .subquery()
        )
        query = query.join(score_subquery, Run.id == score_subquery.c.run_id)

        if min_score is not None:
            query = query.filter(score_subquery.c.avg_score >= min_score)
        if max_score is not None:
            query = query.filter(score_subquery.c.avg_score <= max_score)

    total = query.count()

    # started_at is not unique (and may be NULL), so it cannot define a total
    # order on its own. Run.id is added as a tie-breaker so that consecutive
    # offset/limit pages neither repeat nor skip rows.
    runs = (
        query.order_by(Run.started_at.desc().nullslast(), Run.id)
        .offset(offset)
        .limit(limit)
        .all()
    )

    return RunListResponse(
        items=[RunResponse.model_validate(r) for r in runs],
        total=total,
    )


# ---------------------------------------------------------------------------
# Get run detail
# ---------------------------------------------------------------------------


@router.get("/{run_id}", response_model=RunDetailResponse)
def get_run(
    run_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunDetailResponse:
    """Return a single run together with its stage results and scores.

    Args:
        run_id: Identifier of the run to fetch.
        db: Request-scoped database session.
        _user: Authenticated user; required only to enforce authentication.

    Returns:
        The run's own fields plus its serialized stage results and scores.

    Raises:
        HTTPException: 404 ("Run not found") when the run does not exist.
    """
    # Both collections are eager-loaded in the same query as the run itself.
    eager_options = (joinedload(Run.stage_results), joinedload(Run.scores))
    run = db.query(Run).options(*eager_options).filter(Run.id == run_id).first()
    if run is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")

    # cost_estimate is passed through float() unless it is unset.
    cost = None if run.cost_estimate is None else float(run.cost_estimate)
    stage_payload = [StageResultResponse.model_validate(stage) for stage in run.stage_results]
    score_payload = [ScoreResponse.model_validate(entry) for entry in run.scores]

    return RunDetailResponse(
        id=run.id,
        experiment_id=run.experiment_id,
        config_hash=run.config_hash,
        config=run.config,
        status=run.status,
        started_at=run.started_at,
        completed_at=run.completed_at,
        duration_ms=run.duration_ms,
        tokens_in=run.tokens_in,
        tokens_out=run.tokens_out,
        cost_estimate=cost,
        stage_results=stage_payload,
        scores=score_payload,
    )


# ---------------------------------------------------------------------------
# Ad-hoc single run
# ---------------------------------------------------------------------------


@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED)
def create_run(
    body: AdHocRunCreate,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunResponse:
    """Persist a new pending run for an experiment and hand it to the dispatcher.

    Args:
        body: Request payload carrying ``experiment_id`` and the run ``config``.
        db: Request-scoped database session.
        _user: Authenticated user; required only to enforce authentication.

    Returns:
        The newly created run as stored after commit.

    Raises:
        HTTPException: 404 ("Experiment not found") when the referenced
            experiment does not exist.
    """
    # Reject the request early if the parent experiment is missing.
    parent = db.query(Experiment).filter(Experiment.id == body.experiment_id).first()
    if parent is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # The hash is derived from four config entries, each with a default for
    # when the key is absent.
    cfg = body.config
    hash_inputs = {
        "prompt": cfg.get("prompt", ""),
        "model": cfg.get("model", ""),
        "params": cfg.get("params", {}),
        "input_data": cfg.get("input_data", ""),
    }
    digest = compute_config_hash(**hash_inputs)

    new_run = Run(
        experiment_id=body.experiment_id,
        config=cfg,
        config_hash=digest,
        status=RunStatus.pending,
    )
    db.add(new_run)
    db.commit()
    db.refresh(new_run)

    # The row is committed before dispatch; the dispatcher receives only the id.
    dispatch_run(str(new_run.id))

    return RunResponse.model_validate(new_run)


# ---------------------------------------------------------------------------
# Human scoring
# ---------------------------------------------------------------------------


@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED)
def score_run(
    run_id: uuid.UUID,
    body: ScoreInput,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> ScoreResponse:
    """Attach a human-supplied score to an existing run.

    Args:
        run_id: Identifier of the run being scored.
        body: Score payload (``scorer_name``, ``value``, ``metadata``).
        db: Request-scoped database session.
        _user: Authenticated user; required only to enforce authentication.

    Returns:
        The stored score as it exists after commit.

    Raises:
        HTTPException: 404 ("Run not found") when the run does not exist.
    """
    target_run = _get_run_or_404(db, run_id)

    # Every call inserts a new Score row; no existing row is looked up or updated.
    new_score = Score(
        run_id=target_run.id,
        scorer_name=body.scorer_name,
        value=body.value,
        scorer_metadata=body.metadata,
    )
    db.add(new_score)
    db.commit()
    db.refresh(new_score)

    return ScoreResponse.model_validate(new_score)


# ---------------------------------------------------------------------------
# Leaderboard
# ---------------------------------------------------------------------------


def _weighted_score(score_map: dict[str, float], weights: dict[str, float]) -> float:
    """Collapse one run's per-scorer values into a single ranking score.

    Scorers that have an entry in ``weights`` contribute a weighted average;
    scorers without an entry are ignored. When no scorer is weighted, or the
    applicable weights do not sum to a positive number, the plain mean of all
    scores is returned instead.

    Args:
        score_map: Non-empty mapping of scorer name to score value.
        weights: Mapping of scorer name to weight; may be empty.

    Returns:
        The weighted average, or the unweighted mean as a fallback.
    """
    applicable = {name: weights[name] for name in score_map if name in weights}
    total_weight = sum(applicable.values())
    if total_weight > 0:
        weighted_sum = sum(score_map[name] * weight for name, weight in applicable.items())
        return weighted_sum / total_weight
    # No usable weights — fall back to equal weighting
    return sum(score_map.values()) / len(score_map)


@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse)
def leaderboard(
    experiment_id: uuid.UUID,
    top_n: int = Query(10, ge=1, le=100),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> LeaderboardResponse:
    """Top N runs ranked by weighted score for an experiment.

    Weighted score uses the experiment's scoring_config weights if available,
    otherwise uses equal weighting across all scorers. Only completed runs
    that have at least one score are ranked.

    Args:
        experiment_id: Experiment whose runs are ranked.
        top_n: Maximum number of entries to return.
        db: Request-scoped database session.
        _user: Authenticated user; required only to enforce authentication.

    Returns:
        Entries sorted by ``weighted_score`` descending. ``total`` is the
        number of entries returned, i.e. it is counted after the ``top_n``
        truncation.

    Raises:
        HTTPException: 404 ("Experiment not found") when the experiment does
            not exist.
    """
    # Verify experiment exists
    experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # Get scoring weights from experiment config. Anything other than a dict
    # under "weights" (missing key, null, list, ...) is treated as "no weights"
    # rather than being allowed to fail later on a .get() call.
    weights: dict[str, float] = {}
    scoring_config = experiment.scoring_config
    if isinstance(scoring_config, dict):
        configured = scoring_config.get("weights")
        if isinstance(configured, dict):
            weights = configured

    # Get all completed runs for this experiment with their scores
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed)
        .all()
    )

    entries: list[LeaderboardEntry] = []
    for run in runs:
        if not run.scores:
            continue

        # When several scores share a scorer_name, the one iterated last wins.
        # NOTE(review): that is only the most recent score if Run.scores is
        # loaded in creation order — confirm the relationship's ordering.
        score_map: dict[str, float] = {s.scorer_name: s.value for s in run.scores}

        entries.append(LeaderboardEntry(
            run_id=run.id,
            config_hash=run.config_hash,
            config=run.config,
            status=run.status,
            weighted_score=_weighted_score(score_map, weights),
            scores=score_map,
            duration_ms=run.duration_ms,
            tokens_in=run.tokens_in,
            tokens_out=run.tokens_out,
        ))

    # Sort by weighted_score descending, take top N
    entries.sort(key=lambda e: e.weighted_score, reverse=True)
    entries = entries[:top_n]

    return LeaderboardResponse(
        experiment_id=experiment_id,
        entries=entries,
        total=len(entries),
    )