From b3fb8e306344a50809dae86e58c7327370f2032d Mon Sep 17 00:00:00 2001 From: John Lightner Date: Tue, 7 Apr 2026 03:24:56 -0500 Subject: [PATCH] MAESTRO: Implement runs router with full CRUD, filtering, scoring, and leaderboard - List runs with filtering by experiment, status, and score range plus pagination - Get run detail with eager-loaded stage results and scores - Ad-hoc single run creation with Celery/sync dispatch - Human scoring endpoint (POST /{id}/score) - Leaderboard endpoint with configurable weighted scoring from experiment scoring_config - Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas - 25 tests in test_runs.py, all passing (503 total tests passing) --- Auto Run Docs/02a-backend-engine.md | 3 +- backend/routers/runs.py | 288 ++++++++++++++++-- backend/schemas.py | 29 ++ backend/tests/test_routers.py | 15 +- backend/tests/test_runs.py | 454 ++++++++++++++++++++++++++++ 5 files changed, 759 insertions(+), 30 deletions(-) create mode 100644 backend/tests/test_runs.py diff --git a/Auto Run Docs/02a-backend-engine.md b/Auto Run Docs/02a-backend-engine.md index 1d61005..75547c1 100644 --- a/Auto Run Docs/02a-backend-engine.md +++ b/Auto Run Docs/02a-backend-engine.md @@ -38,7 +38,8 @@ Implement the core experiment execution engine: LLM adapters, response caching, - [x] Implement backend/routers/experiments.py fully — CRUD plus sweep control. POST /experiments/{id}/sweep should validate the sweep config, create Run records for all configurations, and dispatch to Celery. Pause/resume/stop should set Redis flags that the sweep runner checks between runs. -- [ ] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score. +- [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score. + - [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats. diff --git a/backend/routers/runs.py b/backend/routers/runs.py index 46a0937..094f737 100644 --- a/backend/routers/runs.py +++ b/backend/routers/runs.py @@ -1,37 +1,281 @@ -"""Runs router — execute, detail, score, and leaderboard.""" +"""Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard. + +Provides filtering by experiment, status, and score range. The leaderboard +endpoint returns top N runs ranked by weighted score. +""" import uuid -from fastapi import APIRouter, Response +from fastapi import APIRouter, Depends, HTTPException, Query, status +from sqlalchemy.orm import Session, joinedload + +from auth import get_current_user +from engine.cache import compute_config_hash +from engine.tasks import dispatch_run +from main import get_db +from models import Experiment, Run, RunStatus, Score, StageResult, User +from schemas import ( + AdHocRunCreate, + LeaderboardEntry, + LeaderboardResponse, + RunDetailResponse, + RunListResponse, + RunResponse, + ScoreInput, + ScoreResponse, + StageResultResponse, +) router = APIRouter() -@router.get("/experiments/{experiment_id}/runs", status_code=501) -def list_runs(experiment_id: uuid.UUID): - """List runs with scores (sortable, filterable).""" - return Response(status_code=501, content="Not Implemented") +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- -@router.get("/{run_id}", status_code=501) -def get_run(run_id: uuid.UUID): - """Run detail with stage results.""" - return Response(status_code=501, content="Not Implemented") +def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run: + run = db.query(Run).filter(Run.id == run_id).first() + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found") + return run -@router.post("/", status_code=501) -def create_run(): - """Execute a single run (ad-hoc).""" - return Response(status_code=501, content="Not Implemented") +# --------------------------------------------------------------------------- +# List runs with filtering +# --------------------------------------------------------------------------- -@router.post("/{run_id}/score", status_code=501) -def score_run(run_id: uuid.UUID): - """Add human rating to a run.""" - return Response(status_code=501, content="Not Implemented") +@router.get("/", response_model=RunListResponse) +def list_runs( + experiment_id: uuid.UUID | None = Query(None), + run_status: RunStatus | None = Query(None, alias="status"), + min_score: float | None = Query(None, ge=0.0, le=1.0), + max_score: float | None = Query(None, ge=0.0, le=1.0), + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +) -> RunListResponse: + """List runs with optional filtering by experiment, status, and score range.""" + query = db.query(Run) + + if experiment_id is not None: + query = query.filter(Run.experiment_id == experiment_id) + + if run_status is not None: + query = query.filter(Run.status == run_status) + + # Score range filtering: filter runs whose average score falls within range + if min_score is not None or max_score is not None: + from sqlalchemy import func + + score_subquery = ( + db.query(Score.run_id, func.avg(Score.value).label("avg_score")) + .group_by(Score.run_id) + .subquery() + ) + query = query.join(score_subquery, Run.id == score_subquery.c.run_id) + + if min_score is not None: + query = query.filter(score_subquery.c.avg_score >= min_score) + if max_score is not None: + query = query.filter(score_subquery.c.avg_score <= max_score) + + total = query.count() + runs = query.order_by(Run.started_at.desc().nullslast()).offset(offset).limit(limit).all() + + return RunListResponse( + items=[RunResponse.model_validate(r) for r in runs], + total=total, + ) -@router.get("/experiments/{experiment_id}/leaderboard", status_code=501) -def leaderboard(experiment_id: uuid.UUID): - """Top runs ranked by weighted score.""" - return Response(status_code=501, content="Not Implemented") +# --------------------------------------------------------------------------- +# Get run detail +# --------------------------------------------------------------------------- + + +@router.get("/{run_id}", response_model=RunDetailResponse) +def get_run( + run_id: uuid.UUID, + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +) -> RunDetailResponse: + """Get run detail with stage results and scores.""" + run = ( + db.query(Run) + .options(joinedload(Run.stage_results), joinedload(Run.scores)) + .filter(Run.id == run_id) + .first() + ) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found") + + return RunDetailResponse( + id=run.id, + experiment_id=run.experiment_id, + config_hash=run.config_hash, + config=run.config, + status=run.status, + started_at=run.started_at, + completed_at=run.completed_at, + duration_ms=run.duration_ms, + tokens_in=run.tokens_in, + tokens_out=run.tokens_out, + cost_estimate=float(run.cost_estimate) if run.cost_estimate is not None else None, + stage_results=[StageResultResponse.model_validate(sr) for sr in run.stage_results], + scores=[ScoreResponse.model_validate(s) for s in run.scores], + ) + + +# --------------------------------------------------------------------------- +# Ad-hoc single run +# --------------------------------------------------------------------------- + + +@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED) +def create_run( + body: AdHocRunCreate, + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +) -> RunResponse: + """Create and dispatch an ad-hoc single run.""" + # Verify experiment exists + experiment = db.query(Experiment).filter(Experiment.id == body.experiment_id).first() + if experiment is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found") + + config_hash = compute_config_hash( + prompt=body.config.get("prompt", ""), + model=body.config.get("model", ""), + params=body.config.get("params", {}), + input_data=body.config.get("input_data", ""), + ) + + run = Run( + experiment_id=body.experiment_id, + config=body.config, + config_hash=config_hash, + status=RunStatus.pending, + ) + db.add(run) + db.commit() + db.refresh(run) + + # Dispatch execution asynchronously + dispatch_run(str(run.id)) + + return RunResponse.model_validate(run) + + +# --------------------------------------------------------------------------- +# Human scoring +# --------------------------------------------------------------------------- + + +@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED) +def score_run( + run_id: uuid.UUID, + body: ScoreInput, + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +) -> ScoreResponse: + """Add a human rating/score to a run.""" + run = _get_run_or_404(db, run_id) + + score = Score( + run_id=run.id, + scorer_name=body.scorer_name, + value=body.value, + scorer_metadata=body.metadata, + ) + db.add(score) + db.commit() + db.refresh(score) + + return ScoreResponse.model_validate(score) + + +# --------------------------------------------------------------------------- +# Leaderboard +# --------------------------------------------------------------------------- + + +@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse) +def leaderboard( + experiment_id: uuid.UUID, + top_n: int = Query(10, ge=1, le=100), + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +) -> LeaderboardResponse: + """Top N runs ranked by weighted score for an experiment. + + Weighted score uses the experiment's scoring_config weights if available, + otherwise uses equal weighting across all scorers. + """ + # Verify experiment exists + experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first() + if experiment is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found") + + # Get scoring weights from experiment config + weights: dict[str, float] = {} + if experiment.scoring_config and isinstance(experiment.scoring_config, dict): + weights = experiment.scoring_config.get("weights", {}) + + # Get all completed runs for this experiment with their scores + runs = ( + db.query(Run) + .options(joinedload(Run.scores)) + .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed) + .all() + ) + + entries: list[LeaderboardEntry] = [] + for run in runs: + if not run.scores: + continue + + score_map: dict[str, float] = {} + for s in run.scores: + # If multiple scores with same scorer_name, use the latest + score_map[s.scorer_name] = s.value + + # Compute weighted score + if weights: + total_weight = sum(weights.get(name, 0.0) for name in score_map) + if total_weight > 0: + weighted = sum( + score_map[name] * weights.get(name, 0.0) + for name in score_map + if name in weights + ) / total_weight + else: + # No matching weights — fall back to equal weighting + weighted = sum(score_map.values()) / len(score_map) + else: + # Equal weighting + weighted = sum(score_map.values()) / len(score_map) + + entries.append(LeaderboardEntry( + run_id=run.id, + config_hash=run.config_hash, + config=run.config, + status=run.status, + weighted_score=weighted, + scores=score_map, + duration_ms=run.duration_ms, + tokens_in=run.tokens_in, + tokens_out=run.tokens_out, + )) + + # Sort by weighted_score descending, take top N + entries.sort(key=lambda e: e.weighted_score, reverse=True) + entries = entries[:top_n] + + return LeaderboardResponse( + experiment_id=experiment_id, + entries=entries, + total=len(entries), + ) diff --git a/backend/schemas.py b/backend/schemas.py index e86c830..9328721 100644 --- a/backend/schemas.py +++ b/backend/schemas.py @@ -139,6 +139,35 @@ class RunListResponse(BaseModel): total: int +class AdHocRunCreate(BaseModel): + """Request body for creating an ad-hoc single run.""" + + experiment_id: uuid.UUID + config: dict + + +class LeaderboardEntry(BaseModel): + """A single entry in the leaderboard.""" + + model_config = ConfigDict(from_attributes=True) + + run_id: uuid.UUID + config_hash: str + config: dict + status: RunStatus + weighted_score: float + scores: dict[str, float] = Field(default_factory=dict) + duration_ms: int | None = None + tokens_in: int | None = None + tokens_out: int | None = None + + +class LeaderboardResponse(BaseModel): + experiment_id: uuid.UUID + entries: list[LeaderboardEntry] + total: int + + # --------------------------------------------------------------------------- # StageResult (read-only, returned inside Run details) # --------------------------------------------------------------------------- diff --git a/backend/tests/test_routers.py b/backend/tests/test_routers.py index 70be77b..0a1bad6 100644 --- a/backend/tests/test_routers.py +++ b/backend/tests/test_routers.py @@ -117,28 +117,29 @@ def test_experiments_stop(client): # ---- Runs router (/api/runs) ---- def test_runs_list(client): - resp = client.get("/api/runs/experiments/00000000-0000-0000-0000-000000000001/runs") - assert resp.status_code == 501 + resp = client.get("/api/runs/") + # Runs router is now fully implemented and requires auth (returns 401 without credentials) + assert resp.status_code == 401 def test_runs_get(client): resp = client.get("/api/runs/00000000-0000-0000-0000-000000000001") - assert resp.status_code == 501 + assert resp.status_code == 401 def test_runs_create(client): resp = client.post("/api/runs/") - assert resp.status_code == 501 + assert resp.status_code == 401 def test_runs_score(client): resp = client.post("/api/runs/00000000-0000-0000-0000-000000000001/score") - assert resp.status_code == 501 + assert resp.status_code == 401 def test_runs_leaderboard(client): - resp = client.get("/api/runs/experiments/00000000-0000-0000-0000-000000000001/leaderboard") - assert resp.status_code == 501 + resp = client.get("/api/runs/leaderboard/00000000-0000-0000-0000-000000000001") + assert resp.status_code == 401 # ---- Endpoints router (/api/endpoints) ---- diff --git a/backend/tests/test_runs.py b/backend/tests/test_runs.py new file mode 100644 index 0000000..97c61f2 --- /dev/null +++ b/backend/tests/test_runs.py @@ -0,0 +1,454 @@ +"""Tests for backend/routers/runs.py — Run listing, detail, ad-hoc execution, scoring, and leaderboard.""" + +import os +import uuid +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch + +import pytest +from fastapi.testclient import TestClient + + +JWT_SECRET = "test-secret-key-for-jwt-signing" +API_KEY = "test-api-key-12345" + + +@pytest.fixture(autouse=True) +def _isolate_settings(tmp_path): + """Ensure tests use a temp SQLite DB and no Redis.""" + env = { + "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}", + "REDIS_URL": "", + "DATA_DIR": str(tmp_path), + "JWT_SECRET": JWT_SECRET, + "API_KEY": API_KEY, + } + with patch.dict(os.environ, env, clear=False): + import config + new_settings = config.Settings(_env_file=None) + config.settings = new_settings + + import main + main.settings = new_settings + main._init_db() + main._init_redis() + + from models import Base + Base.metadata.create_all(bind=main.engine) + + import auth + auth.settings = new_settings + + yield + + +@pytest.fixture +def db_session(): + from main import get_db + gen = get_db() + session = next(gen) + yield session + try: + next(gen) + except StopIteration: + pass + + +@pytest.fixture +def admin_user(db_session): + from auth import hash_password + from models import User + user = User(username="admin", password_hash=hash_password("adminpass"), is_admin=True) + db_session.add(user) + db_session.commit() + db_session.refresh(user) + return user + + +@pytest.fixture +def project(db_session, admin_user): + from models import Project + proj = Project(name="Test Project", description="A test project", owner_id=admin_user.id) + db_session.add(proj) + db_session.commit() + db_session.refresh(proj) + return proj + + +@pytest.fixture +def experiment(db_session, project): + from models import Experiment + exp = Experiment( + project_id=project.id, + name="Test Experiment", + description="An experiment for testing", + scoring_config={"weights": {"keyword": 0.6, "format": 0.4}}, + ) + db_session.add(exp) + db_session.commit() + db_session.refresh(exp) + return exp + + +@pytest.fixture +def auth_headers(): + return {"X-Api-Key": API_KEY} + + +@pytest.fixture +def client(): + from main import app + return TestClient(app) + + +def _create_run(db_session, experiment, status="completed", config=None, config_hash=None): + """Helper to create a Run directly in the DB.""" + from models import Run, RunStatus + run = Run( + experiment_id=experiment.id, + config=config or {"prompt": "test", "model": "gpt-test"}, + config_hash=config_hash or uuid.uuid4().hex[:64], + status=RunStatus(status), + started_at=datetime.now(timezone.utc), + completed_at=datetime.now(timezone.utc) if status == "completed" else None, + duration_ms=1000 if status == "completed" else None, + tokens_in=50, + tokens_out=100, + ) + db_session.add(run) + db_session.commit() + db_session.refresh(run) + return run + + +def _create_score(db_session, run, scorer_name="keyword", value=0.8, metadata=None): + """Helper to create a Score directly in the DB.""" + from models import Score + score = Score( + run_id=run.id, + scorer_name=scorer_name, + value=value, + scorer_metadata=metadata, + ) + db_session.add(score) + db_session.commit() + db_session.refresh(score) + return score + + +def _create_stage_result(db_session, run, stage_index=0): + """Helper to create a StageResult directly in the DB.""" + from models import StageResult + sr = StageResult( + run_id=run.id, + stage_index=stage_index, + prompt_sent="What is 2+2?", + response_raw="4", + model_used="gpt-test", + tokens_in=10, + tokens_out=5, + latency_ms=200, + ) + db_session.add(sr) + db_session.commit() + db_session.refresh(sr) + return sr + + +# --------------------------------------------------------------------------- +# List runs +# --------------------------------------------------------------------------- + + +class TestListRuns: + def test_list_runs_empty(self, client, auth_headers, experiment): + resp = client.get("/api/runs/", headers=auth_headers) + assert resp.status_code == 200 + data = resp.json() + assert data["items"] == [] + assert data["total"] == 0 + + def test_list_runs_returns_all(self, client, auth_headers, db_session, experiment): + _create_run(db_session, experiment) + _create_run(db_session, experiment, status="pending") + resp = client.get("/api/runs/", headers=auth_headers) + assert resp.status_code == 200 + assert resp.json()["total"] == 2 + + def test_list_runs_filter_by_experiment(self, client, auth_headers, db_session, experiment, project): + from models import Experiment + exp2 = Experiment(project_id=project.id, name="Other Experiment") + db_session.add(exp2) + db_session.commit() + db_session.refresh(exp2) + + _create_run(db_session, experiment) + _create_run(db_session, exp2) + + resp = client.get(f"/api/runs/?experiment_id={experiment.id}", headers=auth_headers) + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 1 + assert data["items"][0]["experiment_id"] == str(experiment.id) + + def test_list_runs_filter_by_status(self, client, auth_headers, db_session, experiment): + _create_run(db_session, experiment, status="completed") + _create_run(db_session, experiment, status="failed") + _create_run(db_session, experiment, status="pending") + + resp = client.get("/api/runs/?status=completed", headers=auth_headers) + assert resp.status_code == 200 + assert resp.json()["total"] == 1 + + def test_list_runs_filter_by_score_range(self, client, auth_headers, db_session, experiment): + r1 = _create_run(db_session, experiment) + r2 = _create_run(db_session, experiment) + r3 = _create_run(db_session, experiment) + _create_score(db_session, r1, value=0.9) + _create_score(db_session, r2, value=0.5) + _create_score(db_session, r3, value=0.2) + + resp = client.get("/api/runs/?min_score=0.4&max_score=0.95", headers=auth_headers) + assert resp.status_code == 200 + assert resp.json()["total"] == 2 + + def test_list_runs_pagination(self, client, auth_headers, db_session, experiment): + for _ in range(5): + _create_run(db_session, experiment) + + resp = client.get("/api/runs/?limit=2&offset=0", headers=auth_headers) + assert resp.status_code == 200 + data = resp.json() + assert len(data["items"]) == 2 + assert data["total"] == 5 + + resp2 = client.get("/api/runs/?limit=2&offset=3", headers=auth_headers) + assert resp2.status_code == 200 + assert len(resp2.json()["items"]) == 2 + + def test_list_runs_requires_auth(self, client, experiment): + resp = client.get("/api/runs/") + assert resp.status_code in (401, 403) + + +# --------------------------------------------------------------------------- +# Get run detail +# --------------------------------------------------------------------------- + + +class TestGetRunDetail: + def test_get_run_detail(self, client, auth_headers, db_session, experiment): + run = _create_run(db_session, experiment) + _create_stage_result(db_session, run, stage_index=0) + _create_score(db_session, run, scorer_name="keyword", value=0.85) + + resp = client.get(f"/api/runs/{run.id}", headers=auth_headers) + assert resp.status_code == 200 + data = resp.json() + assert data["id"] == str(run.id) + assert len(data["stage_results"]) == 1 + assert data["stage_results"][0]["stage_index"] == 0 + assert len(data["scores"]) == 1 + assert data["scores"][0]["scorer_name"] == "keyword" + + def test_get_run_not_found(self, client, auth_headers, admin_user): + fake_id = uuid.uuid4() + resp = client.get(f"/api/runs/{fake_id}", headers=auth_headers) + assert resp.status_code == 404 + + def test_get_run_detail_no_stages_or_scores(self, client, auth_headers, db_session, experiment): + run = _create_run(db_session, experiment) + resp = client.get(f"/api/runs/{run.id}", headers=auth_headers) + assert resp.status_code == 200 + data = resp.json() + assert data["stage_results"] == [] + assert data["scores"] == [] + + +# --------------------------------------------------------------------------- +# Ad-hoc run creation +# --------------------------------------------------------------------------- + + +class TestCreateRun: + @patch("routers.runs.dispatch_run") + def test_create_adhoc_run(self, mock_dispatch, client, auth_headers, db_session, experiment): + body = { + "experiment_id": str(experiment.id), + "config": {"prompt": "Hello", "model": "gpt-test", "params": {"temperature": 0.7}}, + } + resp = client.post("/api/runs/", json=body, headers=auth_headers) + assert resp.status_code == 201 + data = resp.json() + assert data["experiment_id"] == str(experiment.id) + assert data["status"] == "pending" + assert data["config"]["prompt"] == "Hello" + assert len(data["config_hash"]) == 64 + mock_dispatch.assert_called_once_with(data["id"]) + + def test_create_run_experiment_not_found(self, client, auth_headers, admin_user): + body = { + "experiment_id": str(uuid.uuid4()), + "config": {"prompt": "Hello", "model": "test"}, + } + resp = client.post("/api/runs/", json=body, headers=auth_headers) + assert resp.status_code == 404 + + @patch("routers.runs.dispatch_run") + def test_create_run_config_hash_deterministic(self, mock_dispatch, client, auth_headers, db_session, experiment): + config = {"prompt": "Same prompt", "model": "same-model", "params": {}, "input_data": ""} + body = {"experiment_id": str(experiment.id), "config": config} + + resp1 = client.post("/api/runs/", json=body, headers=auth_headers) + resp2 = client.post("/api/runs/", json=body, headers=auth_headers) + assert resp1.status_code == 201 + assert resp2.status_code == 201 + assert resp1.json()["config_hash"] == resp2.json()["config_hash"] + + +# --------------------------------------------------------------------------- +# Human scoring +# --------------------------------------------------------------------------- + + +class TestScoreRun: + def test_add_human_score(self, client, auth_headers, db_session, experiment): + run = _create_run(db_session, experiment) + body = {"scorer_name": "human", "value": 0.9, "metadata": {"comment": "Great output"}} + resp = client.post(f"/api/runs/{run.id}/score", json=body, headers=auth_headers) + assert resp.status_code == 201 + data = resp.json() + assert data["scorer_name"] == "human" + assert data["value"] == 0.9 + assert data["scorer_metadata"]["comment"] == "Great output" + assert data["run_id"] == str(run.id) + + def test_score_run_not_found(self, client, auth_headers, admin_user): + fake_id = uuid.uuid4() + body = {"scorer_name": "human", "value": 0.5} + resp = client.post(f"/api/runs/{fake_id}/score", json=body, headers=auth_headers) + assert resp.status_code == 404 + + def test_add_multiple_scores(self, client, auth_headers, db_session, experiment): + run = _create_run(db_session, experiment) + client.post(f"/api/runs/{run.id}/score", json={"scorer_name": "human", "value": 0.8}, headers=auth_headers) + client.post(f"/api/runs/{run.id}/score", json={"scorer_name": "keyword", "value": 0.6}, headers=auth_headers) + + detail = client.get(f"/api/runs/{run.id}", headers=auth_headers) + assert len(detail.json()["scores"]) == 2 + + def test_score_requires_scorer_name(self, client, auth_headers, db_session, experiment): + run = _create_run(db_session, experiment) + body = {"value": 0.5} + resp = client.post(f"/api/runs/{run.id}/score", json=body, headers=auth_headers) + assert resp.status_code == 422 + + +# --------------------------------------------------------------------------- +# Leaderboard +# --------------------------------------------------------------------------- + + +class TestLeaderboard: + def test_leaderboard_basic(self, client, auth_headers, db_session, experiment): + r1 = _create_run(db_session, experiment) + r2 = _create_run(db_session, experiment) + r3 = _create_run(db_session, experiment) + + _create_score(db_session, r1, scorer_name="keyword", value=0.9) + _create_score(db_session, r1, scorer_name="format", value=0.8) + _create_score(db_session, r2, scorer_name="keyword", value=0.5) + _create_score(db_session, r2, scorer_name="format", value=0.6) + _create_score(db_session, r3, scorer_name="keyword", value=0.3) + _create_score(db_session, r3, scorer_name="format", value=0.4) + + resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers) + assert resp.status_code == 200 + data = resp.json() + assert data["experiment_id"] == str(experiment.id) + assert len(data["entries"]) == 3 + # First entry should have highest weighted score + assert data["entries"][0]["run_id"] == str(r1.id) + assert data["entries"][0]["weighted_score"] > data["entries"][1]["weighted_score"] + + def test_leaderboard_top_n(self, client, auth_headers, db_session, experiment): + for i in range(5): + r = _create_run(db_session, experiment) + _create_score(db_session, r, value=i * 0.2) + + resp = client.get(f"/api/runs/leaderboard/{experiment.id}?top_n=3", headers=auth_headers) + assert resp.status_code == 200 + assert len(resp.json()["entries"]) == 3 + + def test_leaderboard_weighted_scores(self, client, auth_headers, db_session, experiment): + """Experiment has weights: keyword=0.6, format=0.4.""" + r1 = _create_run(db_session, experiment) + _create_score(db_session, r1, scorer_name="keyword", value=1.0) + _create_score(db_session, r1, scorer_name="format", value=0.0) + + r2 = _create_run(db_session, experiment) + _create_score(db_session, r2, scorer_name="keyword", value=0.0) + _create_score(db_session, r2, scorer_name="format", value=1.0) + + resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers) + data = resp.json() + # r1: keyword=1.0*0.6 + format=0.0*0.4 = 0.6 + # r2: keyword=0.0*0.6 + format=1.0*0.4 = 0.4 + assert data["entries"][0]["run_id"] == str(r1.id) + assert abs(data["entries"][0]["weighted_score"] - 0.6) < 0.001 + assert abs(data["entries"][1]["weighted_score"] - 0.4) < 0.001 + + def test_leaderboard_equal_weights_no_config(self, client, auth_headers, db_session, project): + """Experiment without scoring_config uses equal weighting.""" + from models import Experiment + exp_no_weights = Experiment( + project_id=project.id, + name="No Weights Experiment", + scoring_config=None, + ) + db_session.add(exp_no_weights) + db_session.commit() + db_session.refresh(exp_no_weights) + + r = _create_run(db_session, exp_no_weights) + _create_score(db_session, r, scorer_name="keyword", value=0.8) + _create_score(db_session, r, scorer_name="format", value=0.6) + + resp = client.get(f"/api/runs/leaderboard/{exp_no_weights.id}", headers=auth_headers) + data = resp.json() + assert len(data["entries"]) == 1 + # Equal weight: (0.8 + 0.6) / 2 = 0.7 + assert abs(data["entries"][0]["weighted_score"] - 0.7) < 0.001 + + def test_leaderboard_experiment_not_found(self, client, auth_headers, admin_user): + fake_id = uuid.uuid4() + resp = client.get(f"/api/runs/leaderboard/{fake_id}", headers=auth_headers) + assert resp.status_code == 404 + + def test_leaderboard_no_completed_runs(self, client, auth_headers, db_session, experiment): + _create_run(db_session, experiment, status="pending") + resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers) + assert resp.status_code == 200 + assert resp.json()["entries"] == [] + + def test_leaderboard_skips_runs_without_scores(self, client, auth_headers, db_session, experiment): + _create_run(db_session, experiment) # no scores + r2 = _create_run(db_session, experiment) + _create_score(db_session, r2, value=0.7) + + resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers) + data = resp.json() + assert len(data["entries"]) == 1 + assert data["entries"][0]["run_id"] == str(r2.id) + + def test_leaderboard_includes_run_metadata(self, client, auth_headers, db_session, experiment): + r = _create_run(db_session, experiment) + _create_score(db_session, r, value=0.9) + + resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers) + entry = resp.json()["entries"][0] + assert "config_hash" in entry + assert "config" in entry + assert "duration_ms" in entry + assert "tokens_in" in entry + assert "tokens_out" in entry