MAESTRO: Implement runs router with full CRUD, filtering, scoring, and leaderboard

- List runs with filtering by experiment, status, and score range plus pagination
- Get run detail with eager-loaded stage results and scores
- Ad-hoc single run creation with Celery/sync dispatch
- Human scoring endpoint (POST /{id}/score)
- Leaderboard endpoint with configurable weighted scoring from experiment scoring_config
- Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas
- 25 tests in test_runs.py, all passing (503 total tests passing)
This commit is contained in:
John Lightner 2026-04-07 03:24:56 -05:00
parent e6c344d554
commit b3fb8e3063
5 changed files with 759 additions and 30 deletions

View file

@ -38,7 +38,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
- [x] Implement backend/routers/experiments.py fully — CRUD plus sweep control. POST /experiments/{id}/sweep should validate the sweep config, create Run records for all configurations, and dispatch to Celery. Pause/resume/stop should set Redis flags that the sweep runner checks between runs. - [x] Implement backend/routers/experiments.py fully — CRUD plus sweep control. POST /experiments/{id}/sweep should validate the sweep config, create Run records for all configurations, and dispatch to Celery. Pause/resume/stop should set Redis flags that the sweep runner checks between runs.
<!-- Completed: Full CRUD (list with project filter, get, create, update, delete) + sweep control (start/pause/resume/stop + status). SweepRequest/SweepStatusResponse schemas added. Sweep dispatch via Celery/sync fallback. Redis flags for pause/resume/stop, with single-container mode fallback. 34 tests in test_experiments.py, all passing. --> <!-- Completed: Full CRUD (list with project filter, get, create, update, delete) + sweep control (start/pause/resume/stop + status). SweepRequest/SweepStatusResponse schemas added. Sweep dispatch via Celery/sync fallback. Redis flags for pause/resume/stop, with single-container mode fallback. 34 tests in test_experiments.py, all passing. -->
- [ ] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score. - [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
<!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats. - [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.

View file

@ -1,37 +1,281 @@
"""Runs router — execute, detail, score, and leaderboard.""" """Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard.
Provides filtering by experiment, status, and score range. The leaderboard
endpoint returns top N runs ranked by weighted score.
"""
import uuid import uuid
from fastapi import APIRouter, Response from fastapi import APIRouter, Depends, HTTPException, Query, status
from sqlalchemy.orm import Session, joinedload
from auth import get_current_user
from engine.cache import compute_config_hash
from engine.tasks import dispatch_run
from main import get_db
from models import Experiment, Run, RunStatus, Score, StageResult, User
from schemas import (
AdHocRunCreate,
LeaderboardEntry,
LeaderboardResponse,
RunDetailResponse,
RunListResponse,
RunResponse,
ScoreInput,
ScoreResponse,
StageResultResponse,
)
router = APIRouter() router = APIRouter()
@router.get("/experiments/{experiment_id}/runs", status_code=501) # ---------------------------------------------------------------------------
def list_runs(experiment_id: uuid.UUID): # Helpers
"""List runs with scores (sortable, filterable).""" # ---------------------------------------------------------------------------
return Response(status_code=501, content="Not Implemented")
def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run:
    """Load a run by primary key, aborting the request when it is missing.

    Args:
        db: SQLAlchemy session used for the lookup.
        run_id: Id of the run to fetch.

    Returns:
        The ``Run`` row whose id equals ``run_id``.

    Raises:
        HTTPException: 404 with detail "Run not found" when no row matches.
    """
    found = db.query(Run).filter(Run.id == run_id).first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")
@router.post("/", status_code=501) # ---------------------------------------------------------------------------
def create_run(): # List runs with filtering
"""Execute a single run (ad-hoc).""" # ---------------------------------------------------------------------------
return Response(status_code=501, content="Not Implemented")
@router.get("/", response_model=RunListResponse)
def list_runs(
    experiment_id: uuid.UUID | None = Query(None),
    run_status: RunStatus | None = Query(None, alias="status"),
    min_score: float | None = Query(None, ge=0.0, le=1.0),
    max_score: float | None = Query(None, ge=0.0, le=1.0),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunListResponse:
    """List runs with optional filtering by experiment, status, and score range.

    Args:
        experiment_id: When given, only runs of this experiment are returned.
        run_status: When given (query parameter ``status``), only runs in this
            status are returned.
        min_score: Inclusive lower bound on a run's average score.
        max_score: Inclusive upper bound on a run's average score.
        limit: Page size.
        offset: Number of matching rows to skip before the page starts.
        db: Request-scoped SQLAlchemy session.
        _user: Authenticated caller; only required for access control.

    Returns:
        A ``RunListResponse`` whose ``items`` is the requested page and whose
        ``total`` is the number of runs matching the filters before paging.
    """
    query = db.query(Run)

    if experiment_id is not None:
        query = query.filter(Run.experiment_id == experiment_id)

    if run_status is not None:
        query = query.filter(Run.status == run_status)

    # Score range filtering: compare against each run's average score.
    if min_score is not None or max_score is not None:
        from sqlalchemy import func

        score_subquery = (
            db.query(Score.run_id, func.avg(Score.value).label("avg_score"))
            .group_by(Score.run_id)
            .subquery()
        )
        # Inner join: runs that have no Score rows drop out of the result
        # as soon as either bound is supplied.
        query = query.join(score_subquery, Run.id == score_subquery.c.run_id)
        if min_score is not None:
            query = query.filter(score_subquery.c.avg_score >= min_score)
        if max_score is not None:
            query = query.filter(score_subquery.c.avg_score <= max_score)

    total = query.count()

    # started_at is nullable and not unique, so on its own it does not define
    # a total order; without a tiebreaker, rows with equal (or NULL)
    # timestamps may repeat or be skipped between pages. Run.id makes the
    # ordering deterministic.
    runs = (
        query.order_by(Run.started_at.desc().nullslast(), Run.id)
        .offset(offset)
        .limit(limit)
        .all()
    )

    return RunListResponse(
        items=[RunResponse.model_validate(r) for r in runs],
        total=total,
    )
@router.get("/experiments/{experiment_id}/leaderboard", status_code=501) # ---------------------------------------------------------------------------
def leaderboard(experiment_id: uuid.UUID): # Get run detail
"""Top runs ranked by weighted score.""" # ---------------------------------------------------------------------------
return Response(status_code=501, content="Not Implemented")
@router.get("/{run_id}", response_model=RunDetailResponse)
def get_run(
    run_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunDetailResponse:
    """Return one run together with its stage results and scores.

    Both collections are requested with ``joinedload`` so they arrive with
    the run in the same query.

    Args:
        run_id: Id of the run to show.
        db: Request-scoped SQLAlchemy session.
        _user: Authenticated caller; only required for access control.

    Returns:
        The run's fields plus serialized ``stage_results`` and ``scores``.

    Raises:
        HTTPException: 404 with detail "Run not found" when the id is unknown.
    """
    eager = (joinedload(Run.stage_results), joinedload(Run.scores))
    record = db.query(Run).options(*eager).filter(Run.id == run_id).first()
    if record is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")

    # cost_estimate is converted to a plain float; a missing value stays None.
    cost = None if record.cost_estimate is None else float(record.cost_estimate)
    stage_payloads = [StageResultResponse.model_validate(item) for item in record.stage_results]
    score_payloads = [ScoreResponse.model_validate(item) for item in record.scores]

    return RunDetailResponse(
        id=record.id,
        experiment_id=record.experiment_id,
        config_hash=record.config_hash,
        config=record.config,
        status=record.status,
        started_at=record.started_at,
        completed_at=record.completed_at,
        duration_ms=record.duration_ms,
        tokens_in=record.tokens_in,
        tokens_out=record.tokens_out,
        cost_estimate=cost,
        stage_results=stage_payloads,
        scores=score_payloads,
    )
# ---------------------------------------------------------------------------
# Ad-hoc single run
# ---------------------------------------------------------------------------
@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED)
def create_run(
    body: AdHocRunCreate,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunResponse:
    """Persist a single ad-hoc run in ``pending`` state and dispatch it.

    Args:
        body: Target experiment id plus the run configuration.
        db: Request-scoped SQLAlchemy session.
        _user: Authenticated caller; only required for access control.

    Returns:
        The newly stored run as read back after the commit.

    Raises:
        HTTPException: 404 with detail "Experiment not found" when
            ``body.experiment_id`` matches no experiment.
    """
    parent = db.query(Experiment).filter(Experiment.id == body.experiment_id).first()
    if parent is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # Missing config keys fall back to empty values before hashing.
    cfg = body.config
    digest = compute_config_hash(
        prompt=cfg.get("prompt", ""),
        model=cfg.get("model", ""),
        params=cfg.get("params", {}),
        input_data=cfg.get("input_data", ""),
    )

    new_run = Run(
        experiment_id=body.experiment_id,
        config=cfg,
        config_hash=digest,
        status=RunStatus.pending,
    )
    db.add(new_run)
    db.commit()
    db.refresh(new_run)

    # The run is committed first so the id handed to dispatch_run refers to a
    # stored row. NOTE(review): whether dispatch is asynchronous depends on
    # engine.tasks.dispatch_run, which is not visible here.
    dispatch_run(str(new_run.id))

    return RunResponse.model_validate(new_run)
# ---------------------------------------------------------------------------
# Human scoring
# ---------------------------------------------------------------------------
@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED)
def score_run(
    run_id: uuid.UUID,
    body: ScoreInput,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> ScoreResponse:
    """Attach a new score to an existing run.

    Args:
        run_id: Id of the run being rated.
        body: Scorer name, value, and optional metadata for the rating.
        db: Request-scoped SQLAlchemy session.
        _user: Authenticated caller; only required for access control.

    Returns:
        The stored score as read back after the commit.

    Raises:
        HTTPException: 404 with detail "Run not found" when the run is unknown.
    """
    target = _get_run_or_404(db, run_id)

    # body.metadata is stored under the model's scorer_metadata attribute.
    rating = Score(
        run_id=target.id,
        scorer_name=body.scorer_name,
        value=body.value,
        scorer_metadata=body.metadata,
    )
    db.add(rating)
    db.commit()
    db.refresh(rating)

    return ScoreResponse.model_validate(rating)
# ---------------------------------------------------------------------------
# Leaderboard
# ---------------------------------------------------------------------------
def _scoring_weights(scoring_config: object) -> dict[str, float]:
    """Extract usable per-scorer weights from an experiment's scoring_config.

    Anything that is not a mapping with a mapping under "weights" yields no
    weights, and non-numeric weight values are dropped, so malformed stored
    configuration cannot make the leaderboard request fail.
    """
    if not isinstance(scoring_config, dict):
        return {}
    raw = scoring_config.get("weights")
    if not isinstance(raw, dict):
        return {}
    return {
        name: float(value)
        for name, value in raw.items()
        if isinstance(value, (int, float))
    }


def _weighted_score(score_map: dict[str, float], weights: dict[str, float]) -> float:
    """Combine a run's per-scorer values into one number.

    Scorers that have a weight contribute ``value * weight`` and the sum is
    normalized by the total weight of the scorers present on the run. When
    that total is not positive (no weights, or none match), the plain mean of
    all values is used instead. ``score_map`` must be non-empty.
    """
    total_weight = sum(weights.get(name, 0.0) for name in score_map)
    if total_weight > 0:
        return sum(
            value * weights[name]
            for name, value in score_map.items()
            if name in weights
        ) / total_weight
    # No matching weights — fall back to equal weighting
    return sum(score_map.values()) / len(score_map)


@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse)
def leaderboard(
    experiment_id: uuid.UUID,
    top_n: int = Query(10, ge=1, le=100),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> LeaderboardResponse:
    """Top N runs ranked by weighted score for an experiment.

    Weighted score uses the experiment's scoring_config weights if available,
    otherwise uses equal weighting across all scorers. Only runs in
    ``completed`` status that have at least one score are ranked.

    Args:
        experiment_id: Experiment whose runs are ranked.
        top_n: Maximum number of entries to return.
        db: Request-scoped SQLAlchemy session.
        _user: Authenticated caller; only required for access control.

    Returns:
        Entries sorted by ``weighted_score`` descending, at most ``top_n``.

    Raises:
        HTTPException: 404 with detail "Experiment not found" when the
            experiment id is unknown.
    """
    # Verify experiment exists
    experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    # scoring_config is stored data; sanitize it instead of trusting its shape.
    weights = _scoring_weights(experiment.scoring_config)

    # Get all completed runs for this experiment with their scores
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed)
        .all()
    )

    entries: list[LeaderboardEntry] = []
    for run in runs:
        if not run.scores:
            continue

        # Later items in run.scores overwrite earlier ones with the same
        # scorer_name. NOTE(review): no ordering is applied to run.scores
        # here, so "latest wins" holds only if the relationship itself is
        # ordered — confirm against the model.
        score_map: dict[str, float] = {s.scorer_name: s.value for s in run.scores}

        entries.append(LeaderboardEntry(
            run_id=run.id,
            config_hash=run.config_hash,
            config=run.config,
            status=run.status,
            weighted_score=_weighted_score(score_map, weights),
            scores=score_map,
            duration_ms=run.duration_ms,
            tokens_in=run.tokens_in,
            tokens_out=run.tokens_out,
        ))

    # Sort by weighted_score descending, take top N
    entries.sort(key=lambda e: e.weighted_score, reverse=True)
    entries = entries[:top_n]

    # NOTE(review): total is the number of entries returned (after the top_n
    # cut), not the number of ranked runs — confirm this is the intended
    # meaning for API consumers.
    return LeaderboardResponse(
        experiment_id=experiment_id,
        entries=entries,
        total=len(entries),
    )

View file

@ -139,6 +139,35 @@ class RunListResponse(BaseModel):
total: int total: int
class AdHocRunCreate(BaseModel):
    """Payload accepted when a caller requests one ad-hoc run."""

    # Experiment the new run is attached to.
    experiment_id: uuid.UUID
    # Free-form run configuration, stored on the run as given.
    config: dict
class LeaderboardEntry(BaseModel):
    """One ranked run as shown on an experiment leaderboard."""

    model_config = ConfigDict(from_attributes=True)

    # Identity and configuration of the ranked run.
    run_id: uuid.UUID
    config_hash: str
    config: dict
    status: RunStatus

    # Combined score used for ranking, plus the per-scorer values behind it.
    weighted_score: float
    scores: dict[str, float] = Field(default_factory=dict)

    # Optional execution statistics copied from the run.
    duration_ms: int | None = None
    tokens_in: int | None = None
    tokens_out: int | None = None
class LeaderboardResponse(BaseModel):
    """Leaderboard payload for a single experiment."""

    # Experiment the entries belong to.
    experiment_id: uuid.UUID
    # Ranked entries, best first as produced by the endpoint.
    entries: list[LeaderboardEntry]
    # Entry count reported alongside the list.
    total: int
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# StageResult (read-only, returned inside Run details) # StageResult (read-only, returned inside Run details)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View file

@ -117,28 +117,29 @@ def test_experiments_stop(client):
# ---- Runs router (/api/runs) ---- # ---- Runs router (/api/runs) ----
def test_runs_list(client):
    """Listing runs without credentials is rejected."""
    # Runs router is now fully implemented and requires auth (returns 401 without credentials)
    response = client.get("/api/runs/")
    assert response.status_code == 401
def test_runs_get(client):
    """Fetching a run without credentials is rejected."""
    response = client.get("/api/runs/00000000-0000-0000-0000-000000000001")
    assert response.status_code == 401
def test_runs_create(client):
    """Creating a run without credentials is rejected."""
    response = client.post("/api/runs/")
    assert response.status_code == 401
def test_runs_score(client):
    """Scoring a run without credentials is rejected."""
    response = client.post("/api/runs/00000000-0000-0000-0000-000000000001/score")
    assert response.status_code == 401
def test_runs_leaderboard(client):
    """Reading the leaderboard without credentials is rejected."""
    response = client.get("/api/runs/leaderboard/00000000-0000-0000-0000-000000000001")
    assert response.status_code == 401
# ---- Endpoints router (/api/endpoints) ---- # ---- Endpoints router (/api/endpoints) ----

454
backend/tests/test_runs.py Normal file
View file

@ -0,0 +1,454 @@
"""Tests for backend/routers/runs.py — Run listing, detail, ad-hoc execution, scoring, and leaderboard."""
import os
import uuid
from datetime import datetime, timezone
from unittest.mock import MagicMock, patch
import pytest
from fastapi.testclient import TestClient
# Value exported as JWT_SECRET in the patched test environment.
JWT_SECRET = "test-secret-key-for-jwt-signing"
# Value exported as API_KEY in the patched test environment and sent by
# authenticated test requests in the X-Api-Key header.
API_KEY = "test-api-key-12345"
@pytest.fixture(autouse=True)
def _isolate_settings(tmp_path):
    """Run every test against a per-test SQLite file with an empty REDIS_URL.

    The environment is patched, fresh settings are built from it and pushed
    into the ``config``, ``main`` and ``auth`` modules, and the schema is
    created on the re-initialized engine before the test body runs.
    """
    overrides = {
        "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}",
        "REDIS_URL": "",
        "DATA_DIR": str(tmp_path),
        "JWT_SECRET": JWT_SECRET,
        "API_KEY": API_KEY,
    }
    with patch.dict(os.environ, overrides, clear=False):
        # Imports happen only after the environment is patched.
        import config

        fresh = config.Settings(_env_file=None)
        config.settings = fresh

        import main

        # main must see the new settings before its DB/Redis handles are rebuilt.
        main.settings = fresh
        main._init_db()
        main._init_redis()

        from models import Base

        Base.metadata.create_all(bind=main.engine)

        import auth

        auth.settings = fresh

        yield
@pytest.fixture
def db_session():
    """Yield a session obtained from the application's ``get_db`` generator."""
    from main import get_db

    session_gen = get_db()
    yield next(session_gen)
    # Resume the generator once more so it can finish; the default argument
    # absorbs the StopIteration raised when it is exhausted.
    next(session_gen, None)
@pytest.fixture
def admin_user(db_session):
    """Store and return an admin ``User`` named "admin"."""
    from auth import hash_password
    from models import User

    admin = User(
        username="admin",
        password_hash=hash_password("adminpass"),
        is_admin=True,
    )
    db_session.add(admin)
    db_session.commit()
    db_session.refresh(admin)
    return admin
@pytest.fixture
def project(db_session, admin_user):
    """Store and return a ``Project`` owned by the admin user."""
    from models import Project

    created = Project(
        name="Test Project",
        description="A test project",
        owner_id=admin_user.id,
    )
    db_session.add(created)
    db_session.commit()
    db_session.refresh(created)
    return created
@pytest.fixture
def experiment(db_session, project):
    """Store and return an ``Experiment`` weighting keyword 0.6 and format 0.4."""
    from models import Experiment

    scoring = {"weights": {"keyword": 0.6, "format": 0.4}}
    created = Experiment(
        project_id=project.id,
        name="Test Experiment",
        description="An experiment for testing",
        scoring_config=scoring,
    )
    db_session.add(created)
    db_session.commit()
    db_session.refresh(created)
    return created
@pytest.fixture
def auth_headers():
    """Return request headers carrying the test API key."""
    headers = {"X-Api-Key": API_KEY}
    return headers
@pytest.fixture
def client():
    """Return a ``TestClient`` bound to the FastAPI application."""
    from main import app

    test_client = TestClient(app)
    return test_client
def _create_run(db_session, experiment, status="completed", config=None, config_hash=None):
    """Insert a ``Run`` for ``experiment`` and return the refreshed row.

    ``completed_at`` and ``duration_ms`` are filled only when ``status`` is
    "completed". A default config and a random hash are used when the
    corresponding arguments are falsy.
    """
    from models import Run, RunStatus

    finished = status == "completed"
    row = Run(
        experiment_id=experiment.id,
        config=config or {"prompt": "test", "model": "gpt-test"},
        config_hash=config_hash or uuid.uuid4().hex[:64],
        status=RunStatus(status),
        started_at=datetime.now(timezone.utc),
        completed_at=datetime.now(timezone.utc) if finished else None,
        duration_ms=1000 if finished else None,
        tokens_in=50,
        tokens_out=100,
    )
    db_session.add(row)
    db_session.commit()
    db_session.refresh(row)
    return row
def _create_score(db_session, run, scorer_name="keyword", value=0.8, metadata=None):
    """Insert a ``Score`` for ``run`` and return the refreshed row."""
    from models import Score

    fields = {
        "run_id": run.id,
        "scorer_name": scorer_name,
        "value": value,
        "scorer_metadata": metadata,
    }
    row = Score(**fields)
    db_session.add(row)
    db_session.commit()
    db_session.refresh(row)
    return row
def _create_stage_result(db_session, run, stage_index=0):
    """Insert a ``StageResult`` with fixed sample content for ``run``."""
    from models import StageResult

    fields = {
        "run_id": run.id,
        "stage_index": stage_index,
        "prompt_sent": "What is 2+2?",
        "response_raw": "4",
        "model_used": "gpt-test",
        "tokens_in": 10,
        "tokens_out": 5,
        "latency_ms": 200,
    }
    row = StageResult(**fields)
    db_session.add(row)
    db_session.commit()
    db_session.refresh(row)
    return row
# ---------------------------------------------------------------------------
# List runs
# ---------------------------------------------------------------------------
class TestListRuns:
    """GET /api/runs/: filters, pagination, and the auth requirement."""

    def test_list_runs_empty(self, client, auth_headers, experiment):
        """With no runs stored the listing is empty."""
        response = client.get("/api/runs/", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["items"] == []
        assert payload["total"] == 0

    def test_list_runs_returns_all(self, client, auth_headers, db_session, experiment):
        """Without filters every run is counted, whatever its status."""
        _create_run(db_session, experiment)
        _create_run(db_session, experiment, status="pending")

        response = client.get("/api/runs/", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["total"] == 2

    def test_list_runs_filter_by_experiment(self, client, auth_headers, db_session, experiment, project):
        """The experiment_id filter hides runs of other experiments."""
        from models import Experiment

        other = Experiment(project_id=project.id, name="Other Experiment")
        db_session.add(other)
        db_session.commit()
        db_session.refresh(other)

        _create_run(db_session, experiment)
        _create_run(db_session, other)

        response = client.get(f"/api/runs/?experiment_id={experiment.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["total"] == 1
        assert payload["items"][0]["experiment_id"] == str(experiment.id)

    def test_list_runs_filter_by_status(self, client, auth_headers, db_session, experiment):
        """The status filter keeps only runs in that status."""
        for run_state in ("completed", "failed", "pending"):
            _create_run(db_session, experiment, status=run_state)

        response = client.get("/api/runs/?status=completed", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["total"] == 1

    def test_list_runs_filter_by_score_range(self, client, auth_headers, db_session, experiment):
        """Only runs whose score lies inside [min_score, max_score] are counted."""
        high = _create_run(db_session, experiment)
        middle = _create_run(db_session, experiment)
        low = _create_run(db_session, experiment)
        _create_score(db_session, high, value=0.9)
        _create_score(db_session, middle, value=0.5)
        _create_score(db_session, low, value=0.2)

        response = client.get("/api/runs/?min_score=0.4&max_score=0.95", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["total"] == 2

    def test_list_runs_pagination(self, client, auth_headers, db_session, experiment):
        """limit/offset slice the items while total reports all matches."""
        for _ in range(5):
            _create_run(db_session, experiment)

        first_page = client.get("/api/runs/?limit=2&offset=0", headers=auth_headers)
        assert first_page.status_code == 200
        payload = first_page.json()
        assert len(payload["items"]) == 2
        assert payload["total"] == 5

        later_page = client.get("/api/runs/?limit=2&offset=3", headers=auth_headers)
        assert later_page.status_code == 200
        assert len(later_page.json()["items"]) == 2

    def test_list_runs_requires_auth(self, client, experiment):
        """A request without credentials is refused."""
        response = client.get("/api/runs/")
        assert response.status_code in (401, 403)
# ---------------------------------------------------------------------------
# Get run detail
# ---------------------------------------------------------------------------
class TestGetRunDetail:
    """GET /api/runs/{run_id}: detail payload and the 404 case."""

    def test_get_run_detail(self, client, auth_headers, db_session, experiment):
        """Stage results and scores are embedded in the detail response."""
        run = _create_run(db_session, experiment)
        _create_stage_result(db_session, run, stage_index=0)
        _create_score(db_session, run, scorer_name="keyword", value=0.85)

        response = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["id"] == str(run.id)

        stages = payload["stage_results"]
        assert len(stages) == 1
        assert stages[0]["stage_index"] == 0

        scores = payload["scores"]
        assert len(scores) == 1
        assert scores[0]["scorer_name"] == "keyword"

    def test_get_run_not_found(self, client, auth_headers, admin_user):
        """An unknown run id yields 404."""
        missing_id = uuid.uuid4()
        response = client.get(f"/api/runs/{missing_id}", headers=auth_headers)
        assert response.status_code == 404

    def test_get_run_detail_no_stages_or_scores(self, client, auth_headers, db_session, experiment):
        """A run with no children returns empty lists for both collections."""
        run = _create_run(db_session, experiment)

        response = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["stage_results"] == []
        assert payload["scores"] == []
# ---------------------------------------------------------------------------
# Ad-hoc run creation
# ---------------------------------------------------------------------------
class TestCreateRun:
    """POST /api/runs/: ad-hoc run creation with dispatch patched out."""

    @patch("routers.runs.dispatch_run")
    def test_create_adhoc_run(self, mock_dispatch, client, auth_headers, db_session, experiment):
        """A valid body creates a pending run and dispatches it by id."""
        run_config = {"prompt": "Hello", "model": "gpt-test", "params": {"temperature": 0.7}}
        request_body = {
            "experiment_id": str(experiment.id),
            "config": run_config,
        }

        response = client.post("/api/runs/", json=request_body, headers=auth_headers)
        assert response.status_code == 201
        payload = response.json()
        assert payload["experiment_id"] == str(experiment.id)
        assert payload["status"] == "pending"
        assert payload["config"]["prompt"] == "Hello"
        assert len(payload["config_hash"]) == 64
        mock_dispatch.assert_called_once_with(payload["id"])

    def test_create_run_experiment_not_found(self, client, auth_headers, admin_user):
        """An unknown experiment id yields 404."""
        request_body = {
            "experiment_id": str(uuid.uuid4()),
            "config": {"prompt": "Hello", "model": "test"},
        }
        response = client.post("/api/runs/", json=request_body, headers=auth_headers)
        assert response.status_code == 404

    @patch("routers.runs.dispatch_run")
    def test_create_run_config_hash_deterministic(self, mock_dispatch, client, auth_headers, db_session, experiment):
        """Posting the same config twice produces the same config_hash."""
        config = {"prompt": "Same prompt", "model": "same-model", "params": {}, "input_data": ""}
        request_body = {"experiment_id": str(experiment.id), "config": config}

        first = client.post("/api/runs/", json=request_body, headers=auth_headers)
        second = client.post("/api/runs/", json=request_body, headers=auth_headers)

        assert first.status_code == 201
        assert second.status_code == 201
        assert first.json()["config_hash"] == second.json()["config_hash"]
# ---------------------------------------------------------------------------
# Human scoring
# ---------------------------------------------------------------------------
class TestScoreRun:
    """POST /api/runs/{run_id}/score: human ratings."""

    def test_add_human_score(self, client, auth_headers, db_session, experiment):
        """A posted score is echoed back with its metadata and run id."""
        run = _create_run(db_session, experiment)
        rating = {"scorer_name": "human", "value": 0.9, "metadata": {"comment": "Great output"}}

        response = client.post(f"/api/runs/{run.id}/score", json=rating, headers=auth_headers)
        assert response.status_code == 201
        payload = response.json()
        assert payload["scorer_name"] == "human"
        assert payload["value"] == 0.9
        assert payload["scorer_metadata"]["comment"] == "Great output"
        assert payload["run_id"] == str(run.id)

    def test_score_run_not_found(self, client, auth_headers, admin_user):
        """Scoring an unknown run yields 404."""
        missing_id = uuid.uuid4()
        rating = {"scorer_name": "human", "value": 0.5}
        response = client.post(f"/api/runs/{missing_id}/score", json=rating, headers=auth_headers)
        assert response.status_code == 404

    def test_add_multiple_scores(self, client, auth_headers, db_session, experiment):
        """Two posted scores both appear on the run detail."""
        run = _create_run(db_session, experiment)
        score_url = f"/api/runs/{run.id}/score"
        client.post(score_url, json={"scorer_name": "human", "value": 0.8}, headers=auth_headers)
        client.post(score_url, json={"scorer_name": "keyword", "value": 0.6}, headers=auth_headers)

        detail = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert len(detail.json()["scores"]) == 2

    def test_score_requires_scorer_name(self, client, auth_headers, db_session, experiment):
        """A body without scorer_name fails validation with 422."""
        run = _create_run(db_session, experiment)
        incomplete = {"value": 0.5}
        response = client.post(f"/api/runs/{run.id}/score", json=incomplete, headers=auth_headers)
        assert response.status_code == 422
# ---------------------------------------------------------------------------
# Leaderboard
# ---------------------------------------------------------------------------
class TestLeaderboard:
    """GET /api/runs/leaderboard/{experiment_id}: ranking and weighting."""

    def test_leaderboard_basic(self, client, auth_headers, db_session, experiment):
        """Runs come back ordered from highest to lowest weighted score."""
        best = _create_run(db_session, experiment)
        middle = _create_run(db_session, experiment)
        worst = _create_run(db_session, experiment)

        _create_score(db_session, best, scorer_name="keyword", value=0.9)
        _create_score(db_session, best, scorer_name="format", value=0.8)
        _create_score(db_session, middle, scorer_name="keyword", value=0.5)
        _create_score(db_session, middle, scorer_name="format", value=0.6)
        _create_score(db_session, worst, scorer_name="keyword", value=0.3)
        _create_score(db_session, worst, scorer_name="format", value=0.4)

        response = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["experiment_id"] == str(experiment.id)

        ranked = payload["entries"]
        assert len(ranked) == 3
        # First entry should have highest weighted score
        assert ranked[0]["run_id"] == str(best.id)
        assert ranked[0]["weighted_score"] > ranked[1]["weighted_score"]

    def test_leaderboard_top_n(self, client, auth_headers, db_session, experiment):
        """top_n caps the number of returned entries."""
        for step in range(5):
            scored_run = _create_run(db_session, experiment)
            _create_score(db_session, scored_run, value=step * 0.2)

        response = client.get(f"/api/runs/leaderboard/{experiment.id}?top_n=3", headers=auth_headers)
        assert response.status_code == 200
        assert len(response.json()["entries"]) == 3

    def test_leaderboard_weighted_scores(self, client, auth_headers, db_session, experiment):
        """Experiment has weights: keyword=0.6, format=0.4."""
        keyword_only = _create_run(db_session, experiment)
        _create_score(db_session, keyword_only, scorer_name="keyword", value=1.0)
        _create_score(db_session, keyword_only, scorer_name="format", value=0.0)

        format_only = _create_run(db_session, experiment)
        _create_score(db_session, format_only, scorer_name="keyword", value=0.0)
        _create_score(db_session, format_only, scorer_name="format", value=1.0)

        response = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        ranked = response.json()["entries"]
        # keyword_only: keyword=1.0*0.6 + format=0.0*0.4 = 0.6
        # format_only:  keyword=0.0*0.6 + format=1.0*0.4 = 0.4
        assert ranked[0]["run_id"] == str(keyword_only.id)
        assert abs(ranked[0]["weighted_score"] - 0.6) < 0.001
        assert abs(ranked[1]["weighted_score"] - 0.4) < 0.001

    def test_leaderboard_equal_weights_no_config(self, client, auth_headers, db_session, project):
        """Experiment without scoring_config uses equal weighting."""
        from models import Experiment

        unweighted = Experiment(
            project_id=project.id,
            name="No Weights Experiment",
            scoring_config=None,
        )
        db_session.add(unweighted)
        db_session.commit()
        db_session.refresh(unweighted)

        run = _create_run(db_session, unweighted)
        _create_score(db_session, run, scorer_name="keyword", value=0.8)
        _create_score(db_session, run, scorer_name="format", value=0.6)

        response = client.get(f"/api/runs/leaderboard/{unweighted.id}", headers=auth_headers)
        ranked = response.json()["entries"]
        assert len(ranked) == 1
        # Equal weight: (0.8 + 0.6) / 2 = 0.7
        assert abs(ranked[0]["weighted_score"] - 0.7) < 0.001

    def test_leaderboard_experiment_not_found(self, client, auth_headers, admin_user):
        """An unknown experiment id yields 404."""
        missing_id = uuid.uuid4()
        response = client.get(f"/api/runs/leaderboard/{missing_id}", headers=auth_headers)
        assert response.status_code == 404

    def test_leaderboard_no_completed_runs(self, client, auth_headers, db_session, experiment):
        """Runs that are not completed never reach the leaderboard."""
        _create_run(db_session, experiment, status="pending")

        response = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["entries"] == []

    def test_leaderboard_skips_runs_without_scores(self, client, auth_headers, db_session, experiment):
        """Completed runs with no scores are left out."""
        _create_run(db_session, experiment)  # no scores
        scored = _create_run(db_session, experiment)
        _create_score(db_session, scored, value=0.7)

        response = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        ranked = response.json()["entries"]
        assert len(ranked) == 1
        assert ranked[0]["run_id"] == str(scored.id)

    def test_leaderboard_includes_run_metadata(self, client, auth_headers, db_session, experiment):
        """Each entry exposes the run's config and usage fields."""
        run = _create_run(db_session, experiment)
        _create_score(db_session, run, value=0.9)

        response = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        entry = response.json()["entries"][0]
        for field in ("config_hash", "config", "duration_ms", "tokens_in", "tokens_out"):
            assert field in entry