MAESTRO: Implement runs router with full CRUD, filtering, scoring, and leaderboard
- List runs with filtering by experiment, status, and score range plus pagination
- Get run detail with eager-loaded stage results and scores
- Ad-hoc single run creation with Celery/sync dispatch
- Human scoring endpoint (POST /{id}/score)
- Leaderboard endpoint with configurable weighted scoring from experiment scoring_config
- Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas
- 25 tests in test_runs.py, all passing (503 total tests passing)
This commit is contained in:
parent
e6c344d554
commit
b3fb8e3063
5 changed files with 759 additions and 30 deletions
|
|
@ -38,7 +38,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
|
||||||
- [x] Implement backend/routers/experiments.py fully — CRUD plus sweep control. POST /experiments/{id}/sweep should validate the sweep config, create Run records for all configurations, and dispatch to Celery. Pause/resume/stop should set Redis flags that the sweep runner checks between runs.
|
- [x] Implement backend/routers/experiments.py fully — CRUD plus sweep control. POST /experiments/{id}/sweep should validate the sweep config, create Run records for all configurations, and dispatch to Celery. Pause/resume/stop should set Redis flags that the sweep runner checks between runs.
|
||||||
<!-- Completed: Full CRUD (list with project filter, get, create, update, delete) + sweep control (start/pause/resume/stop + status). SweepRequest/SweepStatusResponse schemas added. Sweep dispatch via Celery/sync fallback. Redis flags for pause/resume/stop, with single-container mode fallback. 34 tests in test_experiments.py, all passing. -->
|
<!-- Completed: Full CRUD (list with project filter, get, create, update, delete) + sweep control (start/pause/resume/stop + status). SweepRequest/SweepStatusResponse schemas added. Sweep dispatch via Celery/sync fallback. Redis flags for pause/resume/stop, with single-container mode fallback. 34 tests in test_experiments.py, all passing. -->
|
||||||
|
|
||||||
- [ ] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
|
- [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
|
||||||
|
<!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
|
||||||
|
|
||||||
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
|
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,37 +1,281 @@
|
||||||
"""Runs router — execute, detail, score, and leaderboard."""
|
"""Runs router — list, detail, ad-hoc execution, human scoring, and leaderboard.
|
||||||
|
|
||||||
|
Provides filtering by experiment, status, and score range. The leaderboard
|
||||||
|
endpoint returns top N runs ranked by weighted score.
|
||||||
|
"""
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from fastapi import APIRouter, Response
|
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
||||||
|
from sqlalchemy.orm import Session, joinedload
|
||||||
|
|
||||||
|
from auth import get_current_user
|
||||||
|
from engine.cache import compute_config_hash
|
||||||
|
from engine.tasks import dispatch_run
|
||||||
|
from main import get_db
|
||||||
|
from models import Experiment, Run, RunStatus, Score, StageResult, User
|
||||||
|
from schemas import (
|
||||||
|
AdHocRunCreate,
|
||||||
|
LeaderboardEntry,
|
||||||
|
LeaderboardResponse,
|
||||||
|
RunDetailResponse,
|
||||||
|
RunListResponse,
|
||||||
|
RunResponse,
|
||||||
|
ScoreInput,
|
||||||
|
ScoreResponse,
|
||||||
|
StageResultResponse,
|
||||||
|
)
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
@router.get("/experiments/{experiment_id}/runs", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def list_runs(experiment_id: uuid.UUID):
|
# Helpers
|
||||||
"""List runs with scores (sortable, filterable)."""
|
# ---------------------------------------------------------------------------
|
||||||
return Response(status_code=501, content="Not Implemented")
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{run_id}", status_code=501)
|
def _get_run_or_404(db: Session, run_id: uuid.UUID) -> Run:
|
||||||
def get_run(run_id: uuid.UUID):
|
run = db.query(Run).filter(Run.id == run_id).first()
|
||||||
"""Run detail with stage results."""
|
if run is None:
|
||||||
return Response(status_code=501, content="Not Implemented")
|
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")
|
||||||
|
return run
|
||||||
|
|
||||||
|
|
||||||
@router.post("/", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def create_run():
|
# List runs with filtering
|
||||||
"""Execute a single run (ad-hoc)."""
|
# ---------------------------------------------------------------------------
|
||||||
return Response(status_code=501, content="Not Implemented")
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{run_id}/score", status_code=501)
|
@router.get("/", response_model=RunListResponse)
|
||||||
def score_run(run_id: uuid.UUID):
|
def list_runs(
|
||||||
"""Add human rating to a run."""
|
experiment_id: uuid.UUID | None = Query(None),
|
||||||
return Response(status_code=501, content="Not Implemented")
|
run_status: RunStatus | None = Query(None, alias="status"),
|
||||||
|
min_score: float | None = Query(None, ge=0.0, le=1.0),
|
||||||
|
max_score: float | None = Query(None, ge=0.0, le=1.0),
|
||||||
|
limit: int = Query(50, ge=1, le=500),
|
||||||
|
offset: int = Query(0, ge=0),
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
_user: User = Depends(get_current_user),
|
||||||
|
) -> RunListResponse:
|
||||||
|
"""List runs with optional filtering by experiment, status, and score range."""
|
||||||
|
query = db.query(Run)
|
||||||
|
|
||||||
|
if experiment_id is not None:
|
||||||
|
query = query.filter(Run.experiment_id == experiment_id)
|
||||||
|
|
||||||
|
if run_status is not None:
|
||||||
|
query = query.filter(Run.status == run_status)
|
||||||
|
|
||||||
|
# Score range filtering: filter runs whose average score falls within range
|
||||||
|
if min_score is not None or max_score is not None:
|
||||||
|
from sqlalchemy import func
|
||||||
|
|
||||||
|
score_subquery = (
|
||||||
|
db.query(Score.run_id, func.avg(Score.value).label("avg_score"))
|
||||||
|
.group_by(Score.run_id)
|
||||||
|
.subquery()
|
||||||
|
)
|
||||||
|
query = query.join(score_subquery, Run.id == score_subquery.c.run_id)
|
||||||
|
|
||||||
|
if min_score is not None:
|
||||||
|
query = query.filter(score_subquery.c.avg_score >= min_score)
|
||||||
|
if max_score is not None:
|
||||||
|
query = query.filter(score_subquery.c.avg_score <= max_score)
|
||||||
|
|
||||||
|
total = query.count()
|
||||||
|
runs = query.order_by(Run.started_at.desc().nullslast()).offset(offset).limit(limit).all()
|
||||||
|
|
||||||
|
return RunListResponse(
|
||||||
|
items=[RunResponse.model_validate(r) for r in runs],
|
||||||
|
total=total,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/experiments/{experiment_id}/leaderboard", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def leaderboard(experiment_id: uuid.UUID):
|
# Get run detail
|
||||||
"""Top runs ranked by weighted score."""
|
# ---------------------------------------------------------------------------
|
||||||
return Response(status_code=501, content="Not Implemented")
|
|
||||||
|
|
||||||
|
@router.get("/{run_id}", response_model=RunDetailResponse)
def get_run(
    run_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunDetailResponse:
    """Return one run together with its stage results and scores.

    Both collections are requested with ``joinedload`` on the run query.

    Raises:
        HTTPException: 404 when no run has the given id.
    """
    eager = (joinedload(Run.stage_results), joinedload(Run.scores))
    record = db.query(Run).options(*eager).filter(Run.id == run_id).first()
    if record is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run not found")

    # cost_estimate is coerced to float for the response; None passes through.
    cost = None if record.cost_estimate is None else float(record.cost_estimate)
    stage_payload = [StageResultResponse.model_validate(item) for item in record.stage_results]
    score_payload = [ScoreResponse.model_validate(item) for item in record.scores]

    return RunDetailResponse(
        id=record.id,
        experiment_id=record.experiment_id,
        config_hash=record.config_hash,
        config=record.config,
        status=record.status,
        started_at=record.started_at,
        completed_at=record.completed_at,
        duration_ms=record.duration_ms,
        tokens_in=record.tokens_in,
        tokens_out=record.tokens_out,
        cost_estimate=cost,
        stage_results=stage_payload,
        scores=score_payload,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Ad-hoc single run
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/", response_model=RunResponse, status_code=status.HTTP_201_CREATED)
def create_run(
    body: AdHocRunCreate,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> RunResponse:
    """Persist a single ad-hoc run for an existing experiment and dispatch it.

    The run is stored with status ``pending`` and committed before
    ``dispatch_run`` is called with the run id as a string.

    Raises:
        HTTPException: 404 when ``body.experiment_id`` matches no experiment.
    """
    parent = db.query(Experiment).filter(Experiment.id == body.experiment_id).first()
    if parent is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    submitted = body.config
    # Keys absent from the submitted config are hashed as empty values.
    digest = compute_config_hash(
        prompt=submitted.get("prompt", ""),
        model=submitted.get("model", ""),
        params=submitted.get("params", {}),
        input_data=submitted.get("input_data", ""),
    )

    new_run = Run(
        experiment_id=body.experiment_id,
        config=submitted,
        config_hash=digest,
        status=RunStatus.pending,
    )
    db.add(new_run)
    db.commit()
    db.refresh(new_run)

    # Hand the committed run over for execution.
    dispatch_run(str(new_run.id))

    return RunResponse.model_validate(new_run)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Human scoring
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{run_id}/score", response_model=ScoreResponse, status_code=status.HTTP_201_CREATED)
def score_run(
    run_id: uuid.UUID,
    body: ScoreInput,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> ScoreResponse:
    """Attach a new score row to an existing run and return it.

    Each call inserts a new ``Score``; existing scores for the run are left
    untouched.

    Raises:
        HTTPException: 404 (via ``_get_run_or_404``) when the run is missing.
    """
    target = _get_run_or_404(db, run_id)

    new_score = Score(
        run_id=target.id,
        scorer_name=body.scorer_name,
        value=body.value,
        scorer_metadata=body.metadata,
    )
    db.add(new_score)
    db.commit()
    db.refresh(new_score)

    return ScoreResponse.model_validate(new_score)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Leaderboard
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _scoring_weights(scoring_config: object) -> dict[str, float]:
    """Extract the usable scorer weights from an experiment's scoring_config.

    Returns an empty dict when the config is not a dict or has no dict under
    ``"weights"``. Entries whose weight is not a real number are dropped, so a
    malformed config degrades to equal weighting instead of raising.
    """
    if not isinstance(scoring_config, dict):
        return {}
    raw = scoring_config.get("weights")
    if not isinstance(raw, dict):
        return {}
    return {
        name: weight
        for name, weight in raw.items()
        # bool is an int subclass; a True/False weight is a config mistake.
        if isinstance(weight, (int, float)) and not isinstance(weight, bool)
    }


def _weighted_score(score_map: dict[str, float], weights: dict[str, float]) -> float:
    """Collapse a run's per-scorer values into a single ranking score.

    Returns the weighted mean over the scorers that have a weight. When the
    run's scorers carry no positive total weight (including when ``weights``
    is empty) the plain mean of all values is used. ``score_map`` must be
    non-empty.
    """
    total_weight = sum(weights.get(name, 0.0) for name in score_map)
    if total_weight > 0:
        weighted_sum = sum(
            value * weights[name]
            for name, value in score_map.items()
            if name in weights
        )
        return weighted_sum / total_weight
    return sum(score_map.values()) / len(score_map)


@router.get("/leaderboard/{experiment_id}", response_model=LeaderboardResponse)
def leaderboard(
    experiment_id: uuid.UUID,
    top_n: int = Query(10, ge=1, le=100),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
) -> LeaderboardResponse:
    """Return the top N completed runs of an experiment, best score first.

    Only runs with status ``completed`` and at least one score are ranked.
    The ranking score uses the ``weights`` mapping in the experiment's
    ``scoring_config`` when present, otherwise equal weighting across the
    run's scorers. ``total`` in the response is the number of entries
    returned, i.e. after the ``top_n`` cut.

    Raises:
        HTTPException: 404 when the experiment does not exist.
    """
    experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if experiment is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")

    weights = _scoring_weights(experiment.scoring_config)

    completed_runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment_id, Run.status == RunStatus.completed)
        .all()
    )

    entries: list[LeaderboardEntry] = []
    for run in completed_runs:
        if not run.scores:
            # Unscored runs cannot be ranked.
            continue

        # When a scorer appears more than once, the last score iterated wins.
        # NOTE(review): this query does not order run.scores, so "last" is
        # load order, not necessarily the most recent score — confirm.
        score_map: dict[str, float] = {s.scorer_name: s.value for s in run.scores}

        entries.append(LeaderboardEntry(
            run_id=run.id,
            config_hash=run.config_hash,
            config=run.config,
            status=run.status,
            weighted_score=_weighted_score(score_map, weights),
            scores=score_map,
            duration_ms=run.duration_ms,
            tokens_in=run.tokens_in,
            tokens_out=run.tokens_out,
        ))

    # Highest weighted score first, then keep the requested number of rows.
    entries.sort(key=lambda e: e.weighted_score, reverse=True)
    entries = entries[:top_n]

    return LeaderboardResponse(
        experiment_id=experiment_id,
        entries=entries,
        total=len(entries),
    )
|
||||||
|
|
|
||||||
|
|
@ -139,6 +139,35 @@ class RunListResponse(BaseModel):
|
||||||
total: int
|
total: int
|
||||||
|
|
||||||
|
|
||||||
|
class AdHocRunCreate(BaseModel):
    """Payload accepted when creating one ad-hoc run."""

    # Experiment the new run is attached to.
    experiment_id: uuid.UUID
    # Free-form run configuration, stored on the run as submitted.
    config: dict
|
||||||
|
|
||||||
|
|
||||||
|
class LeaderboardEntry(BaseModel):
    """One ranked run as it appears in a leaderboard response."""

    model_config = ConfigDict(from_attributes=True)

    # Identity and configuration of the ranked run.
    run_id: uuid.UUID
    config_hash: str
    config: dict
    status: RunStatus

    # Combined ranking score, plus the per-scorer values keyed by scorer name.
    weighted_score: float
    scores: dict[str, float] = Field(default_factory=dict)

    # Optional execution statistics copied from the run.
    duration_ms: int | None = None
    tokens_in: int | None = None
    tokens_out: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class LeaderboardResponse(BaseModel):
    """Leaderboard for one experiment: ranked entries plus their count."""

    experiment_id: uuid.UUID
    # Ranked runs.
    entries: list[LeaderboardEntry]
    # Entry count reported alongside the list.
    total: int
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# StageResult (read-only, returned inside Run details)
|
# StageResult (read-only, returned inside Run details)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -117,28 +117,29 @@ def test_experiments_stop(client):
|
||||||
# ---- Runs router (/api/runs) ----
|
# ---- Runs router (/api/runs) ----
|
||||||
|
|
||||||
def test_runs_list(client):
|
def test_runs_list(client):
|
||||||
resp = client.get("/api/runs/experiments/00000000-0000-0000-0000-000000000001/runs")
|
resp = client.get("/api/runs/")
|
||||||
assert resp.status_code == 501
|
# Runs router is now fully implemented and requires auth (returns 401 without credentials)
|
||||||
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_runs_get(client):
|
def test_runs_get(client):
|
||||||
resp = client.get("/api/runs/00000000-0000-0000-0000-000000000001")
|
resp = client.get("/api/runs/00000000-0000-0000-0000-000000000001")
|
||||||
assert resp.status_code == 501
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_runs_create(client):
|
def test_runs_create(client):
|
||||||
resp = client.post("/api/runs/")
|
resp = client.post("/api/runs/")
|
||||||
assert resp.status_code == 501
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_runs_score(client):
|
def test_runs_score(client):
|
||||||
resp = client.post("/api/runs/00000000-0000-0000-0000-000000000001/score")
|
resp = client.post("/api/runs/00000000-0000-0000-0000-000000000001/score")
|
||||||
assert resp.status_code == 501
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_runs_leaderboard(client):
|
def test_runs_leaderboard(client):
|
||||||
resp = client.get("/api/runs/experiments/00000000-0000-0000-0000-000000000001/leaderboard")
|
resp = client.get("/api/runs/leaderboard/00000000-0000-0000-0000-000000000001")
|
||||||
assert resp.status_code == 501
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
# ---- Endpoints router (/api/endpoints) ----
|
# ---- Endpoints router (/api/endpoints) ----
|
||||||
|
|
|
||||||
454
backend/tests/test_runs.py
Normal file
454
backend/tests/test_runs.py
Normal file
|
|
@ -0,0 +1,454 @@
|
||||||
|
"""Tests for backend/routers/runs.py — Run listing, detail, ad-hoc execution, scoring, and leaderboard."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
JWT_SECRET = "test-secret-key-for-jwt-signing"
|
||||||
|
API_KEY = "test-api-key-12345"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _isolate_settings(tmp_path):
    """Point every test at a throwaway SQLite database with Redis disabled.

    Environment overrides are applied, fresh settings are built from them and
    installed on ``config``, ``main`` and ``auth``, and the schema is created
    before the test body runs.
    """
    overrides = {
        "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}",
        "REDIS_URL": "",
        "DATA_DIR": str(tmp_path),
        "JWT_SECRET": JWT_SECRET,
        "API_KEY": API_KEY,
    }
    with patch.dict(os.environ, overrides, clear=False):
        import config

        # _env_file=None is passed so Settings is built without an env file.
        fresh = config.Settings(_env_file=None)
        config.settings = fresh

        import main

        main.settings = fresh
        main._init_db()
        main._init_redis()

        from models import Base

        Base.metadata.create_all(bind=main.engine)

        import auth

        auth.settings = fresh

        yield
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def db_session():
    """Yield a session obtained from ``main.get_db`` and finish the generator afterwards."""
    from main import get_db

    provider = get_db()
    yield next(provider)
    # Resume the generator once more so the code after its yield runs; the
    # default swallows the StopIteration that signals it has finished.
    next(provider, None)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def admin_user(db_session):
    """Insert and return an admin ``User`` named "admin"."""
    from auth import hash_password
    from models import User

    account = User(
        username="admin",
        password_hash=hash_password("adminpass"),
        is_admin=True,
    )
    db_session.add(account)
    db_session.commit()
    db_session.refresh(account)
    return account
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def project(db_session, admin_user):
    """Insert and return a ``Project`` owned by the admin user."""
    from models import Project

    record = Project(
        name="Test Project",
        description="A test project",
        owner_id=admin_user.id,
    )
    db_session.add(record)
    db_session.commit()
    db_session.refresh(record)
    return record
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def experiment(db_session, project):
    """Insert and return an ``Experiment`` with keyword/format scoring weights."""
    from models import Experiment

    # Weights used by the leaderboard: keyword 0.6, format 0.4.
    scoring = {"weights": {"keyword": 0.6, "format": 0.4}}
    record = Experiment(
        project_id=project.id,
        name="Test Experiment",
        description="An experiment for testing",
        scoring_config=scoring,
    )
    db_session.add(record)
    db_session.commit()
    db_session.refresh(record)
    return record
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def auth_headers():
    """Request headers carrying the test API key."""
    headers = {"X-Api-Key": API_KEY}
    return headers
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client():
    """A ``TestClient`` bound to the application from ``main``."""
    from main import app

    http = TestClient(app)
    return http
|
||||||
|
|
||||||
|
|
||||||
|
def _create_run(db_session, experiment, status="completed", config=None, config_hash=None):
    """Insert a ``Run`` for ``experiment`` directly in the DB and return it.

    Args:
        db_session: Session used to add, commit and refresh the row.
        experiment: Experiment whose id the run references.
        status: Run status value; ``completed_at`` and ``duration_ms`` are
            only filled in when this is ``"completed"``.
        config: Run config; a small prompt/model dict is used when falsy.
        config_hash: Hash to store; a unique 64-character hex string is
            generated when falsy.
    """
    from models import Run, RunStatus

    if not config_hash:
        # uuid4().hex is only 32 characters, so slicing it to 64 never yielded
        # a 64-character value; join two to get the hash length the API
        # tests assert for real config hashes.
        config_hash = uuid.uuid4().hex + uuid.uuid4().hex

    finished = status == "completed"
    run = Run(
        experiment_id=experiment.id,
        config=config or {"prompt": "test", "model": "gpt-test"},
        config_hash=config_hash,
        status=RunStatus(status),
        started_at=datetime.now(timezone.utc),
        completed_at=datetime.now(timezone.utc) if finished else None,
        duration_ms=1000 if finished else None,
        tokens_in=50,
        tokens_out=100,
    )
    db_session.add(run)
    db_session.commit()
    db_session.refresh(run)
    return run
|
||||||
|
|
||||||
|
|
||||||
|
def _create_score(db_session, run, scorer_name="keyword", value=0.8, metadata=None):
    """Insert a ``Score`` for ``run`` directly in the DB and return it refreshed."""
    from models import Score

    fields = {
        "run_id": run.id,
        "scorer_name": scorer_name,
        "value": value,
        "scorer_metadata": metadata,
    }
    row = Score(**fields)
    db_session.add(row)
    db_session.commit()
    db_session.refresh(row)
    return row
|
||||||
|
|
||||||
|
|
||||||
|
def _create_stage_result(db_session, run, stage_index=0):
    """Insert a fixed-content ``StageResult`` for ``run`` and return it refreshed."""
    from models import StageResult

    fields = {
        "run_id": run.id,
        "stage_index": stage_index,
        "prompt_sent": "What is 2+2?",
        "response_raw": "4",
        "model_used": "gpt-test",
        "tokens_in": 10,
        "tokens_out": 5,
        "latency_ms": 200,
    }
    row = StageResult(**fields)
    db_session.add(row)
    db_session.commit()
    db_session.refresh(row)
    return row
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# List runs
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestListRuns:
    """GET /api/runs/ — listing, filters, pagination and authentication."""

    def test_list_runs_empty(self, client, auth_headers, experiment):
        response = client.get("/api/runs/", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["items"] == []
        assert payload["total"] == 0

    def test_list_runs_returns_all(self, client, auth_headers, db_session, experiment):
        _create_run(db_session, experiment)
        _create_run(db_session, experiment, status="pending")

        response = client.get("/api/runs/", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["total"] == 2

    def test_list_runs_filter_by_experiment(self, client, auth_headers, db_session, experiment, project):
        from models import Experiment

        # A second experiment whose run must be excluded by the filter.
        other = Experiment(project_id=project.id, name="Other Experiment")
        db_session.add(other)
        db_session.commit()
        db_session.refresh(other)

        for owner in (experiment, other):
            _create_run(db_session, owner)

        response = client.get(f"/api/runs/?experiment_id={experiment.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["total"] == 1
        assert payload["items"][0]["experiment_id"] == str(experiment.id)

    def test_list_runs_filter_by_status(self, client, auth_headers, db_session, experiment):
        for state in ("completed", "failed", "pending"):
            _create_run(db_session, experiment, status=state)

        response = client.get("/api/runs/?status=completed", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["total"] == 1

    def test_list_runs_filter_by_score_range(self, client, auth_headers, db_session, experiment):
        scored_runs = [_create_run(db_session, experiment) for _ in range(3)]
        # Only 0.9 and 0.5 fall inside the requested 0.4-0.95 window.
        for scored, value in zip(scored_runs, (0.9, 0.5, 0.2)):
            _create_score(db_session, scored, value=value)

        response = client.get("/api/runs/?min_score=0.4&max_score=0.95", headers=auth_headers)
        assert response.status_code == 200
        assert response.json()["total"] == 2

    def test_list_runs_pagination(self, client, auth_headers, db_session, experiment):
        for _ in range(5):
            _create_run(db_session, experiment)

        first_page = client.get("/api/runs/?limit=2&offset=0", headers=auth_headers)
        assert first_page.status_code == 200
        first_payload = first_page.json()
        assert len(first_payload["items"]) == 2
        # total reports all matching runs, not just the page.
        assert first_payload["total"] == 5

        later_page = client.get("/api/runs/?limit=2&offset=3", headers=auth_headers)
        assert later_page.status_code == 200
        assert len(later_page.json()["items"]) == 2

    def test_list_runs_requires_auth(self, client, experiment):
        response = client.get("/api/runs/")
        assert response.status_code in (401, 403)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Get run detail
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetRunDetail:
    """GET /api/runs/{run_id} — detail payload and 404 handling."""

    def test_get_run_detail(self, client, auth_headers, db_session, experiment):
        target = _create_run(db_session, experiment)
        _create_stage_result(db_session, target, stage_index=0)
        _create_score(db_session, target, scorer_name="keyword", value=0.85)

        response = client.get(f"/api/runs/{target.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["id"] == str(target.id)

        stages = payload["stage_results"]
        assert len(stages) == 1
        assert stages[0]["stage_index"] == 0

        scores = payload["scores"]
        assert len(scores) == 1
        assert scores[0]["scorer_name"] == "keyword"

    def test_get_run_not_found(self, client, auth_headers, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/runs/{missing_id}", headers=auth_headers)
        assert response.status_code == 404

    def test_get_run_detail_no_stages_or_scores(self, client, auth_headers, db_session, experiment):
        target = _create_run(db_session, experiment)
        response = client.get(f"/api/runs/{target.id}", headers=auth_headers)
        assert response.status_code == 200
        payload = response.json()
        assert payload["stage_results"] == []
        assert payload["scores"] == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Ad-hoc run creation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestCreateRun:
    """POST /api/runs/ — ad-hoc run creation with dispatch patched out."""

    @patch("routers.runs.dispatch_run")
    def test_create_adhoc_run(self, mock_dispatch, client, auth_headers, db_session, experiment):
        run_config = {"prompt": "Hello", "model": "gpt-test", "params": {"temperature": 0.7}}
        request_body = {"experiment_id": str(experiment.id), "config": run_config}

        response = client.post("/api/runs/", json=request_body, headers=auth_headers)
        assert response.status_code == 201
        created = response.json()
        assert created["experiment_id"] == str(experiment.id)
        assert created["status"] == "pending"
        assert created["config"]["prompt"] == "Hello"
        assert len(created["config_hash"]) == 64
        # The router must dispatch exactly the run it just created.
        mock_dispatch.assert_called_once_with(created["id"])

    def test_create_run_experiment_not_found(self, client, auth_headers, admin_user):
        request_body = {
            "experiment_id": str(uuid.uuid4()),
            "config": {"prompt": "Hello", "model": "test"},
        }
        response = client.post("/api/runs/", json=request_body, headers=auth_headers)
        assert response.status_code == 404

    @patch("routers.runs.dispatch_run")
    def test_create_run_config_hash_deterministic(self, mock_dispatch, client, auth_headers, db_session, experiment):
        run_config = {"prompt": "Same prompt", "model": "same-model", "params": {}, "input_data": ""}
        request_body = {"experiment_id": str(experiment.id), "config": run_config}

        # Submitting the identical config twice must yield the same hash.
        first = client.post("/api/runs/", json=request_body, headers=auth_headers)
        second = client.post("/api/runs/", json=request_body, headers=auth_headers)
        assert first.status_code == 201
        assert second.status_code == 201
        assert first.json()["config_hash"] == second.json()["config_hash"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Human scoring
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreRun:
    """Tests for the human-scoring endpoint, POST /api/runs/{run_id}/score."""

    def test_add_human_score(self, client, auth_headers, db_session, experiment):
        """A posted score is echoed back with its metadata and the owning run id."""
        run = _create_run(db_session, experiment)
        body = {"scorer_name": "human", "value": 0.9, "metadata": {"comment": "Great output"}}
        resp = client.post(f"/api/runs/{run.id}/score", json=body, headers=auth_headers)
        assert resp.status_code == 201
        data = resp.json()
        assert data["scorer_name"] == "human"
        assert data["value"] == 0.9
        # The request field is "metadata" but the response exposes it as "scorer_metadata".
        assert data["scorer_metadata"]["comment"] == "Great output"
        assert data["run_id"] == str(run.id)

    def test_score_run_not_found(self, client, auth_headers, admin_user):
        """Scoring a run id that was never created returns 404."""
        fake_id = uuid.uuid4()
        body = {"scorer_name": "human", "value": 0.5}
        resp = client.post(f"/api/runs/{fake_id}/score", json=body, headers=auth_headers)
        assert resp.status_code == 404

    def test_add_multiple_scores(self, client, auth_headers, db_session, experiment):
        """Scores posted under different scorer names all show up on the run detail."""
        run = _create_run(db_session, experiment)
        score_url = f"/api/runs/{run.id}/score"
        payloads = (
            {"scorer_name": "human", "value": 0.8},
            {"scorer_name": "keyword", "value": 0.6},
        )
        for payload in payloads:
            resp = client.post(score_url, json=payload, headers=auth_headers)
            # Fail at the offending request instead of at the count check below.
            assert resp.status_code == 201, resp.text

        detail = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert detail.status_code == 200, detail.text
        assert len(detail.json()["scores"]) == 2

    def test_score_requires_scorer_name(self, client, auth_headers, db_session, experiment):
        """Omitting scorer_name from the payload is rejected with 422."""
        run = _create_run(db_session, experiment)
        body = {"value": 0.5}
        resp = client.post(f"/api/runs/{run.id}/score", json=body, headers=auth_headers)
        assert resp.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Leaderboard
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestLeaderboard:
    """Tests for the leaderboard endpoint, GET /api/runs/leaderboard/{experiment_id}."""

    def test_leaderboard_basic(self, client, auth_headers, db_session, experiment):
        """Runs come back ranked by weighted score, best first."""
        r1 = _create_run(db_session, experiment)
        r2 = _create_run(db_session, experiment)
        r3 = _create_run(db_session, experiment)

        _create_score(db_session, r1, scorer_name="keyword", value=0.9)
        _create_score(db_session, r1, scorer_name="format", value=0.8)
        _create_score(db_session, r2, scorer_name="keyword", value=0.5)
        _create_score(db_session, r2, scorer_name="format", value=0.6)
        _create_score(db_session, r3, scorer_name="keyword", value=0.3)
        _create_score(db_session, r3, scorer_name="format", value=0.4)

        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert data["experiment_id"] == str(experiment.id)
        entries = data["entries"]
        assert len(entries) == 3
        # r1 beats r2 and r2 beats r3 on both scorers, so the whole ranking is
        # determined, not just the first place.
        assert [e["run_id"] for e in entries] == [str(r1.id), str(r2.id), str(r3.id)]
        assert entries[0]["weighted_score"] > entries[1]["weighted_score"] > entries[2]["weighted_score"]

    def test_leaderboard_top_n(self, client, auth_headers, db_session, experiment):
        """top_n caps the number of entries and the capped list stays ranked."""
        for i in range(5):
            r = _create_run(db_session, experiment)
            _create_score(db_session, r, value=i * 0.2)

        resp = client.get(f"/api/runs/leaderboard/{experiment.id}?top_n=3", headers=auth_headers)
        assert resp.status_code == 200
        entries = resp.json()["entries"]
        assert len(entries) == 3
        # Truncation must not disturb the best-first ordering.
        scores = [e["weighted_score"] for e in entries]
        assert scores == sorted(scores, reverse=True)

    def test_leaderboard_weighted_scores(self, client, auth_headers, db_session, experiment):
        """Experiment has weights: keyword=0.6, format=0.4."""
        r1 = _create_run(db_session, experiment)
        _create_score(db_session, r1, scorer_name="keyword", value=1.0)
        _create_score(db_session, r1, scorer_name="format", value=0.0)

        r2 = _create_run(db_session, experiment)
        _create_score(db_session, r2, scorer_name="keyword", value=0.0)
        _create_score(db_session, r2, scorer_name="format", value=1.0)

        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        entries = resp.json()["entries"]
        assert len(entries) == 2
        # r1: keyword=1.0*0.6 + format=0.0*0.4 = 0.6
        # r2: keyword=0.0*0.6 + format=1.0*0.4 = 0.4
        assert entries[0]["run_id"] == str(r1.id)
        assert abs(entries[0]["weighted_score"] - 0.6) < 0.001
        assert abs(entries[1]["weighted_score"] - 0.4) < 0.001

    def test_leaderboard_equal_weights_no_config(self, client, auth_headers, db_session, project):
        """Experiment without scoring_config uses equal weighting."""
        from models import Experiment

        exp_no_weights = Experiment(
            project_id=project.id,
            name="No Weights Experiment",
            scoring_config=None,
        )
        db_session.add(exp_no_weights)
        db_session.commit()
        db_session.refresh(exp_no_weights)

        r = _create_run(db_session, exp_no_weights)
        _create_score(db_session, r, scorer_name="keyword", value=0.8)
        _create_score(db_session, r, scorer_name="format", value=0.6)

        resp = client.get(f"/api/runs/leaderboard/{exp_no_weights.id}", headers=auth_headers)
        assert resp.status_code == 200
        entries = resp.json()["entries"]
        assert len(entries) == 1
        # Equal weight: (0.8 + 0.6) / 2 = 0.7
        assert abs(entries[0]["weighted_score"] - 0.7) < 0.001

    def test_leaderboard_experiment_not_found(self, client, auth_headers, admin_user):
        """Requesting the leaderboard of an unknown experiment id returns 404."""
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/runs/leaderboard/{fake_id}", headers=auth_headers)
        assert resp.status_code == 404

    def test_leaderboard_no_completed_runs(self, client, auth_headers, db_session, experiment):
        """An experiment whose only run is still pending yields an empty leaderboard."""
        _create_run(db_session, experiment, status="pending")
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        assert resp.json()["entries"] == []

    def test_leaderboard_skips_runs_without_scores(self, client, auth_headers, db_session, experiment):
        """A run that has no scores is left out of the leaderboard."""
        _create_run(db_session, experiment)  # no scores
        r2 = _create_run(db_session, experiment)
        _create_score(db_session, r2, value=0.7)

        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        entries = resp.json()["entries"]
        assert len(entries) == 1
        assert entries[0]["run_id"] == str(r2.id)

    def test_leaderboard_includes_run_metadata(self, client, auth_headers, db_session, experiment):
        """Each leaderboard entry carries the run's config and usage fields."""
        r = _create_run(db_session, experiment)
        _create_score(db_session, r, value=0.9)

        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        entry = resp.json()["entries"][0]
        for field in ("config_hash", "config", "duration_ms", "tokens_in", "tokens_out"):
            assert field in entry, f"leaderboard entry is missing {field!r}"
|
||||||
Loading…
Add table
Reference in a new issue