"""Tests for backend/routers/runs.py — Run listing, detail, ad-hoc execution, scoring, and leaderboard.""" import os import uuid from datetime import datetime, timezone from unittest.mock import MagicMock, patch import pytest from fastapi.testclient import TestClient JWT_SECRET = "test-secret-key-for-jwt-signing" API_KEY = "test-api-key-12345" @pytest.fixture(autouse=True) def _isolate_settings(tmp_path): """Ensure tests use a temp SQLite DB and no Redis.""" env = { "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}", "REDIS_URL": "", "DATA_DIR": str(tmp_path), "JWT_SECRET": JWT_SECRET, "API_KEY": API_KEY, } with patch.dict(os.environ, env, clear=False): import config new_settings = config.Settings(_env_file=None) config.settings = new_settings import main main.settings = new_settings main._init_db() main._init_redis() from models import Base Base.metadata.create_all(bind=main.engine) import auth auth.settings = new_settings yield @pytest.fixture def db_session(): from main import get_db gen = get_db() session = next(gen) yield session try: next(gen) except StopIteration: pass @pytest.fixture def admin_user(db_session): from auth import hash_password from models import User user = User(username="admin", password_hash=hash_password("adminpass"), is_admin=True) db_session.add(user) db_session.commit() db_session.refresh(user) return user @pytest.fixture def project(db_session, admin_user): from models import Project proj = Project(name="Test Project", description="A test project", owner_id=admin_user.id) db_session.add(proj) db_session.commit() db_session.refresh(proj) return proj @pytest.fixture def experiment(db_session, project): from models import Experiment exp = Experiment( project_id=project.id, name="Test Experiment", description="An experiment for testing", scoring_config={"weights": {"keyword": 0.6, "format": 0.4}}, ) db_session.add(exp) db_session.commit() db_session.refresh(exp) return exp @pytest.fixture def auth_headers(): return {"X-Api-Key": API_KEY} 
@pytest.fixture
def client():
    """Return a ``TestClient`` wrapping the application object from ``main``."""
    from main import app

    return TestClient(app)


def _create_run(db_session, experiment, status="completed", config=None, config_hash=None):
    """Helper to create a Run directly in the DB.

    Args:
        db_session: Session used to add, commit and refresh the row.
        experiment: Experiment whose ``id`` the run is attached to.
        status: Value passed to ``RunStatus``; only ``"completed"`` runs get
            ``completed_at`` and ``duration_ms`` populated.
        config: Run configuration; a small prompt/model dict when falsy.
        config_hash: Hash to store; a random 64-character hex string when falsy.

    Returns:
        The refreshed ``Run`` instance.
    """
    from models import Run, RunStatus

    is_completed = status == "completed"
    now = datetime.now(timezone.utc)
    # uuid4().hex is only 32 characters, so two are joined to get a
    # 64-character default, matching the hash length the API is asserted to
    # return in test_create_adhoc_run.
    default_hash = uuid.uuid4().hex + uuid.uuid4().hex
    run = Run(
        experiment_id=experiment.id,
        config=config or {"prompt": "test", "model": "gpt-test"},
        config_hash=config_hash or default_hash,
        status=RunStatus(status),
        started_at=now,
        completed_at=now if is_completed else None,
        duration_ms=1000 if is_completed else None,
        tokens_in=50,
        tokens_out=100,
    )
    db_session.add(run)
    db_session.commit()
    db_session.refresh(run)
    return run


def _create_score(db_session, run, scorer_name="keyword", value=0.8, metadata=None):
    """Helper to create a Score directly in the DB.

    ``metadata`` is stored on the model's ``scorer_metadata`` attribute.
    Returns the refreshed ``Score`` instance.
    """
    from models import Score

    score = Score(
        run_id=run.id,
        scorer_name=scorer_name,
        value=value,
        scorer_metadata=metadata,
    )
    db_session.add(score)
    db_session.commit()
    db_session.refresh(score)
    return score


def _create_stage_result(db_session, run, stage_index=0):
    """Helper to create a StageResult directly in the DB.

    All fields other than ``run_id`` and ``stage_index`` are fixed sample
    values. Returns the refreshed ``StageResult`` instance.
    """
    from models import StageResult

    sr = StageResult(
        run_id=run.id,
        stage_index=stage_index,
        prompt_sent="What is 2+2?",
        response_raw="4",
        model_used="gpt-test",
        tokens_in=10,
        tokens_out=5,
        latency_ms=200,
    )
    db_session.add(sr)
    db_session.commit()
    db_session.refresh(sr)
    return sr


# ---------------------------------------------------------------------------
# List runs
# ---------------------------------------------------------------------------


class TestListRuns:
    """GET /api/runs/ — listing, filtering, pagination and auth."""

    def test_list_runs_empty(self, client, auth_headers, experiment):
        """With no runs stored, the listing is empty and total is 0."""
        resp = client.get("/api/runs/", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert data["items"] == []
        assert data["total"] == 0

    def test_list_runs_returns_all(self, client, auth_headers, db_session, experiment):
        """Runs are counted regardless of their status."""
        _create_run(db_session, experiment)
        _create_run(db_session, experiment, status="pending")
        resp = client.get("/api/runs/", headers=auth_headers)
        assert resp.status_code == 200
        assert resp.json()["total"] == 2

    def test_list_runs_filter_by_experiment(self, client, auth_headers, db_session, experiment, project):
        """``experiment_id`` restricts the listing to that experiment's runs."""
        from models import Experiment

        exp2 = Experiment(project_id=project.id, name="Other Experiment")
        db_session.add(exp2)
        db_session.commit()
        db_session.refresh(exp2)
        _create_run(db_session, experiment)
        _create_run(db_session, exp2)
        resp = client.get(f"/api/runs/?experiment_id={experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert data["total"] == 1
        assert data["items"][0]["experiment_id"] == str(experiment.id)

    def test_list_runs_filter_by_status(self, client, auth_headers, db_session, experiment):
        """``status=completed`` returns only the completed run."""
        _create_run(db_session, experiment, status="completed")
        _create_run(db_session, experiment, status="failed")
        _create_run(db_session, experiment, status="pending")
        resp = client.get("/api/runs/?status=completed", headers=auth_headers)
        assert resp.status_code == 200
        assert resp.json()["total"] == 1

    def test_list_runs_filter_by_score_range(self, client, auth_headers, db_session, experiment):
        """Of scores 0.9 / 0.5 / 0.2, only two fall within 0.4–0.95."""
        r1 = _create_run(db_session, experiment)
        r2 = _create_run(db_session, experiment)
        r3 = _create_run(db_session, experiment)
        _create_score(db_session, r1, value=0.9)
        _create_score(db_session, r2, value=0.5)
        _create_score(db_session, r3, value=0.2)
        resp = client.get("/api/runs/?min_score=0.4&max_score=0.95", headers=auth_headers)
        assert resp.status_code == 200
        assert resp.json()["total"] == 2

    def test_list_runs_pagination(self, client, auth_headers, db_session, experiment):
        """``limit``/``offset`` page the items while ``total`` stays at 5."""
        for _ in range(5):
            _create_run(db_session, experiment)
        resp = client.get("/api/runs/?limit=2&offset=0", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert len(data["items"]) == 2
        assert data["total"] == 5
        resp2 = client.get("/api/runs/?limit=2&offset=3", headers=auth_headers)
        assert resp2.status_code == 200
        assert len(resp2.json()["items"]) == 2

    def test_list_runs_requires_auth(self, client, experiment):
        """A request without the API key header is rejected with 401 or 403."""
        resp = client.get("/api/runs/")
        assert resp.status_code in (401, 403)


# ---------------------------------------------------------------------------
# Get run detail
# ---------------------------------------------------------------------------


class TestGetRunDetail:
    """GET /api/runs/{run_id} — run detail with stage results and scores."""

    def test_get_run_detail(self, client, auth_headers, db_session, experiment):
        """The detail payload embeds the run's stage results and scores."""
        run = _create_run(db_session, experiment)
        _create_stage_result(db_session, run, stage_index=0)
        _create_score(db_session, run, scorer_name="keyword", value=0.85)
        resp = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert data["id"] == str(run.id)
        assert len(data["stage_results"]) == 1
        assert data["stage_results"][0]["stage_index"] == 0
        assert len(data["scores"]) == 1
        assert data["scores"][0]["scorer_name"] == "keyword"

    def test_get_run_not_found(self, client, auth_headers, admin_user):
        """An unknown run id yields 404."""
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/runs/{fake_id}", headers=auth_headers)
        assert resp.status_code == 404

    def test_get_run_detail_no_stages_or_scores(self, client, auth_headers, db_session, experiment):
        """A bare run reports empty ``stage_results`` and ``scores`` lists."""
        run = _create_run(db_session, experiment)
        resp = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert data["stage_results"] == []
        assert data["scores"] == []


# ---------------------------------------------------------------------------
# Ad-hoc run creation
# ---------------------------------------------------------------------------


class TestCreateRun:
    """POST /api/runs/ — ad-hoc run creation with ``dispatch_run`` patched out."""

    @patch("routers.runs.dispatch_run")
    def test_create_adhoc_run(self, mock_dispatch, client, auth_headers, db_session, experiment):
        """A new run is pending, echoes its config, and is dispatched once by id."""
        body = {
            "experiment_id": str(experiment.id),
            "config": {"prompt": "Hello", "model": "gpt-test", "params": {"temperature": 0.7}},
        }
        resp = client.post("/api/runs/", json=body, headers=auth_headers)
        assert resp.status_code == 201
        data = resp.json()
        assert data["experiment_id"] == str(experiment.id)
        assert data["status"] == "pending"
        assert data["config"]["prompt"] == "Hello"
        assert len(data["config_hash"]) == 64
        mock_dispatch.assert_called_once_with(data["id"])

    def test_create_run_experiment_not_found(self, client, auth_headers, admin_user):
        """Posting against an unknown experiment id yields 404."""
        body = {
            "experiment_id": str(uuid.uuid4()),
            "config": {"prompt": "Hello", "model": "test"},
        }
        resp = client.post("/api/runs/", json=body, headers=auth_headers)
        assert resp.status_code == 404

    @patch("routers.runs.dispatch_run")
    def test_create_run_config_hash_deterministic(self, mock_dispatch, client, auth_headers, db_session, experiment):
        """Posting the same config twice produces the same ``config_hash``."""
        config = {"prompt": "Same prompt", "model": "same-model", "params": {}, "input_data": ""}
        body = {"experiment_id": str(experiment.id), "config": config}
        resp1 = client.post("/api/runs/", json=body, headers=auth_headers)
        resp2 = client.post("/api/runs/", json=body, headers=auth_headers)
        assert resp1.status_code == 201
        assert resp2.status_code == 201
        assert resp1.json()["config_hash"] == resp2.json()["config_hash"]


# ---------------------------------------------------------------------------
# Human scoring
# ---------------------------------------------------------------------------


class TestScoreRun:
    """POST /api/runs/{run_id}/score — attaching scores to a run."""

    def test_add_human_score(self, client, auth_headers, db_session, experiment):
        """A posted score is echoed back with its metadata and run id."""
        run = _create_run(db_session, experiment)
        body = {"scorer_name": "human", "value": 0.9, "metadata": {"comment": "Great output"}}
        resp = client.post(f"/api/runs/{run.id}/score", json=body, headers=auth_headers)
        assert resp.status_code == 201
        data = resp.json()
        assert data["scorer_name"] == "human"
        assert data["value"] == 0.9
        assert data["scorer_metadata"]["comment"] == "Great output"
        assert data["run_id"] == str(run.id)

    def test_score_run_not_found(self, client, auth_headers, admin_user):
        """Scoring an unknown run id yields 404."""
        fake_id = uuid.uuid4()
        body = {"scorer_name": "human", "value": 0.5}
        resp = client.post(f"/api/runs/{fake_id}/score", json=body, headers=auth_headers)
        assert resp.status_code == 404

    def test_add_multiple_scores(self, client, auth_headers, db_session, experiment):
        """Two scores posted to one run both appear in the run detail."""
        run = _create_run(db_session, experiment)
        first = client.post(
            f"/api/runs/{run.id}/score",
            json={"scorer_name": "human", "value": 0.8},
            headers=auth_headers,
        )
        second = client.post(
            f"/api/runs/{run.id}/score",
            json={"scorer_name": "keyword", "value": 0.6},
            headers=auth_headers,
        )
        # Check the writes themselves so a rejected POST is reported as such
        # rather than as a wrong score count further down.
        assert first.status_code == 201
        assert second.status_code == 201
        detail = client.get(f"/api/runs/{run.id}", headers=auth_headers)
        assert detail.status_code == 200
        assert len(detail.json()["scores"]) == 2

    def test_score_requires_scorer_name(self, client, auth_headers, db_session, experiment):
        """A body without ``scorer_name`` fails validation with 422."""
        run = _create_run(db_session, experiment)
        body = {"value": 0.5}
        resp = client.post(f"/api/runs/{run.id}/score", json=body, headers=auth_headers)
        assert resp.status_code == 422


# ---------------------------------------------------------------------------
# Leaderboard
# ---------------------------------------------------------------------------


class TestLeaderboard:
    """GET /api/runs/leaderboard/{experiment_id} — ranking runs by weighted score."""

    def test_leaderboard_basic(self, client, auth_headers, db_session, experiment):
        """Three scored runs produce three entries, best run first."""
        r1 = _create_run(db_session, experiment)
        r2 = _create_run(db_session, experiment)
        r3 = _create_run(db_session, experiment)
        _create_score(db_session, r1, scorer_name="keyword", value=0.9)
        _create_score(db_session, r1, scorer_name="format", value=0.8)
        _create_score(db_session, r2, scorer_name="keyword", value=0.5)
        _create_score(db_session, r2, scorer_name="format", value=0.6)
        _create_score(db_session, r3, scorer_name="keyword", value=0.3)
        _create_score(db_session, r3, scorer_name="format", value=0.4)
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert data["experiment_id"] == str(experiment.id)
        assert len(data["entries"]) == 3
        # First entry should have highest weighted score
        assert data["entries"][0]["run_id"] == str(r1.id)
        assert data["entries"][0]["weighted_score"] > data["entries"][1]["weighted_score"]

    def test_leaderboard_top_n(self, client, auth_headers, db_session, experiment):
        """``top_n=3`` caps the entries at three out of five scored runs."""
        for i in range(5):
            r = _create_run(db_session, experiment)
            _create_score(db_session, r, value=i * 0.2)
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}?top_n=3", headers=auth_headers)
        assert resp.status_code == 200
        assert len(resp.json()["entries"]) == 3

    def test_leaderboard_weighted_scores(self, client, auth_headers, db_session, experiment):
        """Experiment has weights: keyword=0.6, format=0.4."""
        r1 = _create_run(db_session, experiment)
        _create_score(db_session, r1, scorer_name="keyword", value=1.0)
        _create_score(db_session, r1, scorer_name="format", value=0.0)
        r2 = _create_run(db_session, experiment)
        _create_score(db_session, r2, scorer_name="keyword", value=0.0)
        _create_score(db_session, r2, scorer_name="format", value=1.0)
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        # r1: keyword=1.0*0.6 + format=0.0*0.4 = 0.6
        # r2: keyword=0.0*0.6 + format=1.0*0.4 = 0.4
        assert data["entries"][0]["run_id"] == str(r1.id)
        assert abs(data["entries"][0]["weighted_score"] - 0.6) < 0.001
        assert abs(data["entries"][1]["weighted_score"] - 0.4) < 0.001

    def test_leaderboard_equal_weights_no_config(self, client, auth_headers, db_session, project):
        """Experiment without scoring_config uses equal weighting."""
        from models import Experiment

        exp_no_weights = Experiment(
            project_id=project.id,
            name="No Weights Experiment",
            scoring_config=None,
        )
        db_session.add(exp_no_weights)
        db_session.commit()
        db_session.refresh(exp_no_weights)
        r = _create_run(db_session, exp_no_weights)
        _create_score(db_session, r, scorer_name="keyword", value=0.8)
        _create_score(db_session, r, scorer_name="format", value=0.6)
        resp = client.get(f"/api/runs/leaderboard/{exp_no_weights.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert len(data["entries"]) == 1
        # Equal weight: (0.8 + 0.6) / 2 = 0.7
        assert abs(data["entries"][0]["weighted_score"] - 0.7) < 0.001

    def test_leaderboard_experiment_not_found(self, client, auth_headers, admin_user):
        """An unknown experiment id yields 404."""
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/runs/leaderboard/{fake_id}", headers=auth_headers)
        assert resp.status_code == 404

    def test_leaderboard_no_completed_runs(self, client, auth_headers, db_session, experiment):
        """With only a pending run, the leaderboard has no entries."""
        _create_run(db_session, experiment, status="pending")
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        assert resp.json()["entries"] == []

    def test_leaderboard_skips_runs_without_scores(self, client, auth_headers, db_session, experiment):
        """A completed run that has no scores is left off the leaderboard."""
        _create_run(db_session, experiment)  # no scores
        r2 = _create_run(db_session, experiment)
        _create_score(db_session, r2, value=0.7)
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        data = resp.json()
        assert len(data["entries"]) == 1
        assert data["entries"][0]["run_id"] == str(r2.id)

    def test_leaderboard_includes_run_metadata(self, client, auth_headers, db_session, experiment):
        """Each entry exposes the run's config, hash, duration and token counts."""
        r = _create_run(db_session, experiment)
        _create_score(db_session, r, value=0.9)
        resp = client.get(f"/api/runs/leaderboard/{experiment.id}", headers=auth_headers)
        assert resp.status_code == 200
        entry = resp.json()["entries"][0]
        assert "config_hash" in entry
        assert "config" in entry
        assert "duration_ms" in entry
        assert "tokens_in" in entry
        assert "tokens_out" in entry