MAESTRO: Implement export router with JSON, .env, YAML, and markdown report endpoints

Four fully authenticated endpoints at /api/export/experiments/{id}/:
- /best: Returns best config as JSON with weighted score and metadata
- /env: Flattened KEY=VALUE format with metadata comments
- /yaml: Simple YAML serialization (no external dependency)
- /report: Full markdown report with config space, top N configs,
  score distributions, token usage, and timing stats

34 tests in test_export.py covering all endpoints, auth, 404s, and helpers.
Updated test_routers.py to expect 401 (auth required) instead of 501 (stub).
This commit is contained in:
John Lightner 2026-04-07 03:30:45 -05:00
parent 32535a92ea
commit e42117c8ee
4 changed files with 768 additions and 18 deletions

View file

@ -41,7 +41,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
- [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score. - [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
<!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. --> <!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats. - [x] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
<!-- Completed: Full export router with 4 endpoints: /best (JSON with weighted score, metadata), /env (flattened KEY=VALUE with comments), /yaml (simple serializer, no PyYAML dependency), /report (markdown with config space, top N configs, score distributions, token usage, timing stats). Auth required on all endpoints. 34 tests in test_export.py, all passing. -->
- [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events). - [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events).

View file

@ -1,31 +1,387 @@
"""Export router — export experiment results in various formats.""" """Export router — export experiment results in various formats."""
import json
import uuid import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, Response from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
from sqlalchemy.orm import Session, joinedload
from auth import get_current_user
from main import get_db
from models import Experiment, Run, RunStatus, Score, StageResult, User
# Router that collects the export endpoints declared in this module.
router = APIRouter()
@router.get("/experiments/{experiment_id}/best", status_code=501) # ---------------------------------------------------------------------------
def export_best(experiment_id: uuid.UUID): # Helpers
# ---------------------------------------------------------------------------
def _get_experiment_or_404(db: Session, experiment_id: uuid.UUID) -> Experiment:
    """Load an experiment by id, or abort the request with a 404.

    Raises:
        HTTPException: 404 with detail "Experiment not found" when no row matches.
    """
    matching = db.query(Experiment).filter(Experiment.id == experiment_id)
    found = matching.first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")
def _get_scoring_weights(experiment: Experiment) -> dict[str, float]:
weights: dict[str, float] = {}
if experiment.scoring_config and isinstance(experiment.scoring_config, dict):
weights = experiment.scoring_config.get("weights", {})
return weights
def _compute_weighted_score(scores: list[Score], weights: dict[str, float]) -> float:
"""Compute weighted score for a run's scores."""
if not scores:
return 0.0
score_map: dict[str, float] = {}
for s in scores:
score_map[s.scorer_name] = s.value
if weights:
total_weight = sum(weights.get(name, 0.0) for name in score_map)
if total_weight > 0:
return sum(
score_map[name] * weights.get(name, 0.0)
for name in score_map
if name in weights
) / total_weight
else:
return sum(score_map.values()) / len(score_map)
else:
return sum(score_map.values()) / len(score_map)
def _get_best_run(db: Session, experiment: Experiment) -> Run | None:
    """Return the completed run with the highest weighted score, or None.

    Only runs whose status is ``RunStatus.completed`` and that carry at least
    one score are candidates. On a tie the run returned first by the query is
    kept. None is returned when there is no candidate.
    """
    weights = _get_scoring_weights(experiment)
    completed_runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment.id, Run.status == RunStatus.completed)
        .all()
    )

    best_run: Run | None = None
    best_score = 0.0
    for run in completed_runs:
        if not run.scores:
            continue
        candidate = _compute_weighted_score(run.scores, weights)
        # Test for "nothing chosen yet" instead of comparing against a -1.0
        # sentinel, so a run whose weighted score is -1 or lower can still win.
        if best_run is None or candidate > best_score:
            best_run = run
            best_score = candidate
    return best_run
def _build_best_config_payload(experiment: Experiment, run: Run, weights: dict[str, float]) -> dict:
    """Assemble the export document for ``run``: experiment metadata, scores and config."""
    per_scorer = {}
    for entry in run.scores:
        per_scorer[entry.scorer_name] = entry.value

    # Key order is kept deliberately: serializers iterate the dict in this order.
    payload: dict = {}
    payload["experiment_name"] = experiment.name
    payload["experiment_id"] = str(experiment.id)
    payload["exported_at"] = datetime.now(timezone.utc).isoformat()
    payload["weighted_score"] = _compute_weighted_score(run.scores, weights)
    payload["scores"] = per_scorer
    payload["run_id"] = str(run.id)
    payload["config_hash"] = run.config_hash
    payload["config"] = run.config
    return payload
# ---------------------------------------------------------------------------
# Export Best Config — JSON
# ---------------------------------------------------------------------------
@router.get("/experiments/{experiment_id}/best")
def export_best(
experiment_id: uuid.UUID,
db: Session = Depends(get_db),
_user: User = Depends(get_current_user),
):
"""Best config as JSON.""" """Best config as JSON."""
return Response(status_code=501, content="Not Implemented") experiment = _get_experiment_or_404(db, experiment_id)
best_run = _get_best_run(db, experiment)
if best_run is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="No completed runs with scores found",
)
weights = _get_scoring_weights(experiment)
payload = _build_best_config_payload(experiment, best_run, weights)
return payload
@router.get("/experiments/{experiment_id}/env", status_code=501) # ---------------------------------------------------------------------------
def export_env(experiment_id: uuid.UUID): # Export Best Config — .env
# ---------------------------------------------------------------------------
def _flatten_dict(d: dict, prefix: str = "") -> dict[str, str]:
"""Flatten nested dict into KEY=value pairs for .env format."""
items: dict[str, str] = {}
for k, v in d.items():
key = f"{prefix}{k}".upper() if prefix else k.upper()
if isinstance(v, dict):
items.update(_flatten_dict(v, f"{key}_"))
elif isinstance(v, list):
items[key] = json.dumps(v)
else:
items[key] = str(v)
return items
@router.get("/experiments/{experiment_id}/env")
def export_env(
experiment_id: uuid.UUID,
db: Session = Depends(get_db),
_user: User = Depends(get_current_user),
):
"""Best config as .env snippet.""" """Best config as .env snippet."""
return Response(status_code=501, content="Not Implemented") experiment = _get_experiment_or_404(db, experiment_id)
best_run = _get_best_run(db, experiment)
if best_run is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="No completed runs with scores found",
)
weights = _get_scoring_weights(experiment)
payload = _build_best_config_payload(experiment, best_run, weights)
lines = [
f"# PromptLooper — Best config for: {experiment.name}",
f"# Exported: {payload['exported_at']}",
f"# Weighted score: {payload['weighted_score']:.4f}",
f"# Run ID: {payload['run_id']}",
"",
]
flat = _flatten_dict(payload["config"])
for key, value in sorted(flat.items()):
lines.append(f"{key}={value}")
content = "\n".join(lines) + "\n"
return Response(content=content, media_type="text/plain")
@router.get("/experiments/{experiment_id}/yaml", status_code=501) # ---------------------------------------------------------------------------
def export_yaml(experiment_id: uuid.UUID): # Export Best Config — YAML
# ---------------------------------------------------------------------------
def _dict_to_yaml(d: dict, indent: int = 0) -> str:
"""Simple YAML serializer for config dicts (no external dependency)."""
lines: list[str] = []
prefix = " " * indent
for k, v in d.items():
if isinstance(v, dict):
lines.append(f"{prefix}{k}:")
lines.append(_dict_to_yaml(v, indent + 1))
elif isinstance(v, list):
lines.append(f"{prefix}{k}:")
for item in v:
if isinstance(item, dict):
lines.append(f"{prefix} -")
lines.append(_dict_to_yaml(item, indent + 2))
else:
lines.append(f"{prefix} - {item}")
elif isinstance(v, bool):
lines.append(f"{prefix}{k}: {'true' if v else 'false'}")
elif v is None:
lines.append(f"{prefix}{k}: null")
else:
lines.append(f"{prefix}{k}: {v}")
return "\n".join(lines)
@router.get("/experiments/{experiment_id}/yaml")
def export_yaml(
experiment_id: uuid.UUID,
db: Session = Depends(get_db),
_user: User = Depends(get_current_user),
):
"""Best config as YAML.""" """Best config as YAML."""
return Response(status_code=501, content="Not Implemented") experiment = _get_experiment_or_404(db, experiment_id)
best_run = _get_best_run(db, experiment)
if best_run is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="No completed runs with scores found",
)
weights = _get_scoring_weights(experiment)
payload = _build_best_config_payload(experiment, best_run, weights)
header = (
f"# PromptLooper — Best config for: {experiment.name}\n"
f"# Exported: {payload['exported_at']}\n"
f"# Weighted score: {payload['weighted_score']:.4f}\n"
f"# Run ID: {payload['run_id']}\n\n"
)
content = header + _dict_to_yaml(payload) + "\n"
return Response(content=content, media_type="text/yaml")
@router.get("/experiments/{experiment_id}/report", status_code=501) # ---------------------------------------------------------------------------
def export_report(experiment_id: uuid.UUID): # Export Report — Markdown
# ---------------------------------------------------------------------------
@router.get("/experiments/{experiment_id}/report")
def export_report(
experiment_id: uuid.UUID,
top_n: int = Query(5, ge=1, le=50),
db: Session = Depends(get_db),
_user: User = Depends(get_current_user),
):
"""Full experiment report (markdown).""" """Full experiment report (markdown)."""
return Response(status_code=501, content="Not Implemented") experiment = _get_experiment_or_404(db, experiment_id)
runs = (
db.query(Run)
.options(joinedload(Run.scores), joinedload(Run.stage_results))
.filter(Run.experiment_id == experiment_id)
.all()
)
weights = _get_scoring_weights(experiment)
completed = [r for r in runs if r.status == RunStatus.completed]
failed = [r for r in runs if r.status == RunStatus.failed]
# Compute scored entries
scored_entries: list[tuple[Run, float]] = []
for run in completed:
if run.scores:
ws = _compute_weighted_score(run.scores, weights)
scored_entries.append((run, ws))
scored_entries.sort(key=lambda e: e[1], reverse=True)
# Collect all scorer names
all_scorer_names: set[str] = set()
for run in completed:
for s in run.scores:
all_scorer_names.add(s.scorer_name)
# Score distributions per scorer
score_values: dict[str, list[float]] = {name: [] for name in sorted(all_scorer_names)}
for run in completed:
for s in run.scores:
score_values[s.scorer_name].append(s.value)
# Token and timing stats
total_tokens_in = sum(r.tokens_in or 0 for r in runs)
total_tokens_out = sum(r.tokens_out or 0 for r in runs)
durations = [r.duration_ms for r in completed if r.duration_ms is not None]
now = datetime.now(timezone.utc).isoformat()
lines: list[str] = []
lines.append(f"# Experiment Report: {experiment.name}")
lines.append("")
lines.append(f"**Generated:** {now} ")
lines.append(f"**Experiment ID:** `{experiment.id}` ")
if experiment.description:
lines.append(f"**Description:** {experiment.description} ")
lines.append(f"**Status:** {experiment.status.value if hasattr(experiment.status, 'value') else experiment.status} ")
lines.append("")
# Config space
lines.append("## Configuration Space")
lines.append("")
if experiment.parameter_space:
lines.append("```json")
lines.append(json.dumps(experiment.parameter_space, indent=2))
lines.append("```")
else:
lines.append("_No parameter space defined._")
lines.append("")
# Run summary
lines.append("## Run Summary")
lines.append("")
lines.append(f"| Metric | Value |")
lines.append(f"|--------|-------|")
lines.append(f"| Total runs | {len(runs)} |")
lines.append(f"| Completed | {len(completed)} |")
lines.append(f"| Failed | {len(failed)} |")
lines.append(f"| Scored | {len(scored_entries)} |")
lines.append("")
# Top N configs
lines.append(f"## Top {min(top_n, len(scored_entries))} Configurations")
lines.append("")
if scored_entries:
lines.append("| Rank | Run ID | Weighted Score | Config Hash |")
lines.append("|------|--------|---------------|-------------|")
for i, (run, ws) in enumerate(scored_entries[:top_n], 1):
lines.append(f"| {i} | `{str(run.id)[:8]}...` | {ws:.4f} | `{run.config_hash[:12]}...` |")
lines.append("")
# Detail for top entry
best_run, best_score = scored_entries[0]
lines.append("### Best Configuration Detail")
lines.append("")
lines.append("```json")
lines.append(json.dumps(best_run.config, indent=2))
lines.append("```")
lines.append("")
lines.append("**Scores:**")
lines.append("")
for s in best_run.scores:
lines.append(f"- **{s.scorer_name}:** {s.value:.4f}")
lines.append("")
else:
lines.append("_No scored runs available._")
lines.append("")
# Score distributions
if score_values:
lines.append("## Score Distributions")
lines.append("")
lines.append("| Scorer | Min | Max | Mean | Count |")
lines.append("|--------|-----|-----|------|-------|")
for name in sorted(score_values.keys()):
vals = score_values[name]
if vals:
lines.append(
f"| {name} | {min(vals):.4f} | {max(vals):.4f} | "
f"{sum(vals)/len(vals):.4f} | {len(vals)} |"
)
lines.append("")
# Token usage
lines.append("## Token Usage")
lines.append("")
lines.append(f"| Metric | Value |")
lines.append(f"|--------|-------|")
lines.append(f"| Total tokens in | {total_tokens_in:,} |")
lines.append(f"| Total tokens out | {total_tokens_out:,} |")
lines.append(f"| Total tokens | {total_tokens_in + total_tokens_out:,} |")
lines.append("")
# Timing stats
lines.append("## Timing")
lines.append("")
if durations:
avg_ms = sum(durations) / len(durations)
lines.append(f"| Metric | Value |")
lines.append(f"|--------|-------|")
lines.append(f"| Fastest run | {min(durations):,} ms |")
lines.append(f"| Slowest run | {max(durations):,} ms |")
lines.append(f"| Average | {avg_ms:,.0f} ms |")
lines.append(f"| Total time | {sum(durations):,} ms |")
else:
lines.append("_No timing data available._")
lines.append("")
content = "\n".join(lines)
return Response(content=content, media_type="text/markdown")

View file

@ -0,0 +1,393 @@
"""Tests for backend/routers/export.py — Export best config (JSON, .env, YAML) and report."""
import json
import os
import uuid
from datetime import datetime, timezone
from unittest.mock import patch
import pytest
from fastapi.testclient import TestClient
# JWT signing secret injected into the isolated test settings.
JWT_SECRET = "test-secret-key-for-jwt-signing"
# API key injected into the test settings and sent by the auth_header fixture.
API_KEY = "test-api-key-12345"
@pytest.fixture(autouse=True)
def _isolate_settings(tmp_path):
    """Ensure tests use a temp SQLite DB and no Redis.

    Runs for every test (autouse). Environment variables are patched for the
    duration of the test, fresh settings are built from them, and the new
    settings object is pushed into the modules that hold a reference to it.
    """
    env = {
        "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}",
        "REDIS_URL": "",
        "DATA_DIR": str(tmp_path),
        "JWT_SECRET": JWT_SECRET,
        "API_KEY": API_KEY,
    }
    with patch.dict(os.environ, env, clear=False):
        # Imports happen inside the patch so the settings are built from the patched environment.
        import config
        new_settings = config.Settings(_env_file=None)
        config.settings = new_settings
        import main
        main.settings = new_settings
        # Re-initialise DB and Redis handles against the new settings.
        main._init_db()
        main._init_redis()
        from models import Base
        Base.metadata.create_all(bind=main.engine)
        import auth
        auth.settings = new_settings
        # NOTE(review): yield is assumed to sit inside the patch so the env stays patched while the test runs — confirm against the original indentation.
        yield
@pytest.fixture
def db_session():
    """Yield a session obtained from the app's ``get_db`` generator, then let it finish."""
    from main import get_db

    session_gen = get_db()
    yield next(session_gen)
    # Advance the generator once more so its own cleanup code runs;
    # exhaustion is expected and ignored.
    next(session_gen, None)
@pytest.fixture
def admin_user(db_session):
    """Persist and return an admin user named "admin"."""
    from auth import hash_password
    from models import User

    admin = User(
        username="admin",
        password_hash=hash_password("adminpass"),
        is_admin=True,
    )
    db_session.add(admin)
    db_session.commit()
    db_session.refresh(admin)
    return admin
@pytest.fixture
def project(db_session, admin_user):
    """Persist and return a project owned by the admin user."""
    from models import Project

    fields = {
        "name": "Test Project",
        "description": "A test project",
        "owner_id": admin_user.id,
    }
    created = Project(**fields)
    db_session.add(created)
    db_session.commit()
    db_session.refresh(created)
    return created
@pytest.fixture
def experiment(db_session, project):
    """Persist and return an experiment with accuracy/fluency weights and a small parameter space."""
    from models import Experiment

    scoring = {"weights": {"accuracy": 0.7, "fluency": 0.3}}
    space = {"temperature": [0.1, 0.5, 0.9], "model": ["gpt-4", "gpt-3.5"]}
    created = Experiment(
        name="Test Experiment",
        description="An experiment for testing exports",
        project_id=project.id,
        scoring_config=scoring,
        parameter_space=space,
    )
    db_session.add(created)
    db_session.commit()
    db_session.refresh(created)
    return created
@pytest.fixture
def completed_runs(db_session, experiment):
    """Create 3 completed runs with scores; the first one scores highest."""
    from models import Run, RunStatus, Score

    # (config, [(scorer_name, value), ...]) per run, in creation order.
    specs = [
        ({"model": "gpt-4", "temperature": 0.1}, [("accuracy", 0.95), ("fluency", 0.80)]),
        ({"model": "gpt-4", "temperature": 0.5}, [("accuracy", 0.85), ("fluency", 0.90)]),
        ({"model": "gpt-3.5", "temperature": 0.9}, [("accuracy", 0.70), ("fluency", 0.60)]),
    ]

    created = []
    for index, (cfg, score_pairs) in enumerate(specs):
        run = Run(
            experiment_id=experiment.id,
            config=cfg,
            config_hash=f"hash_{index:03d}",
            status=RunStatus.completed,
            duration_ms=1000 + index * 500,
            tokens_in=100 + index * 50,
            tokens_out=200 + index * 100,
        )
        db_session.add(run)
        # Flush so the run has an id before its scores reference it.
        db_session.flush()
        for scorer_name, value in score_pairs:
            db_session.add(Score(run_id=run.id, scorer_name=scorer_name, value=value))
        created.append(run)

    db_session.commit()
    for run in created:
        db_session.refresh(run)
    return created
@pytest.fixture
def auth_header():
    """Headers that authenticate a request with the test API key."""
    headers = {}
    headers["X-Api-Key"] = API_KEY
    return headers
@pytest.fixture
def client():
    """Test client bound to the application object from ``main``."""
    import main

    return TestClient(main.app)
# ---------------------------------------------------------------------------
# Export Best — JSON
# ---------------------------------------------------------------------------
class TestExportBest:
    """GET /api/export/experiments/{id}/best."""

    def test_returns_best_config_json(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/best"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        body = response.json()
        assert body["experiment_name"] == "Test Experiment"
        best_config = body["config"]
        assert best_config["model"] == "gpt-4"
        assert best_config["temperature"] == 0.1
        assert body["weighted_score"] > 0
        for field in ("run_id", "config_hash", "exported_at"):
            assert field in body

    def test_best_uses_weighted_scores(self, client, auth_header, experiment, completed_runs):
        """Run 0 has accuracy=0.95, fluency=0.80. With weights 0.7/0.3, score = (0.95*0.7 + 0.80*0.3)/1.0 = 0.905."""
        url = f"/api/export/experiments/{experiment.id}/best"
        body = client.get(url, headers=auth_header).json()
        difference = abs(body["weighted_score"] - 0.905)
        assert difference < 0.001

    def test_best_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/best", headers=auth_header)
        assert response.status_code == 404

    def test_best_404_no_completed_runs(self, client, auth_header, experiment):
        url = f"/api/export/experiments/{experiment.id}/best"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 404
        detail = response.json()["detail"]
        assert "No completed runs" in detail

    def test_best_requires_auth(self, client, experiment):
        # No auth headers are sent.
        response = client.get(f"/api/export/experiments/{experiment.id}/best")
        assert response.status_code in (401, 403)
# ---------------------------------------------------------------------------
# Export Best — .env
# ---------------------------------------------------------------------------
class TestExportEnv:
    """GET /api/export/experiments/{id}/env."""

    def test_returns_env_format(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/env"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        assert response.headers["content-type"] == "text/plain; charset=utf-8"
        body = response.text
        for fragment in ("# PromptLooper", "MODEL=", "TEMPERATURE="):
            assert fragment in body

    def test_env_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/env"
        body = client.get(url, headers=auth_header).text
        assert "Test Experiment" in body
        assert "Weighted score" in body

    def test_env_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/env", headers=auth_header)
        assert response.status_code == 404

    def test_env_404_no_runs(self, client, auth_header, experiment):
        url = f"/api/export/experiments/{experiment.id}/env"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 404

    def test_env_requires_auth(self, client, experiment):
        # No auth headers are sent.
        response = client.get(f"/api/export/experiments/{experiment.id}/env")
        assert response.status_code in (401, 403)
# ---------------------------------------------------------------------------
# Export Best — YAML
# ---------------------------------------------------------------------------
class TestExportYaml:
    """GET /api/export/experiments/{id}/yaml."""

    def test_returns_yaml_format(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/yaml"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        assert "text/yaml" in response.headers["content-type"]
        body = response.text
        assert "experiment_name: Test Experiment" in body
        assert "config:" in body

    def test_yaml_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/yaml"
        body = client.get(url, headers=auth_header).text
        for fragment in ("# PromptLooper", "# Weighted score"):
            assert fragment in body

    def test_yaml_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/yaml", headers=auth_header)
        assert response.status_code == 404

    def test_yaml_404_no_runs(self, client, auth_header, experiment):
        url = f"/api/export/experiments/{experiment.id}/yaml"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 404

    def test_yaml_requires_auth(self, client, experiment):
        # No auth headers are sent.
        response = client.get(f"/api/export/experiments/{experiment.id}/yaml")
        assert response.status_code in (401, 403)
# ---------------------------------------------------------------------------
# Export Report — Markdown
# ---------------------------------------------------------------------------
class TestExportReport:
    """GET /api/export/experiments/{id}/report."""

    def test_returns_markdown_report(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        assert "text/markdown" in response.headers["content-type"]
        assert "# Experiment Report: Test Experiment" in response.text

    def test_report_contains_config_space(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        assert "## Configuration Space" in body
        assert "temperature" in body

    def test_report_contains_top_configs(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        assert "## Top" in body
        assert "Weighted Score" in body

    def test_report_contains_score_distributions(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        for fragment in ("## Score Distributions", "accuracy", "fluency"):
            assert fragment in body

    def test_report_contains_token_usage(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        assert "## Token Usage" in body
        assert "Total tokens in" in body

    def test_report_contains_timing(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        assert "## Timing" in body
        assert "Fastest run" in body

    def test_report_run_summary(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        for fragment in ("## Run Summary", "Total runs", "Completed"):
            assert fragment in body

    def test_report_custom_top_n(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report?top_n=2"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        assert "## Top 2 Configurations" in response.text

    def test_report_empty_experiment(self, client, auth_header, experiment):
        """Report should work even with no runs."""
        url = f"/api/export/experiments/{experiment.id}/report"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        body = response.text
        assert "Total runs | 0" in body
        assert "_No scored runs available._" in body

    def test_report_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/report", headers=auth_header)
        assert response.status_code == 404

    def test_report_requires_auth(self, client, experiment):
        # No auth headers are sent.
        response = client.get(f"/api/export/experiments/{experiment.id}/report")
        assert response.status_code in (401, 403)

    def test_report_with_failed_runs(self, client, auth_header, experiment, completed_runs, db_session):
        """Report should count failed runs separately."""
        from models import Run, RunStatus

        failed_run = Run(
            experiment_id=experiment.id,
            config={"model": "bad", "temperature": 0.5},
            config_hash="hash_fail",
            status=RunStatus.failed,
        )
        db_session.add(failed_run)
        db_session.commit()

        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        assert "Total runs | 4" in body
        assert "Failed | 1" in body

    def test_report_description_shown(self, client, auth_header, experiment, completed_runs):
        url = f"/api/export/experiments/{experiment.id}/report"
        body = client.get(url, headers=auth_header).text
        assert "An experiment for testing exports" in body
# ---------------------------------------------------------------------------
# Helper function tests
# ---------------------------------------------------------------------------
class TestHelpers:
    """Direct tests of the serialization helpers in routers.export."""

    def test_flatten_dict_simple(self):
        from routers import export

        flattened = export._flatten_dict({"model": "gpt-4", "temperature": 0.5})
        expected = {"MODEL": "gpt-4", "TEMPERATURE": "0.5"}
        assert flattened == expected

    def test_flatten_dict_nested(self):
        from routers import export

        flattened = export._flatten_dict({"llm": {"model": "gpt-4", "temp": 0.1}})
        expected = {"LLM_MODEL": "gpt-4", "LLM_TEMP": "0.1"}
        assert flattened == expected

    def test_flatten_dict_list(self):
        from routers import export

        flattened = export._flatten_dict({"tags": ["a", "b"]})
        assert flattened == {"TAGS": '["a", "b"]'}

    def test_dict_to_yaml_simple(self):
        from routers import export

        rendered = export._dict_to_yaml({"name": "test", "value": 42})
        for fragment in ("name: test", "value: 42"):
            assert fragment in rendered

    def test_dict_to_yaml_nested(self):
        from routers import export

        rendered = export._dict_to_yaml({"config": {"model": "gpt-4"}})
        assert "config:" in rendered
        assert " model: gpt-4" in rendered

    def test_dict_to_yaml_bool_and_none(self):
        from routers import export

        rendered = export._dict_to_yaml({"enabled": True, "disabled": False, "empty": None})
        for fragment in ("enabled: true", "disabled: false", "empty: null"):
            assert fragment in rendered

View file

@ -174,22 +174,22 @@ def test_endpoints_test(client):
def test_export_best(client):
    """An unauthenticated request to the /best export is rejected with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/best")
    assert resp.status_code == 401
def test_export_env(client):
    """An unauthenticated request to the /env export is rejected with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/env")
    assert resp.status_code == 401
def test_export_yaml(client):
    """An unauthenticated request to the /yaml export is rejected with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/yaml")
    assert resp.status_code == 401
def test_export_report(client):
    """An unauthenticated request to the /report export is rejected with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/report")
    assert resp.status_code == 401
# ---- Webhooks router (/api/webhooks) ---- # ---- Webhooks router (/api/webhooks) ----