MAESTRO: Implement export router with JSON, .env, YAML, and markdown report endpoints
Four fully authenticated endpoints at /api/export/experiments/{id}/:
- /best: Returns best config as JSON with weighted score and metadata
- /env: Flattened KEY=VALUE format with metadata comments
- /yaml: Simple YAML serialization (no external dependency)
- /report: Full markdown report with config space, top N configs,
score distributions, token usage, and timing stats
34 tests in test_export.py covering all endpoints, auth, 404s, and helpers.
Updated test_routers.py to expect 401 (auth required) instead of 501 (stub).
This commit is contained in:
parent
32535a92ea
commit
e42117c8ee
4 changed files with 768 additions and 18 deletions
|
|
@ -41,7 +41,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
|
|||
- [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
|
||||
<!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
|
||||
|
||||
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
|
||||
- [x] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
|
||||
<!-- Completed: Full export router with 4 endpoints: /best (JSON with weighted score, metadata), /env (flattened KEY=VALUE with comments), /yaml (simple serializer, no PyYAML dependency), /report (markdown with config space, top N configs, score distributions, token usage, timing stats). Auth required on all endpoints. 34 tests in test_export.py, all passing. -->
|
||||
|
||||
- [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events).
|
||||
|
||||
|
|
|
|||
|
|
@ -1,31 +1,387 @@
|
|||
"""Export router — export experiment results in various formats."""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from fastapi import APIRouter, Response
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
|
||||
from sqlalchemy.orm import Session, joinedload
|
||||
|
||||
from auth import get_current_user
|
||||
from main import get_db
|
||||
from models import Experiment, Run, RunStatus, Score, StageResult, User
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/best", status_code=501)
|
||||
def export_best(experiment_id: uuid.UUID):
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _get_experiment_or_404(db: Session, experiment_id: uuid.UUID) -> Experiment:
    """Look up an experiment by ID, aborting the request with 404 if absent.

    Args:
        db: Session used to run the query.
        experiment_id: ID compared against ``Experiment.id``.

    Returns:
        The first matching ``Experiment``.

    Raises:
        HTTPException: 404 ("Experiment not found") when no row matches.
    """
    found = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")
|
||||
|
||||
|
||||
def _get_scoring_weights(experiment: Experiment) -> dict[str, float]:
    """Return the scorer weights configured on an experiment.

    Reads the ``"weights"`` entry of ``experiment.scoring_config``.

    Args:
        experiment: Object exposing a ``scoring_config`` attribute.

    Returns:
        The configured weights mapping, or an empty dict when the scoring
        config is missing, is not a dict, or its ``"weights"`` entry is not a
        dict (for example an explicit null).
    """
    config = experiment.scoring_config
    if not isinstance(config, dict):
        return {}
    weights = config.get("weights")
    # A stored null (or any non-mapping) must not leak out under a dict return type.
    return weights if isinstance(weights, dict) else {}
|
||||
|
||||
|
||||
def _compute_weighted_score(scores: list[Score], weights: dict[str, float]) -> float:
    """Compute the weighted score for one run's scores.

    Scores are keyed by ``scorer_name``; if a scorer appears more than once the
    last value wins. Scores whose value is ``None`` are ignored.

    Args:
        scores: Objects exposing ``scorer_name`` and ``value``.
        weights: Mapping of scorer name to weight; may be empty.

    Returns:
        The weight-normalised average over scorers that have a weight, when
        those weights sum to more than zero. Otherwise the plain mean of all
        score values. ``0.0`` when there are no usable scores.
    """
    score_map = {s.scorer_name: s.value for s in scores if s.value is not None}
    if not score_map:
        return 0.0

    if weights:
        applicable = {name: weights[name] for name in score_map if name in weights}
        total_weight = sum(applicable.values())
        if total_weight > 0:
            weighted_sum = sum(score_map[name] * weight for name, weight in applicable.items())
            return weighted_sum / total_weight

    # No weights, or none of them apply to these scorers: fall back to the plain mean.
    return sum(score_map.values()) / len(score_map)
|
||||
|
||||
|
||||
def _get_best_run(db: Session, experiment: Experiment) -> Run | None:
    """Return the completed run with the highest weighted score, or None.

    Only runs of ``experiment`` whose status is ``RunStatus.completed`` and
    that have at least one score are considered. When several runs tie, the
    first one returned by the query wins.

    Args:
        db: Session used to load the runs (scores are eager-loaded).
        experiment: Experiment whose runs and scoring weights are used.

    Returns:
        The best-scoring run, or ``None`` if no completed run has scores.
    """
    weights = _get_scoring_weights(experiment)
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment.id, Run.status == RunStatus.completed)
        .all()
    )

    scored_runs = [run for run in runs if run.scores]
    if not scored_runs:
        return None
    # max() instead of a numeric "-1.0" sentinel, so a run is still selected
    # when every weighted score is <= -1.
    return max(scored_runs, key=lambda run: _compute_weighted_score(run.scores, weights))
|
||||
|
||||
|
||||
def _build_best_config_payload(experiment: Experiment, run: Run, weights: dict[str, float]) -> dict:
    """Assemble the export payload (metadata plus config) for a run.

    Args:
        experiment: Supplies ``name`` and ``id``.
        run: Supplies ``id``, ``config``, ``config_hash`` and ``scores``.
        weights: Scorer weights used for the ``weighted_score`` entry.

    Returns:
        A dict with the experiment name/ID, the current UTC time in ISO
        format, the weighted score, a scorer-name to value mapping, and the
        run's ID, config hash and config.
    """
    per_scorer = {}
    for entry in run.scores:
        per_scorer[entry.scorer_name] = entry.value

    exported_at = datetime.now(timezone.utc).isoformat()
    weighted = _compute_weighted_score(run.scores, weights)

    return {
        "experiment_name": experiment.name,
        "experiment_id": str(experiment.id),
        "exported_at": exported_at,
        "weighted_score": weighted,
        "scores": per_scorer,
        "run_id": str(run.id),
        "config_hash": run.config_hash,
        "config": run.config,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Best Config — JSON
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/best")
def export_best(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Return the best run's config and metadata as JSON.

    Raises:
        HTTPException: 404 if the experiment does not exist, or if it has no
            completed runs with scores.
    """
    # The leftover stub `return Response(status_code=501, ...)` is removed:
    # it made everything below unreachable.
    experiment = _get_experiment_or_404(db, experiment_id)
    best_run = _get_best_run(db, experiment)
    if best_run is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )

    weights = _get_scoring_weights(experiment)
    return _build_best_config_payload(experiment, best_run, weights)
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/env", status_code=501)
|
||||
def export_env(experiment_id: uuid.UUID):
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Best Config — .env
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _flatten_dict(d: dict, prefix: str = "") -> dict[str, str]:
    """Flatten a nested dict into ``KEY -> value`` strings for .env output.

    Keys are upper-cased and nested keys are joined with ``_``. Any character
    outside ``A-Z``, ``0-9`` and ``_`` is replaced by ``_`` so the result is
    usable as an environment variable name.

    Value rendering:
        * dict  -> flattened recursively under ``KEY_``
        * list  -> JSON text
        * bool  -> ``true`` / ``false`` (matches how bools render inside JSON lists)
        * None  -> empty string
        * other -> ``str(value)``

    Args:
        d: Mapping to flatten.
        prefix: Text prepended to every key (used by the recursion).

    Returns:
        A flat mapping of variable name to string value.
    """
    items: dict[str, str] = {}
    for k, v in d.items():
        # Formatting via f-string also covers non-string keys, which have no .upper().
        raw_key = f"{prefix}{k}".upper()
        key = "".join(
            ch if (ch.isascii() and ch.isalnum()) or ch == "_" else "_"
            for ch in raw_key
        )
        if isinstance(v, dict):
            items.update(_flatten_dict(v, f"{key}_"))
        elif isinstance(v, list):
            items[key] = json.dumps(v)
        elif isinstance(v, bool):
            items[key] = "true" if v else "false"
        elif v is None:
            items[key] = ""
        else:
            items[key] = str(v)
    return items
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/env")
def export_env(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Return the best run's config as a ``text/plain`` .env snippet.

    The body starts with ``#`` comment lines (experiment name, export time,
    weighted score, run ID), followed by the flattened config as sorted
    ``KEY=value`` lines.

    Raises:
        HTTPException: 404 if the experiment does not exist, or if it has no
            completed runs with scores.
    """
    # The leftover stub `return Response(status_code=501, ...)` is removed:
    # it made everything below unreachable.
    experiment = _get_experiment_or_404(db, experiment_id)
    best_run = _get_best_run(db, experiment)
    if best_run is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )

    weights = _get_scoring_weights(experiment)
    payload = _build_best_config_payload(experiment, best_run, weights)

    lines = [
        f"# PromptLooper — Best config for: {experiment.name}",
        f"# Exported: {payload['exported_at']}",
        f"# Weighted score: {payload['weighted_score']:.4f}",
        f"# Run ID: {payload['run_id']}",
        "",
    ]

    # A run without a config still exports its metadata header instead of crashing.
    flat = _flatten_dict(payload["config"] or {})
    lines.extend(f"{key}={value}" for key, value in sorted(flat.items()))

    content = "\n".join(lines) + "\n"
    return Response(content=content, media_type="text/plain")
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/yaml", status_code=501)
|
||||
def export_yaml(experiment_id: uuid.UUID):
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Best Config — YAML
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Characters that give a plain (unquoted) YAML scalar special meaning when they lead it.
_YAML_LEADING_INDICATORS = "-?:,[]{}#&*!|>'\"%@`"
# Plain scalars that YAML parsers resolve to null/bool rather than to a string.
_YAML_RESERVED_WORDS = frozenset({"", "~", "null", "true", "false", "yes", "no", "on", "off"})
# Two spaces per nesting level (conventional YAML block indentation).
_YAML_INDENT = "  "


def _yaml_scalar(value) -> str:
    """Render one scalar as YAML text, quoting strings that would not round-trip."""
    if value is None:
        return "null"
    if isinstance(value, bool):  # checked before int: bool is an int subclass
        return "true" if value else "false"
    if isinstance(value, float):
        if value != value:
            return ".nan"
        if value in (float("inf"), float("-inf")):
            return ".inf" if value > 0 else "-.inf"
    if isinstance(value, (int, float)):
        return str(value)

    text = str(value)
    ambiguous = (
        text.lower() in _YAML_RESERVED_WORDS
        or text != text.strip()
        or text[0] in _YAML_LEADING_INDICATORS
        or any(ch in text for ch in ":#\n\r\t")
    )
    if not ambiguous:
        # Strings that look like numbers ("42", "1e5", "0x1f") would be loaded as numbers.
        for parse in (float, lambda s: int(s, 0)):
            try:
                parse(text)
            except ValueError:
                continue
            ambiguous = True
            break
    # A JSON string literal is also a valid YAML double-quoted scalar.
    return json.dumps(text, ensure_ascii=False) if ambiguous else text


def _dict_to_yaml(d: dict, indent: int = 0) -> str:
    """Serialize a config dict to block-style YAML (no external dependency).

    Nested dicts become indented mappings and lists become ``-`` sequences.
    Empty dicts and lists are written as ``{}`` and ``[]`` so they do not load
    as null. Bools and None are written as ``true`` / ``false`` / ``null``,
    and strings that YAML would read as another type or as syntax are
    double-quoted. Lists nested directly inside lists are written in JSON
    flow style.

    Args:
        d: Mapping to serialize.
        indent: Nesting depth of ``d``; each level adds two spaces.

    Returns:
        The YAML text without a trailing newline.
    """
    lines: list[str] = []
    pad = _YAML_INDENT * indent
    for key, value in d.items():
        label = f"{pad}{_yaml_scalar(key)}"
        if isinstance(value, dict):
            if value:
                lines.append(f"{label}:")
                lines.append(_dict_to_yaml(value, indent + 1))
            else:
                lines.append(f"{label}: {{}}")
        elif isinstance(value, list):
            if not value:
                lines.append(f"{label}: []")
                continue
            lines.append(f"{label}:")
            for item in value:
                if isinstance(item, dict) and item:
                    lines.append(f"{pad}{_YAML_INDENT}-")
                    lines.append(_dict_to_yaml(item, indent + 2))
                elif isinstance(item, (dict, list)):
                    lines.append(f"{pad}{_YAML_INDENT}- {json.dumps(item)}")
                else:
                    lines.append(f"{pad}{_YAML_INDENT}- {_yaml_scalar(item)}")
        else:
            lines.append(f"{label}: {_yaml_scalar(value)}")
    return "\n".join(lines)
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/yaml")
def export_yaml(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Return the best run's config and metadata as a ``text/yaml`` document.

    The body starts with ``#`` comment lines (experiment name, export time,
    weighted score, run ID), followed by the full payload serialized with
    ``_dict_to_yaml``.

    Raises:
        HTTPException: 404 if the experiment does not exist, or if it has no
            completed runs with scores.
    """
    # The leftover stub `return Response(status_code=501, ...)` is removed:
    # it made everything below unreachable.
    experiment = _get_experiment_or_404(db, experiment_id)
    best_run = _get_best_run(db, experiment)
    if best_run is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )

    weights = _get_scoring_weights(experiment)
    payload = _build_best_config_payload(experiment, best_run, weights)

    header = (
        f"# PromptLooper — Best config for: {experiment.name}\n"
        f"# Exported: {payload['exported_at']}\n"
        f"# Weighted score: {payload['weighted_score']:.4f}\n"
        f"# Run ID: {payload['run_id']}\n\n"
    )
    content = header + _dict_to_yaml(payload) + "\n"
    return Response(content=content, media_type="text/yaml")
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/report", status_code=501)
|
||||
def export_report(experiment_id: uuid.UUID):
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Report — Markdown
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@router.get("/experiments/{experiment_id}/report")
def export_report(
    experiment_id: uuid.UUID,
    top_n: int = Query(5, ge=1, le=50),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Return a full Markdown report for an experiment.

    Sections, in order: header metadata, configuration space, run summary,
    top-N configurations (with detail for the best one), per-scorer score
    distributions, token usage and timing.

    Args:
        experiment_id: Experiment to report on.
        top_n: Number of ranked configurations to list (1-50, default 5).

    Raises:
        HTTPException: 404 if the experiment does not exist. An experiment
            without runs still yields a report.
    """
    # The leftover stub `return Response(status_code=501, ...)` is removed:
    # it made everything below unreachable.
    experiment = _get_experiment_or_404(db, experiment_id)

    runs = (
        db.query(Run)
        .options(joinedload(Run.scores), joinedload(Run.stage_results))
        .filter(Run.experiment_id == experiment_id)
        .all()
    )

    weights = _get_scoring_weights(experiment)
    completed = [r for r in runs if r.status == RunStatus.completed]
    failed = [r for r in runs if r.status == RunStatus.failed]

    # Completed runs that have scores, ranked best-first by weighted score.
    scored_entries: list[tuple[Run, float]] = [
        (run, _compute_weighted_score(run.scores, weights))
        for run in completed
        if run.scores
    ]
    scored_entries.sort(key=lambda e: e[1], reverse=True)

    # Per-scorer value lists for the distribution table.
    score_values: dict[str, list[float]] = {}
    for run in completed:
        for s in run.scores:
            bucket = score_values.setdefault(s.scorer_name, [])
            if s.value is not None:  # a missing value cannot take part in min/max/mean
                bucket.append(s.value)

    # Tokens are totalled over all runs; durations only over completed runs.
    total_tokens_in = sum(r.tokens_in or 0 for r in runs)
    total_tokens_out = sum(r.tokens_out or 0 for r in runs)
    durations = [r.duration_ms for r in completed if r.duration_ms is not None]

    now = datetime.now(timezone.utc).isoformat()
    # Markdown needs two trailing spaces for a hard line break; with a single
    # space the metadata lines collapse into one paragraph.
    hard_break = "  "

    lines: list[str] = [
        f"# Experiment Report: {experiment.name}",
        "",
        f"**Generated:** {now}{hard_break}",
        f"**Experiment ID:** `{experiment.id}`{hard_break}",
    ]
    if experiment.description:
        lines.append(f"**Description:** {experiment.description}{hard_break}")
    status_label = getattr(experiment.status, "value", experiment.status)
    lines.append(f"**Status:** {status_label}{hard_break}")
    lines.append("")

    # Config space
    lines.append("## Configuration Space")
    lines.append("")
    if experiment.parameter_space:
        lines.append("```json")
        lines.append(json.dumps(experiment.parameter_space, indent=2))
        lines.append("```")
    else:
        lines.append("_No parameter space defined._")
    lines.append("")

    # Run summary
    lines.extend(
        [
            "## Run Summary",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Total runs | {len(runs)} |",
            f"| Completed | {len(completed)} |",
            f"| Failed | {len(failed)} |",
            f"| Scored | {len(scored_entries)} |",
            "",
        ]
    )

    # Top N configs
    lines.append(f"## Top {min(top_n, len(scored_entries))} Configurations")
    lines.append("")
    if scored_entries:
        lines.append("| Rank | Run ID | Weighted Score | Config Hash |")
        lines.append("|------|--------|---------------|-------------|")
        for rank, (run, ws) in enumerate(scored_entries[:top_n], 1):
            short_hash = (run.config_hash or "")[:12]  # tolerate a run without a hash
            lines.append(f"| {rank} | `{str(run.id)[:8]}...` | {ws:.4f} | `{short_hash}...` |")
        lines.append("")

        # Detail for top entry
        best_run = scored_entries[0][0]
        lines.append("### Best Configuration Detail")
        lines.append("")
        lines.append("```json")
        lines.append(json.dumps(best_run.config, indent=2))
        lines.append("```")
        lines.append("")

        lines.append("**Scores:**")
        lines.append("")
        for s in best_run.scores:
            if s.value is not None:
                lines.append(f"- **{s.scorer_name}:** {s.value:.4f}")
        lines.append("")
    else:
        lines.append("_No scored runs available._")
        lines.append("")

    # Score distributions
    if score_values:
        lines.append("## Score Distributions")
        lines.append("")
        lines.append("| Scorer | Min | Max | Mean | Count |")
        lines.append("|--------|-----|-----|------|-------|")
        for name in sorted(score_values):
            vals = score_values[name]
            if vals:
                lines.append(
                    f"| {name} | {min(vals):.4f} | {max(vals):.4f} | "
                    f"{sum(vals)/len(vals):.4f} | {len(vals)} |"
                )
        lines.append("")

    # Token usage
    lines.extend(
        [
            "## Token Usage",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Total tokens in | {total_tokens_in:,} |",
            f"| Total tokens out | {total_tokens_out:,} |",
            f"| Total tokens | {total_tokens_in + total_tokens_out:,} |",
            "",
        ]
    )

    # Timing stats
    lines.append("## Timing")
    lines.append("")
    if durations:
        avg_ms = sum(durations) / len(durations)
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        lines.append(f"| Fastest run | {min(durations):,} ms |")
        lines.append(f"| Slowest run | {max(durations):,} ms |")
        lines.append(f"| Average | {avg_ms:,.0f} ms |")
        lines.append(f"| Total time | {sum(durations):,} ms |")
    else:
        lines.append("_No timing data available._")
    lines.append("")

    content = "\n".join(lines)
    return Response(content=content, media_type="text/markdown")
|
||||
|
|
|
|||
393
backend/tests/test_export.py
Normal file
393
backend/tests/test_export.py
Normal file
|
|
@ -0,0 +1,393 @@
|
|||
"""Tests for backend/routers/export.py — Export best config (JSON, .env, YAML) and report."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
JWT_SECRET = "test-secret-key-for-jwt-signing"
|
||||
API_KEY = "test-api-key-12345"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _isolate_settings(tmp_path):
    """Point every test at a temp SQLite DB with no Redis.

    Patches the environment, rebuilds the settings object from it, and
    installs that object on the ``config``, ``main`` and ``auth`` modules
    before re-initialising the DB/Redis handles and creating the tables.
    """
    overrides = {
        "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}",
        "REDIS_URL": "",
        "DATA_DIR": str(tmp_path),
        "JWT_SECRET": JWT_SECRET,
        "API_KEY": API_KEY,
    }
    with patch.dict(os.environ, overrides, clear=False):
        # Imports stay inside the patched environment so modules see the overrides.
        import config

        test_settings = config.Settings(_env_file=None)
        config.settings = test_settings

        import main

        main.settings = test_settings
        main._init_db()
        main._init_redis()

        from models import Base

        Base.metadata.create_all(bind=main.engine)

        import auth

        auth.settings = test_settings

        yield
|
||||
|
||||
|
||||
@pytest.fixture
def db_session():
    """Yield a session obtained from ``main.get_db``.

    After the test, the ``get_db`` generator is advanced once more so that
    whatever it does after its ``yield`` gets a chance to run.
    """
    from main import get_db

    session_gen = get_db()
    yield next(session_gen)
    try:
        next(session_gen)
    except StopIteration:
        pass
|
||||
|
||||
|
||||
@pytest.fixture
def admin_user(db_session):
    """Persist and return an admin ``User`` named "admin"."""
    from auth import hash_password
    from models import User

    admin = User(
        username="admin",
        password_hash=hash_password("adminpass"),
        is_admin=True,
    )
    db_session.add(admin)
    db_session.commit()
    db_session.refresh(admin)
    return admin
|
||||
|
||||
|
||||
@pytest.fixture
def project(db_session, admin_user):
    """Persist and return a ``Project`` owned by the admin user."""
    from models import Project

    test_project = Project(
        name="Test Project",
        description="A test project",
        owner_id=admin_user.id,
    )
    db_session.add(test_project)
    db_session.commit()
    db_session.refresh(test_project)
    return test_project
|
||||
|
||||
|
||||
@pytest.fixture
def experiment(db_session, project):
    """Persist and return an ``Experiment`` with scorer weights and a parameter space."""
    from models import Experiment

    scoring = {"weights": {"accuracy": 0.7, "fluency": 0.3}}
    space = {"temperature": [0.1, 0.5, 0.9], "model": ["gpt-4", "gpt-3.5"]}
    test_experiment = Experiment(
        name="Test Experiment",
        description="An experiment for testing exports",
        project_id=project.id,
        scoring_config=scoring,
        parameter_space=space,
    )
    db_session.add(test_experiment)
    db_session.commit()
    db_session.refresh(test_experiment)
    return test_experiment
|
||||
|
||||
|
||||
@pytest.fixture
def completed_runs(db_session, experiment):
    """Create 3 completed runs with scores.

    Run ``i`` gets ``config_hash`` ``hash_00i``, a duration of
    ``1000 + 500*i`` ms, and ``100 + 50*i`` / ``200 + 100*i`` tokens in/out.
    """
    from models import Run, RunStatus, Score

    # (config, [(scorer_name, value), ...]) for each run, in creation order.
    specs = [
        ({"model": "gpt-4", "temperature": 0.1}, [("accuracy", 0.95), ("fluency", 0.80)]),
        ({"model": "gpt-4", "temperature": 0.5}, [("accuracy", 0.85), ("fluency", 0.90)]),
        ({"model": "gpt-3.5", "temperature": 0.9}, [("accuracy", 0.70), ("fluency", 0.60)]),
    ]

    created = []
    for index, (run_config, run_scores) in enumerate(specs):
        run = Run(
            experiment_id=experiment.id,
            config=run_config,
            config_hash=f"hash_{index:03d}",
            status=RunStatus.completed,
            duration_ms=1000 + index * 500,
            tokens_in=100 + index * 50,
            tokens_out=200 + index * 100,
        )
        db_session.add(run)
        # Flush so run.id is available for the Score rows below.
        db_session.flush()

        for scorer_name, value in run_scores:
            db_session.add(Score(run_id=run.id, scorer_name=scorer_name, value=value))

        created.append(run)

    db_session.commit()
    for run in created:
        db_session.refresh(run)
    return created
|
||||
|
||||
|
||||
@pytest.fixture
def auth_header():
    """Request headers carrying the test API key."""
    headers = {"X-Api-Key": API_KEY}
    return headers
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """A ``TestClient`` wrapping the application from ``main``."""
    import main

    return TestClient(main.app)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Best — JSON
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExportBest:
    """Tests for GET /api/export/experiments/{id}/best (JSON export)."""

    def test_returns_best_config_json(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header)
        assert response.status_code == 200
        payload = response.json()
        assert payload["experiment_name"] == "Test Experiment"
        assert payload["config"]["model"] == "gpt-4"
        assert payload["config"]["temperature"] == 0.1
        assert payload["weighted_score"] > 0
        for field in ("run_id", "config_hash", "exported_at"):
            assert field in payload

    def test_best_uses_weighted_scores(self, client, auth_header, experiment, completed_runs):
        """Run 0 has accuracy=0.95, fluency=0.80. With weights 0.7/0.3, score = (0.95*0.7 + 0.80*0.3)/1.0 = 0.905."""
        response = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header)
        payload = response.json()
        assert abs(payload["weighted_score"] - 0.905) < 0.001

    def test_best_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/best", headers=auth_header)
        assert response.status_code == 404

    def test_best_404_no_completed_runs(self, client, auth_header, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header)
        assert response.status_code == 404
        assert "No completed runs" in response.json()["detail"]

    def test_best_requires_auth(self, client, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/best")
        assert response.status_code in (401, 403)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Best — .env
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExportEnv:
    """Tests for GET /api/export/experiments/{id}/env (.env export)."""

    def test_returns_env_format(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header)
        assert response.status_code == 200
        assert response.headers["content-type"] == "text/plain; charset=utf-8"
        body = response.text
        assert "# PromptLooper" in body
        assert "MODEL=" in body
        assert "TEMPERATURE=" in body

    def test_env_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header)
        body = response.text
        assert "Test Experiment" in body
        assert "Weighted score" in body

    def test_env_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/env", headers=auth_header)
        assert response.status_code == 404

    def test_env_404_no_runs(self, client, auth_header, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header)
        assert response.status_code == 404

    def test_env_requires_auth(self, client, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/env")
        assert response.status_code in (401, 403)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Best — YAML
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExportYaml:
    """Tests for GET /api/export/experiments/{id}/yaml (YAML export)."""

    def test_returns_yaml_format(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header)
        assert response.status_code == 200
        assert "text/yaml" in response.headers["content-type"]
        body = response.text
        assert "experiment_name: Test Experiment" in body
        assert "config:" in body

    def test_yaml_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header)
        body = response.text
        assert "# PromptLooper" in body
        assert "# Weighted score" in body

    def test_yaml_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/yaml", headers=auth_header)
        assert response.status_code == 404

    def test_yaml_404_no_runs(self, client, auth_header, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header)
        assert response.status_code == 404

    def test_yaml_requires_auth(self, client, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/yaml")
        assert response.status_code in (401, 403)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Export Report — Markdown
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExportReport:
    """Tests for GET /api/export/experiments/{id}/report (Markdown report)."""

    def test_returns_markdown_report(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        assert response.status_code == 200
        assert "text/markdown" in response.headers["content-type"]
        report = response.text
        assert "# Experiment Report: Test Experiment" in report

    def test_report_contains_config_space(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "## Configuration Space" in report
        assert "temperature" in report

    def test_report_contains_top_configs(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "## Top" in report
        assert "Weighted Score" in report

    def test_report_contains_score_distributions(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "## Score Distributions" in report
        assert "accuracy" in report
        assert "fluency" in report

    def test_report_contains_token_usage(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "## Token Usage" in report
        assert "Total tokens in" in report

    def test_report_contains_timing(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "## Timing" in report
        assert "Fastest run" in report

    def test_report_run_summary(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "## Run Summary" in report
        assert "Total runs" in report
        assert "Completed" in report

    def test_report_custom_top_n(self, client, auth_header, experiment, completed_runs):
        response = client.get(
            f"/api/export/experiments/{experiment.id}/report?top_n=2",
            headers=auth_header,
        )
        assert response.status_code == 200
        report = response.text
        assert "## Top 2 Configurations" in report

    def test_report_empty_experiment(self, client, auth_header, experiment):
        """Report should work even with no runs."""
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        assert response.status_code == 200
        report = response.text
        assert "Total runs | 0" in report
        assert "_No scored runs available._" in report

    def test_report_404_no_experiment(self, client, auth_header, admin_user):
        missing_id = uuid.uuid4()
        response = client.get(f"/api/export/experiments/{missing_id}/report", headers=auth_header)
        assert response.status_code == 404

    def test_report_requires_auth(self, client, experiment):
        response = client.get(f"/api/export/experiments/{experiment.id}/report")
        assert response.status_code in (401, 403)

    def test_report_with_failed_runs(self, client, auth_header, experiment, completed_runs, db_session):
        """Report should count failed runs separately."""
        from models import Run, RunStatus

        failed_run = Run(
            experiment_id=experiment.id,
            config={"model": "bad", "temperature": 0.5},
            config_hash="hash_fail",
            status=RunStatus.failed,
        )
        db_session.add(failed_run)
        db_session.commit()

        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "Total runs | 4" in report
        assert "Failed | 1" in report

    def test_report_description_shown(self, client, auth_header, experiment, completed_runs):
        response = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        report = response.text
        assert "An experiment for testing exports" in report
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper function tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHelpers:
    """Unit tests for the private serialisation helpers in ``routers.export``."""

    def test_flatten_dict_simple(self):
        from routers.export import _flatten_dict

        flat = _flatten_dict({"model": "gpt-4", "temperature": 0.5})
        assert flat == {"MODEL": "gpt-4", "TEMPERATURE": "0.5"}

    def test_flatten_dict_nested(self):
        from routers.export import _flatten_dict

        flat = _flatten_dict({"llm": {"model": "gpt-4", "temp": 0.1}})
        assert flat == {"LLM_MODEL": "gpt-4", "LLM_TEMP": "0.1"}

    def test_flatten_dict_list(self):
        from routers.export import _flatten_dict

        flat = _flatten_dict({"tags": ["a", "b"]})
        assert flat == {"TAGS": '["a", "b"]'}

    def test_dict_to_yaml_simple(self):
        from routers.export import _dict_to_yaml

        text = _dict_to_yaml({"name": "test", "value": 42})
        assert "name: test" in text
        assert "value: 42" in text

    def test_dict_to_yaml_nested(self):
        from routers.export import _dict_to_yaml

        text = _dict_to_yaml({"config": {"model": "gpt-4"}})
        assert "config:" in text
        assert " model: gpt-4" in text

    def test_dict_to_yaml_bool_and_none(self):
        from routers.export import _dict_to_yaml

        text = _dict_to_yaml({"enabled": True, "disabled": False, "empty": None})
        assert "enabled: true" in text
        assert "disabled: false" in text
        assert "empty: null" in text
|
||||
|
|
@ -174,22 +174,22 @@ def test_endpoints_test(client):
|
|||
|
||||
def test_export_best(client):
    """The /best export endpoint rejects unauthenticated requests with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/best")
    # The old `== 501` stub assertion is dropped: it contradicts the 401 check.
    assert resp.status_code == 401
|
||||
|
||||
|
||||
def test_export_env(client):
    """The /env export endpoint rejects unauthenticated requests with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/env")
    # The old `== 501` stub assertion is dropped: it contradicts the 401 check.
    assert resp.status_code == 401
|
||||
|
||||
|
||||
def test_export_yaml(client):
    """The /yaml export endpoint rejects unauthenticated requests with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/yaml")
    # The old `== 501` stub assertion is dropped: it contradicts the 401 check.
    assert resp.status_code == 401
|
||||
|
||||
|
||||
def test_export_report(client):
    """The /report export endpoint rejects unauthenticated requests with 401."""
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/report")
    # The old `== 501` stub assertion is dropped: it contradicts the 401 check.
    assert resp.status_code == 401
|
||||
|
||||
|
||||
# ---- Webhooks router (/api/webhooks) ----
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue