From e42117c8ee9acb9b9e257e2663b45f64701efb5b Mon Sep 17 00:00:00 2001 From: John Lightner Date: Tue, 7 Apr 2026 03:30:45 -0500 Subject: [PATCH] MAESTRO: Implement export router with JSON, .env, YAML, and markdown report endpoints Four fully authenticated endpoints at /api/export/experiments/{id}/: - /best: Returns best config as JSON with weighted score and metadata - /env: Flattened KEY=VALUE format with metadata comments - /yaml: Simple YAML serialization (no external dependency) - /report: Full markdown report with config space, top N configs, score distributions, token usage, and timing stats 34 tests in test_export.py covering all endpoints, auth, 404s, and helpers. Updated test_routers.py to expect 401 (auth required) instead of 501 (stub). --- Auto Run Docs/02a-backend-engine.md | 3 +- backend/routers/export.py | 382 ++++++++++++++++++++++++++- backend/tests/test_export.py | 393 ++++++++++++++++++++++++++++ backend/tests/test_routers.py | 8 +- 4 files changed, 768 insertions(+), 18 deletions(-) create mode 100644 backend/tests/test_export.py diff --git a/Auto Run Docs/02a-backend-engine.md b/Auto Run Docs/02a-backend-engine.md index 75547c1..bcf1207 100644 --- a/Auto Run Docs/02a-backend-engine.md +++ b/Auto Run Docs/02a-backend-engine.md @@ -41,7 +41,8 @@ Implement the core experiment execution engine: LLM adapters, response caching, - [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score. -- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats. +- [x] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats. + - [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events). diff --git a/backend/routers/export.py b/backend/routers/export.py index 5cc04c2..a84825e 100644 --- a/backend/routers/export.py +++ b/backend/routers/export.py @@ -1,31 +1,387 @@ """Export router — export experiment results in various formats.""" +import json import uuid +from datetime import datetime, timezone -from fastapi import APIRouter, Response +from fastapi import APIRouter, Depends, HTTPException, Query, Response, status +from sqlalchemy.orm import Session, joinedload + +from auth import get_current_user +from main import get_db +from models import Experiment, Run, RunStatus, Score, StageResult, User router = APIRouter() -@router.get("/experiments/{experiment_id}/best", status_code=501) -def export_best(experiment_id: uuid.UUID): +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _get_experiment_or_404(db: Session, experiment_id: uuid.UUID) -> Experiment: + experiment = db.query(Experiment).filter(Experiment.id == experiment_id).first() + if experiment is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found") + return experiment + + +def _get_scoring_weights(experiment: Experiment) -> dict[str, float]: + weights: dict[str, float] = {} + if experiment.scoring_config and isinstance(experiment.scoring_config, dict): + weights = experiment.scoring_config.get("weights", {}) + return weights + + +def _compute_weighted_score(scores: list[Score], weights: dict[str, float]) -> float: + """Compute weighted score for a run's scores.""" + if not scores: + return 0.0 + + score_map: dict[str, float] = {} + for s in scores: + score_map[s.scorer_name] = s.value + + if weights: + total_weight = sum(weights.get(name, 0.0) for name in score_map) + if total_weight > 0: + return sum( + score_map[name] * weights.get(name, 0.0) + for name in score_map + if name in weights + ) / total_weight + else: + return sum(score_map.values()) / len(score_map) + else: + return sum(score_map.values()) / len(score_map) + + +def _get_best_run(db: Session, experiment: Experiment) -> Run | None: + """Return the best completed run by weighted score, or None.""" + weights = _get_scoring_weights(experiment) + runs = ( + db.query(Run) + .options(joinedload(Run.scores)) + .filter(Run.experiment_id == experiment.id, Run.status == RunStatus.completed) + .all() + ) + + best_run = None + best_score = -1.0 + for run in runs: + if not run.scores: + continue + ws = _compute_weighted_score(run.scores, weights) + if ws > best_score: + best_score = ws + best_run = run + + return best_run + + +def _build_best_config_payload(experiment: Experiment, run: Run, weights: dict[str, float]) -> dict: + """Build the metadata+config dict for the best run.""" + score_map = {s.scorer_name: s.value for s in run.scores} + return { + "experiment_name": experiment.name, + "experiment_id": str(experiment.id), + "exported_at": datetime.now(timezone.utc).isoformat(), + "weighted_score": _compute_weighted_score(run.scores, weights), + "scores": score_map, + "run_id": str(run.id), + "config_hash": run.config_hash, + "config": run.config, + } + + +# --------------------------------------------------------------------------- +# Export Best Config — JSON +# --------------------------------------------------------------------------- + + +@router.get("/experiments/{experiment_id}/best") +def export_best( + experiment_id: uuid.UUID, + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +): """Best config as JSON.""" - return Response(status_code=501, content="Not Implemented") + experiment = _get_experiment_or_404(db, experiment_id) + best_run = _get_best_run(db, experiment) + if best_run is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No completed runs with scores found", + ) + + weights = _get_scoring_weights(experiment) + payload = _build_best_config_payload(experiment, best_run, weights) + return payload -@router.get("/experiments/{experiment_id}/env", status_code=501) -def export_env(experiment_id: uuid.UUID): +# --------------------------------------------------------------------------- +# Export Best Config — .env +# --------------------------------------------------------------------------- + + +def _flatten_dict(d: dict, prefix: str = "") -> dict[str, str]: + """Flatten nested dict into KEY=value pairs for .env format.""" + items: dict[str, str] = {} + for k, v in d.items(): + key = f"{prefix}{k}".upper() if prefix else k.upper() + if isinstance(v, dict): + items.update(_flatten_dict(v, f"{key}_")) + elif isinstance(v, list): + items[key] = json.dumps(v) + else: + items[key] = str(v) + return items + + +@router.get("/experiments/{experiment_id}/env") +def export_env( + experiment_id: uuid.UUID, + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +): """Best config as .env snippet.""" - return Response(status_code=501, content="Not Implemented") + experiment = _get_experiment_or_404(db, experiment_id) + best_run = _get_best_run(db, experiment) + if best_run is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No completed runs with scores found", + ) + + weights = _get_scoring_weights(experiment) + payload = _build_best_config_payload(experiment, best_run, weights) + + lines = [ + f"# PromptLooper — Best config for: {experiment.name}", + f"# Exported: {payload['exported_at']}", + f"# Weighted score: {payload['weighted_score']:.4f}", + f"# Run ID: {payload['run_id']}", + "", + ] + + flat = _flatten_dict(payload["config"]) + for key, value in sorted(flat.items()): + lines.append(f"{key}={value}") + + content = "\n".join(lines) + "\n" + return Response(content=content, media_type="text/plain") -@router.get("/experiments/{experiment_id}/yaml", status_code=501) -def export_yaml(experiment_id: uuid.UUID): +# --------------------------------------------------------------------------- +# Export Best Config — YAML +# --------------------------------------------------------------------------- + + +def _dict_to_yaml(d: dict, indent: int = 0) -> str: + """Simple YAML serializer for config dicts (no external dependency).""" + lines: list[str] = [] + prefix = " " * indent + for k, v in d.items(): + if isinstance(v, dict): + lines.append(f"{prefix}{k}:") + lines.append(_dict_to_yaml(v, indent + 1)) + elif isinstance(v, list): + lines.append(f"{prefix}{k}:") + for item in v: + if isinstance(item, dict): + lines.append(f"{prefix} -") + lines.append(_dict_to_yaml(item, indent + 2)) + else: + lines.append(f"{prefix} - {item}") + elif isinstance(v, bool): + lines.append(f"{prefix}{k}: {'true' if v else 'false'}") + elif v is None: + lines.append(f"{prefix}{k}: null") + else: + lines.append(f"{prefix}{k}: {v}") + return "\n".join(lines) + + +@router.get("/experiments/{experiment_id}/yaml") +def export_yaml( + experiment_id: uuid.UUID, + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +): """Best config as YAML.""" - return Response(status_code=501, content="Not Implemented") + experiment = _get_experiment_or_404(db, experiment_id) + best_run = _get_best_run(db, experiment) + if best_run is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No completed runs with scores found", + ) + + weights = _get_scoring_weights(experiment) + payload = _build_best_config_payload(experiment, best_run, weights) + + header = ( + f"# PromptLooper — Best config for: {experiment.name}\n" + f"# Exported: {payload['exported_at']}\n" + f"# Weighted score: {payload['weighted_score']:.4f}\n" + f"# Run ID: {payload['run_id']}\n\n" + ) + content = header + _dict_to_yaml(payload) + "\n" + return Response(content=content, media_type="text/yaml") -@router.get("/experiments/{experiment_id}/report", status_code=501) -def export_report(experiment_id: uuid.UUID): +# --------------------------------------------------------------------------- +# Export Report — Markdown +# --------------------------------------------------------------------------- + + +@router.get("/experiments/{experiment_id}/report") +def export_report( + experiment_id: uuid.UUID, + top_n: int = Query(5, ge=1, le=50), + db: Session = Depends(get_db), + _user: User = Depends(get_current_user), +): """Full experiment report (markdown).""" - return Response(status_code=501, content="Not Implemented") + experiment = _get_experiment_or_404(db, experiment_id) + + runs = ( + db.query(Run) + .options(joinedload(Run.scores), joinedload(Run.stage_results)) + .filter(Run.experiment_id == experiment_id) + .all() + ) + + weights = _get_scoring_weights(experiment) + completed = [r for r in runs if r.status == RunStatus.completed] + failed = [r for r in runs if r.status == RunStatus.failed] + + # Compute scored entries + scored_entries: list[tuple[Run, float]] = [] + for run in completed: + if run.scores: + ws = _compute_weighted_score(run.scores, weights) + scored_entries.append((run, ws)) + scored_entries.sort(key=lambda e: e[1], reverse=True) + + # Collect all scorer names + all_scorer_names: set[str] = set() + for run in completed: + for s in run.scores: + all_scorer_names.add(s.scorer_name) + + # Score distributions per scorer + score_values: dict[str, list[float]] = {name: [] for name in sorted(all_scorer_names)} + for run in completed: + for s in run.scores: + score_values[s.scorer_name].append(s.value) + + # Token and timing stats + total_tokens_in = sum(r.tokens_in or 0 for r in runs) + total_tokens_out = sum(r.tokens_out or 0 for r in runs) + durations = [r.duration_ms for r in completed if r.duration_ms is not None] + + now = datetime.now(timezone.utc).isoformat() + + lines: list[str] = [] + lines.append(f"# Experiment Report: {experiment.name}") + lines.append("") + lines.append(f"**Generated:** {now} ") + lines.append(f"**Experiment ID:** `{experiment.id}` ") + if experiment.description: + lines.append(f"**Description:** {experiment.description} ") + lines.append(f"**Status:** {experiment.status.value if hasattr(experiment.status, 'value') else experiment.status} ") + lines.append("") + + # Config space + lines.append("## Configuration Space") + lines.append("") + if experiment.parameter_space: + lines.append("```json") + lines.append(json.dumps(experiment.parameter_space, indent=2)) + lines.append("```") + else: + lines.append("_No parameter space defined._") + lines.append("") + + # Run summary + lines.append("## Run Summary") + lines.append("") + lines.append(f"| Metric | Value |") + lines.append(f"|--------|-------|") + lines.append(f"| Total runs | {len(runs)} |") + lines.append(f"| Completed | {len(completed)} |") + lines.append(f"| Failed | {len(failed)} |") + lines.append(f"| Scored | {len(scored_entries)} |") + lines.append("") + + # Top N configs + lines.append(f"## Top {min(top_n, len(scored_entries))} Configurations") + lines.append("") + if scored_entries: + lines.append("| Rank | Run ID | Weighted Score | Config Hash |") + lines.append("|------|--------|---------------|-------------|") + for i, (run, ws) in enumerate(scored_entries[:top_n], 1): + lines.append(f"| {i} | `{str(run.id)[:8]}...` | {ws:.4f} | `{run.config_hash[:12]}...` |") + lines.append("") + + # Detail for top entry + best_run, best_score = scored_entries[0] + lines.append("### Best Configuration Detail") + lines.append("") + lines.append("```json") + lines.append(json.dumps(best_run.config, indent=2)) + lines.append("```") + lines.append("") + + lines.append("**Scores:**") + lines.append("") + for s in best_run.scores: + lines.append(f"- **{s.scorer_name}:** {s.value:.4f}") + lines.append("") + else: + lines.append("_No scored runs available._") + lines.append("") + + # Score distributions + if score_values: + lines.append("## Score Distributions") + lines.append("") + lines.append("| Scorer | Min | Max | Mean | Count |") + lines.append("|--------|-----|-----|------|-------|") + for name in sorted(score_values.keys()): + vals = score_values[name] + if vals: + lines.append( + f"| {name} | {min(vals):.4f} | {max(vals):.4f} | " + f"{sum(vals)/len(vals):.4f} | {len(vals)} |" + ) + lines.append("") + + # Token usage + lines.append("## Token Usage") + lines.append("") + lines.append(f"| Metric | Value |") + lines.append(f"|--------|-------|") + lines.append(f"| Total tokens in | {total_tokens_in:,} |") + lines.append(f"| Total tokens out | {total_tokens_out:,} |") + lines.append(f"| Total tokens | {total_tokens_in + total_tokens_out:,} |") + lines.append("") + + # Timing stats + lines.append("## Timing") + lines.append("") + if durations: + avg_ms = sum(durations) / len(durations) + lines.append(f"| Metric | Value |") + lines.append(f"|--------|-------|") + lines.append(f"| Fastest run | {min(durations):,} ms |") + lines.append(f"| Slowest run | {max(durations):,} ms |") + lines.append(f"| Average | {avg_ms:,.0f} ms |") + lines.append(f"| Total time | {sum(durations):,} ms |") + else: + lines.append("_No timing data available._") + lines.append("") + + content = "\n".join(lines) + return Response(content=content, media_type="text/markdown") diff --git a/backend/tests/test_export.py b/backend/tests/test_export.py new file mode 100644 index 0000000..fe861b3 --- /dev/null +++ b/backend/tests/test_export.py @@ -0,0 +1,393 @@ +"""Tests for backend/routers/export.py — Export best config (JSON, .env, YAML) and report.""" + +import json +import os +import uuid +from datetime import datetime, timezone +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient + + +JWT_SECRET = "test-secret-key-for-jwt-signing" +API_KEY = "test-api-key-12345" + + +@pytest.fixture(autouse=True) +def _isolate_settings(tmp_path): + """Ensure tests use a temp SQLite DB and no Redis.""" + env = { + "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}", + "REDIS_URL": "", + "DATA_DIR": str(tmp_path), + "JWT_SECRET": JWT_SECRET, + "API_KEY": API_KEY, + } + with patch.dict(os.environ, env, clear=False): + import config + new_settings = config.Settings(_env_file=None) + config.settings = new_settings + + import main + main.settings = new_settings + main._init_db() + main._init_redis() + + from models import Base + Base.metadata.create_all(bind=main.engine) + + import auth + auth.settings = new_settings + + yield + + +@pytest.fixture +def db_session(): + from main import get_db + gen = get_db() + session = next(gen) + yield session + try: + next(gen) + except StopIteration: + pass + + +@pytest.fixture +def admin_user(db_session): + from auth import hash_password + from models import User + user = User(username="admin", password_hash=hash_password("adminpass"), is_admin=True) + db_session.add(user) + db_session.commit() + db_session.refresh(user) + return user + + +@pytest.fixture +def project(db_session, admin_user): + from models import Project + proj = Project(name="Test Project", description="A test project", owner_id=admin_user.id) + db_session.add(proj) + db_session.commit() + db_session.refresh(proj) + return proj + + +@pytest.fixture +def experiment(db_session, project): + from models import Experiment + exp = Experiment( + name="Test Experiment", + description="An experiment for testing exports", + project_id=project.id, + scoring_config={"weights": {"accuracy": 0.7, "fluency": 0.3}}, + parameter_space={"temperature": [0.1, 0.5, 0.9], "model": ["gpt-4", "gpt-3.5"]}, + ) + db_session.add(exp) + db_session.commit() + db_session.refresh(exp) + return exp + + +@pytest.fixture +def completed_runs(db_session, experiment): + """Create 3 completed runs with scores.""" + from models import Run, RunStatus, Score + + runs = [] + configs = [ + {"model": "gpt-4", "temperature": 0.1}, + {"model": "gpt-4", "temperature": 0.5}, + {"model": "gpt-3.5", "temperature": 0.9}, + ] + scores_data = [ + [("accuracy", 0.95), ("fluency", 0.80)], + [("accuracy", 0.85), ("fluency", 0.90)], + [("accuracy", 0.70), ("fluency", 0.60)], + ] + for i, (cfg, sc) in enumerate(zip(configs, scores_data)): + run = Run( + experiment_id=experiment.id, + config=cfg, + config_hash=f"hash_{i:03d}", + status=RunStatus.completed, + duration_ms=1000 + i * 500, + tokens_in=100 + i * 50, + tokens_out=200 + i * 100, + ) + db_session.add(run) + db_session.flush() + + for scorer_name, value in sc: + score = Score(run_id=run.id, scorer_name=scorer_name, value=value) + db_session.add(score) + + runs.append(run) + + db_session.commit() + for r in runs: + db_session.refresh(r) + return runs + + +@pytest.fixture +def auth_header(): + return {"X-Api-Key": API_KEY} + + +@pytest.fixture +def client(): + from main import app + return TestClient(app) + + +# --------------------------------------------------------------------------- +# Export Best — JSON +# --------------------------------------------------------------------------- + + +class TestExportBest: + def test_returns_best_config_json(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header) + assert resp.status_code == 200 + data = resp.json() + assert data["experiment_name"] == "Test Experiment" + assert data["config"]["model"] == "gpt-4" + assert data["config"]["temperature"] == 0.1 + assert data["weighted_score"] > 0 + assert "run_id" in data + assert "config_hash" in data + assert "exported_at" in data + + def test_best_uses_weighted_scores(self, client, auth_header, experiment, completed_runs): + """Run 0 has accuracy=0.95, fluency=0.80. With weights 0.7/0.3, score = (0.95*0.7 + 0.80*0.3)/1.0 = 0.905.""" + resp = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header) + data = resp.json() + assert abs(data["weighted_score"] - 0.905) < 0.001 + + def test_best_404_no_experiment(self, client, auth_header, admin_user): + fake_id = uuid.uuid4() + resp = client.get(f"/api/export/experiments/{fake_id}/best", headers=auth_header) + assert resp.status_code == 404 + + def test_best_404_no_completed_runs(self, client, auth_header, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header) + assert resp.status_code == 404 + assert "No completed runs" in resp.json()["detail"] + + def test_best_requires_auth(self, client, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/best") + assert resp.status_code in (401, 403) + + +# --------------------------------------------------------------------------- +# Export Best — .env +# --------------------------------------------------------------------------- + + +class TestExportEnv: + def test_returns_env_format(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header) + assert resp.status_code == 200 + assert resp.headers["content-type"] == "text/plain; charset=utf-8" + content = resp.text + assert "# PromptLooper" in content + assert "MODEL=" in content + assert "TEMPERATURE=" in content + + def test_env_has_metadata_comments(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header) + content = resp.text + assert "Test Experiment" in content + assert "Weighted score" in content + + def test_env_404_no_experiment(self, client, auth_header, admin_user): + fake_id = uuid.uuid4() + resp = client.get(f"/api/export/experiments/{fake_id}/env", headers=auth_header) + assert resp.status_code == 404 + + def test_env_404_no_runs(self, client, auth_header, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header) + assert resp.status_code == 404 + + def test_env_requires_auth(self, client, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/env") + assert resp.status_code in (401, 403) + + +# --------------------------------------------------------------------------- +# Export Best — YAML +# --------------------------------------------------------------------------- + + +class TestExportYaml: + def test_returns_yaml_format(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header) + assert resp.status_code == 200 + assert "text/yaml" in resp.headers["content-type"] + content = resp.text + assert "experiment_name: Test Experiment" in content + assert "config:" in content + + def test_yaml_has_metadata_comments(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header) + content = resp.text + assert "# PromptLooper" in content + assert "# Weighted score" in content + + def test_yaml_404_no_experiment(self, client, auth_header, admin_user): + fake_id = uuid.uuid4() + resp = client.get(f"/api/export/experiments/{fake_id}/yaml", headers=auth_header) + assert resp.status_code == 404 + + def test_yaml_404_no_runs(self, client, auth_header, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header) + assert resp.status_code == 404 + + def test_yaml_requires_auth(self, client, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/yaml") + assert resp.status_code in (401, 403) + + +# --------------------------------------------------------------------------- +# Export Report — Markdown +# --------------------------------------------------------------------------- + + +class TestExportReport: + def test_returns_markdown_report(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + assert resp.status_code == 200 + assert "text/markdown" in resp.headers["content-type"] + content = resp.text + assert "# Experiment Report: Test Experiment" in content + + def test_report_contains_config_space(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "## Configuration Space" in content + assert "temperature" in content + + def test_report_contains_top_configs(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "## Top" in content + assert "Weighted Score" in content + + def test_report_contains_score_distributions(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "## Score Distributions" in content + assert "accuracy" in content + assert "fluency" in content + + def test_report_contains_token_usage(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "## Token Usage" in content + assert "Total tokens in" in content + + def test_report_contains_timing(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "## Timing" in content + assert "Fastest run" in content + + def test_report_run_summary(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "## Run Summary" in content + assert "Total runs" in content + assert "Completed" in content + + def test_report_custom_top_n(self, client, auth_header, experiment, completed_runs): + resp = client.get( + f"/api/export/experiments/{experiment.id}/report?top_n=2", + headers=auth_header, + ) + assert resp.status_code == 200 + content = resp.text + assert "## Top 2 Configurations" in content + + def test_report_empty_experiment(self, client, auth_header, experiment): + """Report should work even with no runs.""" + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + assert resp.status_code == 200 + content = resp.text + assert "Total runs | 0" in content + assert "_No scored runs available._" in content + + def test_report_404_no_experiment(self, client, auth_header, admin_user): + fake_id = uuid.uuid4() + resp = client.get(f"/api/export/experiments/{fake_id}/report", headers=auth_header) + assert resp.status_code == 404 + + def test_report_requires_auth(self, client, experiment): + resp = client.get(f"/api/export/experiments/{experiment.id}/report") + assert resp.status_code in (401, 403) + + def test_report_with_failed_runs(self, client, auth_header, experiment, completed_runs, db_session): + """Report should count failed runs separately.""" + from models import Run, RunStatus + failed = Run( + experiment_id=experiment.id, + config={"model": "bad", "temperature": 0.5}, + config_hash="hash_fail", + status=RunStatus.failed, + ) + db_session.add(failed) + db_session.commit() + + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "Total runs | 4" in content + assert "Failed | 1" in content + + def test_report_description_shown(self, client, auth_header, experiment, completed_runs): + resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header) + content = resp.text + assert "An experiment for testing exports" in content + + +# --------------------------------------------------------------------------- +# Helper function tests +# --------------------------------------------------------------------------- + + +class TestHelpers: + def test_flatten_dict_simple(self): + from routers.export import _flatten_dict + result = _flatten_dict({"model": "gpt-4", "temperature": 0.5}) + assert result == {"MODEL": "gpt-4", "TEMPERATURE": "0.5"} + + def test_flatten_dict_nested(self): + from routers.export import _flatten_dict + result = _flatten_dict({"llm": {"model": "gpt-4", "temp": 0.1}}) + assert result == {"LLM_MODEL": "gpt-4", "LLM_TEMP": "0.1"} + + def test_flatten_dict_list(self): + from routers.export import _flatten_dict + result = _flatten_dict({"tags": ["a", "b"]}) + assert result == {"TAGS": '["a", "b"]'} + + def test_dict_to_yaml_simple(self): + from routers.export import _dict_to_yaml + result = _dict_to_yaml({"name": "test", "value": 42}) + assert "name: test" in result + assert "value: 42" in result + + def test_dict_to_yaml_nested(self): + from routers.export import _dict_to_yaml + result = _dict_to_yaml({"config": {"model": "gpt-4"}}) + assert "config:" in result + assert " model: gpt-4" in result + + def test_dict_to_yaml_bool_and_none(self): + from routers.export import _dict_to_yaml + result = _dict_to_yaml({"enabled": True, "disabled": False, "empty": None}) + assert "enabled: true" in result + assert "disabled: false" in result + assert "empty: null" in result diff --git a/backend/tests/test_routers.py b/backend/tests/test_routers.py index 0a1bad6..5652b1c 100644 --- a/backend/tests/test_routers.py +++ b/backend/tests/test_routers.py @@ -174,22 +174,22 @@ def test_endpoints_test(client): def test_export_best(client): resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/best") - assert resp.status_code == 501 + assert resp.status_code == 401 def test_export_env(client): resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/env") - assert resp.status_code == 501 + assert resp.status_code == 401 def test_export_yaml(client): resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/yaml") - assert resp.status_code == 501 + assert resp.status_code == 401 def test_export_report(client): resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/report") - assert resp.status_code == 501 + assert resp.status_code == 401 # ---- Webhooks router (/api/webhooks) ----