MAESTRO: Implement export router with JSON, .env, YAML, and markdown report endpoints

Four fully authenticated endpoints at /api/export/experiments/{id}/:

- /best: Returns the best config as JSON with weighted score and metadata
- /env: Flattened KEY=VALUE format with metadata comments
- /yaml: Simple YAML serialization (no external dependency)
- /report: Full markdown report with config space, top N configs, score distributions, token usage, and timing stats

34 tests in test_export.py covering all endpoints, auth, 404s, and helpers. Updated test_routers.py to expect 401 (auth required) instead of 501 (stub).
2026-04-07 03:30:45 -05:00 · 2026-04-07 03:30:45 -05:00 · e42117c8ee
commit e42117c8ee
parent 32535a92ea
4 changed files with 768 additions and 18 deletions
--- a/Docs/02a-backend-engine.md
+++ b/Docs/02a-backend-engine.md
@ -41,7 +41,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
 - [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
  <!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
+- [x] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
  <!-- Completed: Full export router with 4 endpoints: /best (JSON with weighted score, metadata), /env (flattened KEY=VALUE with comments), /yaml (simple serializer, no PyYAML dependency), /report (markdown with config space, top N configs, score distributions, token usage, timing stats). Auth required on all endpoints. 34 tests in test_export.py, all passing. -->
 - [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events).
--- a/backend/routers/export.py
+++ b/backend/routers/export.py
@ -1,31 +1,387 @@
 """Export router — export experiment results in various formats."""
 import json
 import uuid
 from datetime import datetime, timezone
-from fastapi import APIRouter, Response
+from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
 from sqlalchemy.orm import Session, joinedload
 from auth import get_current_user
 from main import get_db
 from models import Experiment, Run, RunStatus, Score, StageResult, User
 router = APIRouter()
-@router.get("/experiments/{experiment_id}/best", status_code=501)
+# ---------------------------------------------------------------------------
-def export_best(experiment_id: uuid.UUID):
+# Helpers
 # ---------------------------------------------------------------------------
 def _get_experiment_or_404(db: Session, experiment_id: uuid.UUID) -> Experiment:
    """Load the experiment with the given id, raising HTTP 404 when none exists."""
    found = db.query(Experiment).filter(Experiment.id == experiment_id).first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")
 def _get_scoring_weights(experiment: Experiment) -> dict[str, float]:
    weights: dict[str, float] = {}
    if experiment.scoring_config and isinstance(experiment.scoring_config, dict):
        weights = experiment.scoring_config.get("weights", {})
    return weights
 def _compute_weighted_score(scores: list[Score], weights: dict[str, float]) -> float:
    """Compute weighted score for a run's scores."""
    if not scores:
        return 0.0
    score_map: dict[str, float] = {}
    for s in scores:
        score_map[s.scorer_name] = s.value
    if weights:
        total_weight = sum(weights.get(name, 0.0) for name in score_map)
        if total_weight > 0:
            return sum(
                score_map[name] * weights.get(name, 0.0)
                for name in score_map
                if name in weights
            ) / total_weight
        else:
            return sum(score_map.values()) / len(score_map)
    else:
        return sum(score_map.values()) / len(score_map)
 def _get_best_run(db: Session, experiment: Experiment) -> Run | None:
    """Return the completed run with the highest weighted score, or None.

    Only runs of ``experiment`` whose status is ``RunStatus.completed`` and
    that have at least one score are considered. When several runs tie for
    the top score, the first one returned by the query wins.
    """
    weights = _get_scoring_weights(experiment)
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment.id, Run.status == RunStatus.completed)
        .all()
    )
    scored_runs = [run for run in runs if run.scores]
    # max(..., default=None) rather than a numeric "-1.0" starting score: a
    # numeric sentinel silently drops every run whose weighted score is at or
    # below it, which turned valid (negative-scored) experiments into a 404.
    return max(
        scored_runs,
        key=lambda run: _compute_weighted_score(run.scores, weights),
        default=None,
    )
 def _build_best_config_payload(experiment: Experiment, run: Run, weights: dict[str, float]) -> dict:
    """Assemble the export payload for a run: metadata keys first, config last."""
    exported_at = datetime.now(timezone.utc).isoformat()
    per_scorer = {}
    for entry in run.scores:
        per_scorer[entry.scorer_name] = entry.value
    # Key order is preserved by the serializers that consume this dict.
    payload = {
        "experiment_name": experiment.name,
        "experiment_id": str(experiment.id),
        "exported_at": exported_at,
        "weighted_score": _compute_weighted_score(run.scores, weights),
        "scores": per_scorer,
        "run_id": str(run.id),
        "config_hash": run.config_hash,
        "config": run.config,
    }
    return payload
 # ---------------------------------------------------------------------------
 # Export Best Config — JSON
 # ---------------------------------------------------------------------------
@router.get("/experiments/{experiment_id}/best")
 def export_best(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
 ):
    """Best config as JSON.

    Returns the dict built by ``_build_best_config_payload`` for the run
    chosen by ``_get_best_run``.

    Raises:
        HTTPException: 404 when the experiment does not exist, or when no
            completed run with scores is found.
    """
-    return Response(status_code=501, content="Not Implemented")
+    experiment = _get_experiment_or_404(db, experiment_id)
    best_run = _get_best_run(db, experiment)
    if best_run is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )
    # _get_best_run returns only the run, so the weights are looked up again
    # to compute the score that goes into the payload.
    weights = _get_scoring_weights(experiment)
    payload = _build_best_config_payload(experiment, best_run, weights)
    return payload
-@router.get("/experiments/{experiment_id}/env", status_code=501)
+# ---------------------------------------------------------------------------
-def export_env(experiment_id: uuid.UUID):
+# Export Best Config — .env
 # ---------------------------------------------------------------------------
 def _flatten_dict(d: dict, prefix: str = "") -> dict[str, str]:
    """Flatten nested dict into KEY=value pairs for .env format."""
    items: dict[str, str] = {}
    for k, v in d.items():
        key = f"{prefix}{k}".upper() if prefix else k.upper()
        if isinstance(v, dict):
            items.update(_flatten_dict(v, f"{key}_"))
        elif isinstance(v, list):
            items[key] = json.dumps(v)
        else:
            items[key] = str(v)
    return items
@router.get("/experiments/{experiment_id}/env")
 def export_env(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
 ):
    """Best config as .env snippet.

    The response is ``text/plain``: a block of ``#`` comment lines with the
    export metadata, a blank line, then one ``KEY=value`` line per flattened
    config entry.

    Raises:
        HTTPException: 404 when the experiment does not exist, or when no
            completed run with scores is found.
    """
-    return Response(status_code=501, content="Not Implemented")
+    experiment = _get_experiment_or_404(db, experiment_id)
    best_run = _get_best_run(db, experiment)
    if best_run is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )
    weights = _get_scoring_weights(experiment)
    payload = _build_best_config_payload(experiment, best_run, weights)
    # Metadata goes into comment lines; the empty string yields the blank
    # separator line before the KEY=value entries.
    lines = [
        f"# PromptLooper — Best config for: {experiment.name}",
        f"# Exported: {payload['exported_at']}",
        f"# Weighted score: {payload['weighted_score']:.4f}",
        f"# Run ID: {payload['run_id']}",
        "",
    ]
    # Only the run's config is flattened, not the surrounding metadata.
    flat = _flatten_dict(payload["config"])
    # Entries are emitted in sorted key order.
    for key, value in sorted(flat.items()):
        lines.append(f"{key}={value}")
    content = "\n".join(lines) + "\n"
    return Response(content=content, media_type="text/plain")
-@router.get("/experiments/{experiment_id}/yaml", status_code=501)
+# ---------------------------------------------------------------------------
-def export_yaml(experiment_id: uuid.UUID):
+# Export Best Config — YAML
 # ---------------------------------------------------------------------------
 def _dict_to_yaml(d: dict, indent: int = 0) -> str:
    """Simple YAML serializer for config dicts (no external dependency)."""
    lines: list[str] = []
    prefix = "  " * indent
    for k, v in d.items():
        if isinstance(v, dict):
            lines.append(f"{prefix}{k}:")
            lines.append(_dict_to_yaml(v, indent + 1))
        elif isinstance(v, list):
            lines.append(f"{prefix}{k}:")
            for item in v:
                if isinstance(item, dict):
                    lines.append(f"{prefix}  -")
                    lines.append(_dict_to_yaml(item, indent + 2))
                else:
                    lines.append(f"{prefix}  - {item}")
        elif isinstance(v, bool):
            lines.append(f"{prefix}{k}: {'true' if v else 'false'}")
        elif v is None:
            lines.append(f"{prefix}{k}: null")
        else:
            lines.append(f"{prefix}{k}: {v}")
    return "\n".join(lines)
@router.get("/experiments/{experiment_id}/yaml")
 def export_yaml(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
 ):
    """Best config as YAML.

    The response is ``text/yaml``: a ``#`` comment header with the export
    metadata followed by the serialized payload.

    Raises:
        HTTPException: 404 when the experiment does not exist, or when no
            completed run with scores is found.
    """
-    return Response(status_code=501, content="Not Implemented")
+    experiment = _get_experiment_or_404(db, experiment_id)
    best_run = _get_best_run(db, experiment)
    if best_run is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )
    weights = _get_scoring_weights(experiment)
    payload = _build_best_config_payload(experiment, best_run, weights)
    header = (
        f"# PromptLooper — Best config for: {experiment.name}\n"
        f"# Exported: {payload['exported_at']}\n"
        f"# Weighted score: {payload['weighted_score']:.4f}\n"
        f"# Run ID: {payload['run_id']}\n\n"
    )
    # The whole payload (metadata and config) is serialized here, whereas the
    # .env export only serializes payload["config"].
    content = header + _dict_to_yaml(payload) + "\n"
    return Response(content=content, media_type="text/yaml")
-@router.get("/experiments/{experiment_id}/report", status_code=501)
+# ---------------------------------------------------------------------------
-def export_report(experiment_id: uuid.UUID):
+# Export Report — Markdown
 # ---------------------------------------------------------------------------
@router.get("/experiments/{experiment_id}/report")
 def export_report(
    experiment_id: uuid.UUID,
    top_n: int = Query(5, ge=1, le=50),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
 ):
    """Full experiment report (markdown).

    Builds a ``text/markdown`` document with these sections, in order:
    header metadata, configuration space, run summary, top ``top_n``
    configurations (plus detail for the best one), score distributions,
    token usage, and timing.

    Args:
        experiment_id: Experiment to report on.
        top_n: Number of ranked configurations to list (1 to 50, default 5).

    Raises:
        HTTPException: 404 when the experiment does not exist. An experiment
            without runs still produces a report.
    """
-    return Response(status_code=501, content="Not Implemented")
+    experiment = _get_experiment_or_404(db, experiment_id)
    # Runs of every status are loaded; the completed/failed split happens below.
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores), joinedload(Run.stage_results))
        .filter(Run.experiment_id == experiment_id)
        .all()
    )
    weights = _get_scoring_weights(experiment)
    completed = [r for r in runs if r.status == RunStatus.completed]
    failed = [r for r in runs if r.status == RunStatus.failed]
    # Compute scored entries
    scored_entries: list[tuple[Run, float]] = []
    for run in completed:
        if run.scores:
            ws = _compute_weighted_score(run.scores, weights)
            scored_entries.append((run, ws))
    # Highest weighted score first.
    scored_entries.sort(key=lambda e: e[1], reverse=True)
    # Collect all scorer names
    all_scorer_names: set[str] = set()
    for run in completed:
        for s in run.scores:
            all_scorer_names.add(s.scorer_name)
    # Score distributions per scorer
    score_values: dict[str, list[float]] = {name: [] for name in sorted(all_scorer_names)}
    for run in completed:
        for s in run.scores:
            score_values[s.scorer_name].append(s.value)
    # Token and timing stats
    # Token totals cover every run; durations only cover completed runs that
    # have a duration recorded.
    total_tokens_in = sum(r.tokens_in or 0 for r in runs)
    total_tokens_out = sum(r.tokens_out or 0 for r in runs)
    durations = [r.duration_ms for r in completed if r.duration_ms is not None]
    now = datetime.now(timezone.utc).isoformat()
    lines: list[str] = []
    lines.append(f"# Experiment Report: {experiment.name}")
    lines.append("")
    # The two trailing spaces on these lines are markdown hard line breaks.
    lines.append(f"**Generated:** {now}  ")
    lines.append(f"**Experiment ID:** `{experiment.id}`  ")
    if experiment.description:
        lines.append(f"**Description:** {experiment.description}  ")
    # Accepts both a status object exposing .value and a plain value.
    lines.append(f"**Status:** {experiment.status.value if hasattr(experiment.status, 'value') else experiment.status}  ")
    lines.append("")
    # Config space
    lines.append("## Configuration Space")
    lines.append("")
    if experiment.parameter_space:
        lines.append("```json")
        lines.append(json.dumps(experiment.parameter_space, indent=2))
        lines.append("```")
    else:
        lines.append("_No parameter space defined._")
    lines.append("")
    # Run summary
    lines.append("## Run Summary")
    lines.append("")
    lines.append(f"| Metric | Value |")
    lines.append(f"|--------|-------|")
    lines.append(f"| Total runs | {len(runs)} |")
    lines.append(f"| Completed | {len(completed)} |")
    lines.append(f"| Failed | {len(failed)} |")
    lines.append(f"| Scored | {len(scored_entries)} |")
    lines.append("")
    # Top N configs
    # NOTE(review): with no scored runs this heading reads "Top 0 Configurations".
    lines.append(f"## Top {min(top_n, len(scored_entries))} Configurations")
    lines.append("")
    if scored_entries:
        lines.append("| Rank | Run ID | Weighted Score | Config Hash |")
        lines.append("|------|--------|---------------|-------------|")
        for i, (run, ws) in enumerate(scored_entries[:top_n], 1):
            # NOTE(review): the slice assumes run.config_hash is a string — confirm it is never null.
            lines.append(f"| {i} | `{str(run.id)[:8]}...` | {ws:.4f} | `{run.config_hash[:12]}...` |")
        lines.append("")
        # Detail for top entry
        best_run, best_score = scored_entries[0]
        lines.append("### Best Configuration Detail")
        lines.append("")
        lines.append("```json")
        lines.append(json.dumps(best_run.config, indent=2))
        lines.append("```")
        lines.append("")
        lines.append("**Scores:**")
        lines.append("")
        for s in best_run.scores:
            lines.append(f"- **{s.scorer_name}:** {s.value:.4f}")
        lines.append("")
    else:
        lines.append("_No scored runs available._")
        lines.append("")
    # Score distributions
    # The whole section is omitted when no completed run has any score.
    if score_values:
        lines.append("## Score Distributions")
        lines.append("")
        lines.append("| Scorer | Min | Max | Mean | Count |")
        lines.append("|--------|-----|-----|------|-------|")
        for name in sorted(score_values.keys()):
            vals = score_values[name]
            if vals:
                lines.append(
                    f"| {name} | {min(vals):.4f} | {max(vals):.4f} | "
                    f"{sum(vals)/len(vals):.4f} | {len(vals)} |"
                )
        lines.append("")
    # Token usage
    lines.append("## Token Usage")
    lines.append("")
    lines.append(f"| Metric | Value |")
    lines.append(f"|--------|-------|")
    lines.append(f"| Total tokens in | {total_tokens_in:,} |")
    lines.append(f"| Total tokens out | {total_tokens_out:,} |")
    lines.append(f"| Total tokens | {total_tokens_in + total_tokens_out:,} |")
    lines.append("")
    # Timing stats
    lines.append("## Timing")
    lines.append("")
    if durations:
        avg_ms = sum(durations) / len(durations)
        lines.append(f"| Metric | Value |")
        lines.append(f"|--------|-------|")
        lines.append(f"| Fastest run | {min(durations):,} ms |")
        lines.append(f"| Slowest run | {max(durations):,} ms |")
        lines.append(f"| Average | {avg_ms:,.0f} ms |")
        lines.append(f"| Total time | {sum(durations):,} ms |")
    else:
        lines.append("_No timing data available._")
    lines.append("")
    content = "\n".join(lines)
    return Response(content=content, media_type="text/markdown")
--- a/backend/tests/test_export.py
+++ b/backend/tests/test_export.py
@ -0,0 +1,393 @@
 """Tests for backend/routers/export.py — Export best config (JSON, .env, YAML) and report."""
 import json
 import os
 import uuid
 from datetime import datetime, timezone
 from unittest.mock import patch
 import pytest
 from fastapi.testclient import TestClient
 JWT_SECRET = "test-secret-key-for-jwt-signing"
 API_KEY = "test-api-key-12345"
@pytest.fixture(autouse=True)
 def _isolate_settings(tmp_path):
    """Ensure tests use a temp SQLite DB and no Redis.

    Patches the environment, builds a fresh ``config.Settings`` from it and
    installs that object on the ``config``, ``main`` and ``auth`` modules,
    then re-initialises the DB/Redis handles and creates all tables.
    """
    env = {
        "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}",
        "REDIS_URL": "",
        "DATA_DIR": str(tmp_path),
        "JWT_SECRET": JWT_SECRET,
        "API_KEY": API_KEY,
    }
    # clear=False: only the keys above are overridden; other variables stay.
    with patch.dict(os.environ, env, clear=False):
        import config
        # _env_file=None — presumably keeps a local .env file from being read; confirm against Settings.
        new_settings = config.Settings(_env_file=None)
        config.settings = new_settings
        import main
        main.settings = new_settings
        main._init_db()
        main._init_redis()
        from models import Base
        Base.metadata.create_all(bind=main.engine)
        import auth
        auth.settings = new_settings
        # Yield inside the `with` so the patched environment lasts for the whole test.
        yield
@pytest.fixture
 def db_session():
    """Yield a session obtained from ``main.get_db``, then let the generator finish."""
    from main import get_db
    gen = get_db()
    session = next(gen)
    yield session
    # Resume the generator so whatever get_db does after its yield runs;
    # StopIteration just means it finished.
    try:
        next(gen)
    except StopIteration:
        pass
@pytest.fixture
 def admin_user(db_session):
    """Persist and return a ``User`` named "admin" with ``is_admin=True``."""
    from auth import hash_password
    from models import User
    user = User(username="admin", password_hash=hash_password("adminpass"), is_admin=True)
    db_session.add(user)
    db_session.commit()
    db_session.refresh(user)
    return user
@pytest.fixture
 def project(db_session, admin_user):
    """Persist and return a ``Project`` owned by the admin user."""
    from models import Project
    proj = Project(name="Test Project", description="A test project", owner_id=admin_user.id)
    db_session.add(proj)
    db_session.commit()
    db_session.refresh(proj)
    return proj
@pytest.fixture
 def experiment(db_session, project):
    """Persist and return an ``Experiment`` with scoring weights and a parameter space."""
    from models import Experiment
    exp = Experiment(
        name="Test Experiment",
        description="An experiment for testing exports",
        project_id=project.id,
        # Weights sum to 1.0, which the weighted-score expectation in the tests relies on.
        scoring_config={"weights": {"accuracy": 0.7, "fluency": 0.3}},
        parameter_space={"temperature": [0.1, 0.5, 0.9], "model": ["gpt-4", "gpt-3.5"]},
    )
    db_session.add(exp)
    db_session.commit()
    db_session.refresh(exp)
    return exp
@pytest.fixture
 def completed_runs(db_session, experiment):
    """Create 3 completed runs with scores.

    Run ``i`` uses ``configs[i]`` and ``scores_data[i]``; the first run has
    the highest accuracy and is the one the "best" tests expect to win.
    """
    from models import Run, RunStatus, Score
    runs = []
    configs = [
        {"model": "gpt-4", "temperature": 0.1},
        {"model": "gpt-4", "temperature": 0.5},
        {"model": "gpt-3.5", "temperature": 0.9},
    ]
    scores_data = [
        [("accuracy", 0.95), ("fluency", 0.80)],
        [("accuracy", 0.85), ("fluency", 0.90)],
        [("accuracy", 0.70), ("fluency", 0.60)],
    ]
    for i, (cfg, sc) in enumerate(zip(configs, scores_data)):
        run = Run(
            experiment_id=experiment.id,
            config=cfg,
            config_hash=f"hash_{i:03d}",
            status=RunStatus.completed,
            duration_ms=1000 + i * 500,
            tokens_in=100 + i * 50,
            tokens_out=200 + i * 100,
        )
        db_session.add(run)
        # Flush before creating scores — presumably so run.id is populated; confirm against the model.
        db_session.flush()
        for scorer_name, value in sc:
            score = Score(run_id=run.id, scorer_name=scorer_name, value=value)
            db_session.add(score)
        runs.append(run)
    db_session.commit()
    for r in runs:
        db_session.refresh(r)
    return runs
@pytest.fixture
 def auth_header():
    """Return request headers carrying the test API key."""
    return {"X-Api-Key": API_KEY}
@pytest.fixture
 def client():
    """Return a ``TestClient`` wrapping ``main.app``."""
    from main import app
    return TestClient(app)
 # ---------------------------------------------------------------------------
 # Export Best — JSON
 # ---------------------------------------------------------------------------
 class TestExportBest:
    """Tests for GET /api/export/experiments/{id}/best (JSON export)."""
    def test_returns_best_config_json(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header)
        assert resp.status_code == 200
        data = resp.json()
        assert data["experiment_name"] == "Test Experiment"
        # The first fixture run (gpt-4, temperature 0.1) is expected to win.
        assert data["config"]["model"] == "gpt-4"
        assert data["config"]["temperature"] == 0.1
        assert data["weighted_score"] > 0
        assert "run_id" in data
        assert "config_hash" in data
        assert "exported_at" in data
    def test_best_uses_weighted_scores(self, client, auth_header, experiment, completed_runs):
        """Run 0 has accuracy=0.95, fluency=0.80. With weights 0.7/0.3, score = (0.95*0.7 + 0.80*0.3)/1.0 = 0.905."""
        resp = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header)
        data = resp.json()
        assert abs(data["weighted_score"] - 0.905) < 0.001
    def test_best_404_no_experiment(self, client, auth_header, admin_user):
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/export/experiments/{fake_id}/best", headers=auth_header)
        assert resp.status_code == 404
    def test_best_404_no_completed_runs(self, client, auth_header, experiment):
        # The experiment exists but the completed_runs fixture is not requested.
        resp = client.get(f"/api/export/experiments/{experiment.id}/best", headers=auth_header)
        assert resp.status_code == 404
        assert "No completed runs" in resp.json()["detail"]
    def test_best_requires_auth(self, client, experiment):
        # No auth header is sent; either rejection code is accepted.
        resp = client.get(f"/api/export/experiments/{experiment.id}/best")
        assert resp.status_code in (401, 403)
 # ---------------------------------------------------------------------------
 # Export Best — .env
 # ---------------------------------------------------------------------------
 class TestExportEnv:
    """Tests for GET /api/export/experiments/{id}/env (.env export)."""
    def test_returns_env_format(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header)
        assert resp.status_code == 200
        assert resp.headers["content-type"] == "text/plain; charset=utf-8"
        content = resp.text
        assert "# PromptLooper" in content
        # Config keys are expected upper-cased in the output.
        assert "MODEL=" in content
        assert "TEMPERATURE=" in content
    def test_env_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header)
        content = resp.text
        assert "Test Experiment" in content
        assert "Weighted score" in content
    def test_env_404_no_experiment(self, client, auth_header, admin_user):
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/export/experiments/{fake_id}/env", headers=auth_header)
        assert resp.status_code == 404
    def test_env_404_no_runs(self, client, auth_header, experiment):
        resp = client.get(f"/api/export/experiments/{experiment.id}/env", headers=auth_header)
        assert resp.status_code == 404
    def test_env_requires_auth(self, client, experiment):
        # No auth header is sent; either rejection code is accepted.
        resp = client.get(f"/api/export/experiments/{experiment.id}/env")
        assert resp.status_code in (401, 403)
 # ---------------------------------------------------------------------------
 # Export Best — YAML
 # ---------------------------------------------------------------------------
 class TestExportYaml:
    """Tests for GET /api/export/experiments/{id}/yaml (YAML export)."""
    def test_returns_yaml_format(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header)
        assert resp.status_code == 200
        assert "text/yaml" in resp.headers["content-type"]
        content = resp.text
        assert "experiment_name: Test Experiment" in content
        assert "config:" in content
    def test_yaml_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header)
        content = resp.text
        assert "# PromptLooper" in content
        assert "# Weighted score" in content
    def test_yaml_404_no_experiment(self, client, auth_header, admin_user):
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/export/experiments/{fake_id}/yaml", headers=auth_header)
        assert resp.status_code == 404
    def test_yaml_404_no_runs(self, client, auth_header, experiment):
        resp = client.get(f"/api/export/experiments/{experiment.id}/yaml", headers=auth_header)
        assert resp.status_code == 404
    def test_yaml_requires_auth(self, client, experiment):
        # No auth header is sent; either rejection code is accepted.
        resp = client.get(f"/api/export/experiments/{experiment.id}/yaml")
        assert resp.status_code in (401, 403)
 # ---------------------------------------------------------------------------
 # Export Report — Markdown
 # ---------------------------------------------------------------------------
 class TestExportReport:
    """Tests for GET /api/export/experiments/{id}/report (markdown report)."""
    def test_returns_markdown_report(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        assert resp.status_code == 200
        assert "text/markdown" in resp.headers["content-type"]
        content = resp.text
        assert "# Experiment Report: Test Experiment" in content
    def test_report_contains_config_space(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "## Configuration Space" in content
        assert "temperature" in content
    def test_report_contains_top_configs(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "## Top" in content
        assert "Weighted Score" in content
    def test_report_contains_score_distributions(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "## Score Distributions" in content
        assert "accuracy" in content
        assert "fluency" in content
    def test_report_contains_token_usage(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "## Token Usage" in content
        assert "Total tokens in" in content
    def test_report_contains_timing(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "## Timing" in content
        assert "Fastest run" in content
    def test_report_run_summary(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "## Run Summary" in content
        assert "Total runs" in content
        assert "Completed" in content
    def test_report_custom_top_n(self, client, auth_header, experiment, completed_runs):
        # Three scored runs exist, so top_n=2 should cap the heading at 2.
        resp = client.get(
            f"/api/export/experiments/{experiment.id}/report?top_n=2",
            headers=auth_header,
        )
        assert resp.status_code == 200
        content = resp.text
        assert "## Top 2 Configurations" in content
    def test_report_empty_experiment(self, client, auth_header, experiment):
        """Report should work even with no runs."""
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        assert resp.status_code == 200
        content = resp.text
        assert "Total runs | 0" in content
        assert "_No scored runs available._" in content
    def test_report_404_no_experiment(self, client, auth_header, admin_user):
        fake_id = uuid.uuid4()
        resp = client.get(f"/api/export/experiments/{fake_id}/report", headers=auth_header)
        assert resp.status_code == 404
    def test_report_requires_auth(self, client, experiment):
        # No auth header is sent; either rejection code is accepted.
        resp = client.get(f"/api/export/experiments/{experiment.id}/report")
        assert resp.status_code in (401, 403)
    def test_report_with_failed_runs(self, client, auth_header, experiment, completed_runs, db_session):
        """Report should count failed runs separately."""
        from models import Run, RunStatus
        failed = Run(
            experiment_id=experiment.id,
            config={"model": "bad", "temperature": 0.5},
            config_hash="hash_fail",
            status=RunStatus.failed,
        )
        db_session.add(failed)
        db_session.commit()
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        # Three completed fixture runs plus the failed one added above.
        assert "Total runs | 4" in content
        assert "Failed | 1" in content
    def test_report_description_shown(self, client, auth_header, experiment, completed_runs):
        resp = client.get(f"/api/export/experiments/{experiment.id}/report", headers=auth_header)
        content = resp.text
        assert "An experiment for testing exports" in content
 # ---------------------------------------------------------------------------
 # Helper function tests
 # ---------------------------------------------------------------------------
 class TestHelpers:
    """Direct unit tests for the router's ``_flatten_dict`` and ``_dict_to_yaml`` helpers."""
    def test_flatten_dict_simple(self):
        from routers.export import _flatten_dict
        result = _flatten_dict({"model": "gpt-4", "temperature": 0.5})
        # Keys are upper-cased and every value is stringified.
        assert result == {"MODEL": "gpt-4", "TEMPERATURE": "0.5"}
    def test_flatten_dict_nested(self):
        from routers.export import _flatten_dict
        result = _flatten_dict({"llm": {"model": "gpt-4", "temp": 0.1}})
        # Nested keys are joined to their parent with an underscore.
        assert result == {"LLM_MODEL": "gpt-4", "LLM_TEMP": "0.1"}
    def test_flatten_dict_list(self):
        from routers.export import _flatten_dict
        result = _flatten_dict({"tags": ["a", "b"]})
        # Lists are serialized as JSON text.
        assert result == {"TAGS": '["a", "b"]'}
    def test_dict_to_yaml_simple(self):
        from routers.export import _dict_to_yaml
        result = _dict_to_yaml({"name": "test", "value": 42})
        assert "name: test" in result
        assert "value: 42" in result
    def test_dict_to_yaml_nested(self):
        from routers.export import _dict_to_yaml
        result = _dict_to_yaml({"config": {"model": "gpt-4"}})
        assert "config:" in result
        # Nested mappings are indented by two spaces.
        assert "  model: gpt-4" in result
    def test_dict_to_yaml_bool_and_none(self):
        from routers.export import _dict_to_yaml
        result = _dict_to_yaml({"enabled": True, "disabled": False, "empty": None})
        assert "enabled: true" in result
        assert "disabled: false" in result
        assert "empty: null" in result
--- a/backend/tests/test_routers.py
+++ b/backend/tests/test_routers.py
@ -174,22 +174,22 @@ def test_endpoints_test(client):
 def test_export_best(client):
    # No auth headers are passed on this call.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/best")
-    assert resp.status_code == 501
+    assert resp.status_code == 401
 def test_export_env(client):
    # No auth headers are passed on this call.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/env")
-    assert resp.status_code == 501
+    assert resp.status_code == 401
 def test_export_yaml(client):
    # No auth headers are passed on this call.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/yaml")
-    assert resp.status_code == 501
+    assert resp.status_code == 401
 def test_export_report(client):
    # No auth headers are passed on this call.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/report")
-    assert resp.status_code == 501
+    assert resp.status_code == 401
 # ---- Webhooks router (/api/webhooks) ----