MAESTRO: Implement export router with JSON, .env, YAML, and markdown report endpoints
Four fully authenticated endpoints at /api/export/experiments/{id}/:
- /best: Returns best config as JSON with weighted score and metadata
- /env: Flattened KEY=VALUE format with metadata comments
- /yaml: Simple YAML serialization (no external dependency)
- /report: Full markdown report with config space, top N configs,
score distributions, token usage, and timing stats
34 tests in test_export.py covering all endpoints, auth, 404s, and helpers.
Updated test_routers.py to expect 401 (auth required) instead of 501 (stub).
This commit is contained in:
parent
32535a92ea
commit
e42117c8ee
4 changed files with 768 additions and 18 deletions
|
|
@ -41,7 +41,8 @@ Implement the core experiment execution engine: LLM adapters, response caching,
|
||||||
- [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
|
- [x] Implement backend/routers/runs.py fully — list runs with filtering (by experiment, status, score range), get run detail with stage results and scores, POST for ad-hoc single runs, and POST /{id}/score for human ratings. Include the leaderboard endpoint that returns top N runs ranked by weighted score.
|
||||||
<!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
|
<!-- Completed: Full runs router with list (filter by experiment/status/score range + pagination), detail (eager-loaded stage results + scores), ad-hoc run creation with dispatch, human scoring POST, and leaderboard with configurable weighted scoring from experiment scoring_config. Added AdHocRunCreate, LeaderboardEntry, LeaderboardResponse schemas. 25 tests in test_runs.py, all passing. -->
|
||||||
|
|
||||||
- [ ] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
|
- [x] Implement backend/routers/export.py — export best config in JSON, .env, and YAML formats as defined in the spec. Include metadata (score, experiment name, timestamp). The report endpoint should generate a markdown summary of the experiment: config space explored, top 5 configs, score distributions, token usage, timing stats.
|
||||||
|
<!-- Completed: Full export router with 4 endpoints: /best (JSON with weighted score, metadata), /env (flattened KEY=VALUE with comments), /yaml (simple serializer, no PyYAML dependency), /report (markdown with config space, top N configs, score distributions, token usage, timing stats). Auth required on all endpoints. 34 tests in test_export.py, all passing. -->
|
||||||
|
|
||||||
- [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events).
|
- [ ] Implement backend/websocket/manager.py — WebSocket connection manager that: maintains active connections per experiment and globally, receives Redis pub/sub messages and broadcasts to relevant connections, handles connection/disconnection cleanly, supports reconnection with message replay (last N events).
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,31 +1,387 @@
|
||||||
"""Export router — export experiment results in various formats."""
|
"""Export router — export experiment results in various formats."""
|
||||||
|
|
||||||
|
import json
|
||||||
import uuid
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from fastapi import APIRouter, Response
|
from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
|
||||||
|
from sqlalchemy.orm import Session, joinedload
|
||||||
|
|
||||||
|
from auth import get_current_user
|
||||||
|
from main import get_db
|
||||||
|
from models import Experiment, Run, RunStatus, Score, StageResult, User
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
@router.get("/experiments/{experiment_id}/best", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def export_best(experiment_id: uuid.UUID):
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _get_experiment_or_404(db: Session, experiment_id: uuid.UUID) -> Experiment:
    """Fetch an experiment by primary key or abort the request.

    Args:
        db: Active SQLAlchemy session.
        experiment_id: Identifier of the experiment to load.

    Returns:
        The matching ``Experiment`` row.

    Raises:
        HTTPException: 404 with detail "Experiment not found" when no row matches.
    """
    lookup = db.query(Experiment).filter(Experiment.id == experiment_id)
    found = lookup.first()
    if found is not None:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment not found")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_scoring_weights(experiment: Experiment) -> dict[str, float]:
    """Return the per-scorer weights configured on an experiment.

    Reads ``experiment.scoring_config["weights"]``. An empty dict is returned
    when the scoring config is missing or not a dict, or when its ``weights``
    entry is missing or not a dict (for example stored as JSON ``null``), so
    callers can always treat the result as a mapping.

    Args:
        experiment: Experiment whose ``scoring_config`` is inspected.

    Returns:
        Mapping of scorer name to weight; empty when nothing usable is configured.
    """
    config = experiment.scoring_config
    if not isinstance(config, dict):
        return {}
    weights = config.get("weights")
    # A present-but-null (or otherwise malformed) entry must not leak through
    # as a non-dict, since callers invoke ``weights.get`` on the result.
    return weights if isinstance(weights, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_weighted_score(scores: list[Score], weights: dict[str, float]) -> float:
    """Collapse a run's scores into a single number.

    Scores are keyed by ``scorer_name``; if a scorer appears more than once
    the last value wins. When ``weights`` assigns a positive total weight to
    the scorers present, the result is the weight-normalised average over the
    scorers that have a weight entry. Otherwise it is the plain mean of all
    scorer values.

    Args:
        scores: Score rows of one run (each exposing ``scorer_name`` and ``value``).
        weights: Mapping of scorer name to weight; may be empty.

    Returns:
        The weighted (or plain) average, or ``0.0`` when ``scores`` is empty.
    """
    if not scores:
        return 0.0

    by_scorer: dict[str, float] = {entry.scorer_name: entry.value for entry in scores}

    if weights:
        weight_sum = sum(weights.get(scorer, 0.0) for scorer in by_scorer)
        if weight_sum > 0:
            weighted_total = sum(
                value * weights.get(scorer, 0.0)
                for scorer, value in by_scorer.items()
                if scorer in weights
            )
            return weighted_total / weight_sum

    # No weights, or none of them apply to the scorers present: unweighted mean.
    return sum(by_scorer.values()) / len(by_scorer)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_best_run(db: Session, experiment: Experiment) -> Run | None:
    """Return the best completed run by weighted score, or None.

    Only runs of ``experiment`` with status ``RunStatus.completed`` and at
    least one score are considered. Ties keep the first run encountered.

    Args:
        db: Active SQLAlchemy session.
        experiment: Experiment whose runs are ranked; its scoring weights are used.

    Returns:
        The highest-scoring run, or ``None`` when no completed run has scores.
    """
    weights = _get_scoring_weights(experiment)
    runs = (
        db.query(Run)
        .options(joinedload(Run.scores))
        .filter(Run.experiment_id == experiment.id, Run.status == RunStatus.completed)
        .all()
    )

    best_run: Run | None = None
    # Start below every finite score so that runs with negative weighted
    # scores (<= -1) can still be selected; a fixed -1.0 floor would drop them.
    best_score = float("-inf")
    for run in runs:
        if not run.scores:
            continue
        ws = _compute_weighted_score(run.scores, weights)
        if ws > best_score:
            best_score = ws
            best_run = run

    return best_run
|
||||||
|
|
||||||
|
|
||||||
|
def _build_best_config_payload(experiment: Experiment, run: Run, weights: dict[str, float]) -> dict:
    """Assemble the export payload (metadata plus config) for a run.

    Args:
        experiment: Experiment the run belongs to; supplies name and id.
        run: Run being exported; supplies scores, id, config hash and config.
        weights: Scorer weights used to compute ``weighted_score``.

    Returns:
        Dict with keys ``experiment_name``, ``experiment_id``, ``exported_at``
        (current UTC time, ISO 8601), ``weighted_score``, ``scores``,
        ``run_id``, ``config_hash`` and ``config``, in that order.
    """
    per_scorer = {}
    for entry in run.scores:
        per_scorer[entry.scorer_name] = entry.value

    exported_at = datetime.now(timezone.utc).isoformat()
    weighted = _compute_weighted_score(run.scores, weights)

    return {
        "experiment_name": experiment.name,
        "experiment_id": str(experiment.id),
        "exported_at": exported_at,
        "weighted_score": weighted,
        "scores": per_scorer,
        "run_id": str(run.id),
        "config_hash": run.config_hash,
        "config": run.config,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Export Best Config — JSON
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/experiments/{experiment_id}/best")
def export_best(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Best config as JSON."""
    # Responds 404 when the experiment does not exist or when it has no
    # completed run carrying scores; otherwise returns the payload dict.
    experiment = _get_experiment_or_404(db, experiment_id)
    winner = _get_best_run(db, experiment)
    if winner is not None:
        scorer_weights = _get_scoring_weights(experiment)
        return _build_best_config_payload(experiment, winner, scorer_weights)

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="No completed runs with scores found",
    )
|
||||||
|
|
||||||
|
|
||||||
@router.get("/experiments/{experiment_id}/env", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def export_env(experiment_id: uuid.UUID):
|
# Export Best Config — .env
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_dict(d: dict, prefix: str = "") -> dict[str, str]:
    """Flatten a nested dict into KEY=value pairs for .env format.

    Nested dict keys are joined with ``_`` and upper-cased. Lists are encoded
    as JSON. String values that would be unsafe as a bare .env value
    (containing whitespace, newlines, quotes, ``#``, ``$``, a backslash or a
    backtick) are emitted as a double-quoted, JSON-escaped string so a single
    value can never spill onto a second line. All other scalars use ``str()``.

    Args:
        d: Mapping to flatten.
        prefix: Key prefix accumulated from parent levels (already upper-case,
            ending in ``_``); callers normally leave it empty.

    Returns:
        Flat mapping of upper-cased key to its string value.
    """
    items: dict[str, str] = {}
    for k, v in d.items():
        # Formatting through an f-string also copes with non-string keys,
        # at every nesting level.
        key = f"{prefix}{k}".upper()
        if isinstance(v, dict):
            items.update(_flatten_dict(v, f"{key}_"))
        elif isinstance(v, list):
            items[key] = json.dumps(v)
        elif isinstance(v, str) and any(ch.isspace() or ch in "\"'#$\\`" for ch in v):
            # Quote and escape so newlines / comment markers stay inside the value.
            items[key] = json.dumps(v, ensure_ascii=False)
        else:
            items[key] = str(v)
    return items
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/experiments/{experiment_id}/env")
def export_env(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Best config as .env snippet."""
    # Output: four "#" metadata comment lines, a blank line, then the
    # flattened config as KEY=value lines sorted by key, newline-terminated.
    experiment = _get_experiment_or_404(db, experiment_id)
    winner = _get_best_run(db, experiment)
    if winner is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )

    payload = _build_best_config_payload(experiment, winner, _get_scoring_weights(experiment))

    preamble = [
        f"# PromptLooper — Best config for: {experiment.name}",
        f"# Exported: {payload['exported_at']}",
        f"# Weighted score: {payload['weighted_score']:.4f}",
        f"# Run ID: {payload['run_id']}",
        "",
    ]
    flattened = _flatten_dict(payload["config"])
    assignments = [f"{name}={value}" for name, value in sorted(flattened.items())]

    body = "\n".join(preamble + assignments) + "\n"
    return Response(content=body, media_type="text/plain")
|
||||||
|
|
||||||
|
|
||||||
@router.get("/experiments/{experiment_id}/yaml", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def export_yaml(experiment_id: uuid.UUID):
|
# Export Best Config — YAML
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _yaml_scalar(value: object) -> str:
    """Render one non-dict value as a YAML scalar (or JSON-style flow sequence)."""
    if isinstance(value, bool):
        return "true" if value else "false"
    if value is None:
        return "null"
    if isinstance(value, (list, tuple)):
        # Nested sequences are written in flow style; JSON is valid YAML here.
        return json.dumps(value, ensure_ascii=False)
    if not isinstance(value, str):
        return str(value)

    needs_quotes = (
        value == ""
        or value != value.strip()
        or any(ch in value for ch in "\n\r\t")
        or ": " in value
        or " #" in value
        or value.endswith(":")
        or value[0] in "-?:,[]{}#&*!|>'\"%@`"
        or value.lower() in {"true", "false", "null", "yes", "no", "on", "off", "~"}
    )
    if not needs_quotes:
        try:
            float(value)
        except ValueError:
            pass
        else:
            # A numeric-looking string must stay a string when re-loaded.
            needs_quotes = True
    # JSON string syntax is a valid YAML double-quoted scalar.
    return json.dumps(value, ensure_ascii=False) if needs_quotes else value


def _dict_to_yaml(d: dict, indent: int = 0) -> str:
    """Simple YAML serializer for config dicts (no external dependency).

    Nested dicts are written as indented block mappings and lists as block
    sequences. Scalars are rendered so that they round-trip: booleans as
    ``true``/``false`` and ``None`` as ``null`` (also inside lists), strings
    that would be ambiguous or invalid as plain scalars are double-quoted,
    and empty containers are written as ``{}`` / ``[]`` instead of a bare
    ``key:``.

    Args:
        d: Mapping to serialize; keys are written as-is.
        indent: Current nesting depth (one space of indentation per level).

    Returns:
        The YAML text without a trailing newline.
    """
    lines: list[str] = []
    prefix = " " * indent
    for k, v in d.items():
        if isinstance(v, dict):
            if v:
                lines.append(f"{prefix}{k}:")
                lines.append(_dict_to_yaml(v, indent + 1))
            else:
                lines.append(f"{prefix}{k}: {{}}")
        elif isinstance(v, list):
            if not v:
                lines.append(f"{prefix}{k}: []")
                continue
            lines.append(f"{prefix}{k}:")
            for item in v:
                if isinstance(item, dict) and item:
                    lines.append(f"{prefix} -")
                    lines.append(_dict_to_yaml(item, indent + 2))
                elif isinstance(item, dict):
                    lines.append(f"{prefix} - {{}}")
                else:
                    lines.append(f"{prefix} - {_yaml_scalar(item)}")
        else:
            lines.append(f"{prefix}{k}: {_yaml_scalar(v)}")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/experiments/{experiment_id}/yaml")
def export_yaml(
    experiment_id: uuid.UUID,
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Best config as YAML."""
    # Output: four "#" metadata comment lines, a blank line, then the whole
    # payload (metadata and config) serialized by _dict_to_yaml.
    experiment = _get_experiment_or_404(db, experiment_id)
    winner = _get_best_run(db, experiment)
    if winner is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed runs with scores found",
        )

    payload = _build_best_config_payload(experiment, winner, _get_scoring_weights(experiment))

    comment_lines = [
        f"# PromptLooper — Best config for: {experiment.name}",
        f"# Exported: {payload['exported_at']}",
        f"# Weighted score: {payload['weighted_score']:.4f}",
        f"# Run ID: {payload['run_id']}",
    ]
    document = "\n".join(comment_lines) + "\n\n" + _dict_to_yaml(payload) + "\n"
    return Response(content=document, media_type="text/yaml")
|
||||||
|
|
||||||
|
|
||||||
@router.get("/experiments/{experiment_id}/report", status_code=501)
|
# ---------------------------------------------------------------------------
|
||||||
def export_report(experiment_id: uuid.UUID):
|
# Export Report — Markdown
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/experiments/{experiment_id}/report")
def export_report(
    experiment_id: uuid.UUID,
    top_n: int = Query(5, ge=1, le=50),
    db: Session = Depends(get_db),
    _user: User = Depends(get_current_user),
):
    """Full experiment report (markdown)."""
    # Sections, in order: header, configuration space, run summary, top-N
    # table with the best config in detail, per-scorer score distributions,
    # token usage and timing. Responds 404 when the experiment is missing.
    experiment = _get_experiment_or_404(db, experiment_id)

    all_runs = (
        db.query(Run)
        .options(joinedload(Run.scores), joinedload(Run.stage_results))
        .filter(Run.experiment_id == experiment_id)
        .all()
    )
    weights = _get_scoring_weights(experiment)
    done = [r for r in all_runs if r.status == RunStatus.completed]
    failed_count = sum(1 for r in all_runs if r.status == RunStatus.failed)

    # Completed runs that carry scores, best weighted score first.
    ranked: list[tuple[Run, float]] = [
        (r, _compute_weighted_score(r.scores, weights)) for r in done if r.scores
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # Every score value of the completed runs, grouped by scorer name.
    values_by_scorer: dict[str, list[float]] = {}
    for r in done:
        for score in r.scores:
            values_by_scorer.setdefault(score.scorer_name, []).append(score.value)

    # Token totals cover all runs; durations only completed runs that have one.
    tokens_in = sum(r.tokens_in or 0 for r in all_runs)
    tokens_out = sum(r.tokens_out or 0 for r in all_runs)
    durations = [r.duration_ms for r in done if r.duration_ms is not None]

    generated_at = datetime.now(timezone.utc).isoformat()
    # Enum members expose ``.value``; anything else is printed as-is.
    status_text = getattr(experiment.status, "value", experiment.status)

    out: list[str] = [
        f"# Experiment Report: {experiment.name}",
        "",
        f"**Generated:** {generated_at} ",
        f"**Experiment ID:** `{experiment.id}` ",
    ]
    if experiment.description:
        out.append(f"**Description:** {experiment.description} ")
    out += [f"**Status:** {status_text} ", ""]

    # Config space
    out += ["## Configuration Space", ""]
    if experiment.parameter_space:
        out += ["```json", json.dumps(experiment.parameter_space, indent=2), "```"]
    else:
        out.append("_No parameter space defined._")
    out.append("")

    # Run summary
    out += [
        "## Run Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total runs | {len(all_runs)} |",
        f"| Completed | {len(done)} |",
        f"| Failed | {failed_count} |",
        f"| Scored | {len(ranked)} |",
        "",
    ]

    # Top N configs
    out += [f"## Top {min(top_n, len(ranked))} Configurations", ""]
    if ranked:
        out += [
            "| Rank | Run ID | Weighted Score | Config Hash |",
            "|------|--------|---------------|-------------|",
        ]
        out += [
            f"| {rank} | `{str(entry.id)[:8]}...` | {ws:.4f} | `{entry.config_hash[:12]}...` |"
            for rank, (entry, ws) in enumerate(ranked[:top_n], 1)
        ]
        top_run = ranked[0][0]
        out += [
            "",
            "### Best Configuration Detail",
            "",
            "```json",
            json.dumps(top_run.config, indent=2),
            "```",
            "",
            "**Scores:**",
            "",
        ]
        out += [f"- **{s.scorer_name}:** {s.value:.4f}" for s in top_run.scores]
        out.append("")
    else:
        out += ["_No scored runs available._", ""]

    # Score distributions
    if values_by_scorer:
        out += [
            "## Score Distributions",
            "",
            "| Scorer | Min | Max | Mean | Count |",
            "|--------|-----|-----|------|-------|",
        ]
        for scorer in sorted(values_by_scorer):
            vals = values_by_scorer[scorer]
            if vals:
                mean = sum(vals) / len(vals)
                out.append(
                    f"| {scorer} | {min(vals):.4f} | {max(vals):.4f} | {mean:.4f} | {len(vals)} |"
                )
        out.append("")

    # Token usage
    out += [
        "## Token Usage",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total tokens in | {tokens_in:,} |",
        f"| Total tokens out | {tokens_out:,} |",
        f"| Total tokens | {tokens_in + tokens_out:,} |",
        "",
    ]

    # Timing stats
    out += ["## Timing", ""]
    if durations:
        avg_ms = sum(durations) / len(durations)
        out += [
            "| Metric | Value |",
            "|--------|-------|",
            f"| Fastest run | {min(durations):,} ms |",
            f"| Slowest run | {max(durations):,} ms |",
            f"| Average | {avg_ms:,.0f} ms |",
            f"| Total time | {sum(durations):,} ms |",
        ]
    else:
        out.append("_No timing data available._")
    out.append("")

    return Response(content="\n".join(out), media_type="text/markdown")
|
||||||
|
|
|
||||||
393
backend/tests/test_export.py
Normal file
393
backend/tests/test_export.py
Normal file
|
|
@ -0,0 +1,393 @@
|
||||||
|
"""Tests for backend/routers/export.py — Export best config (JSON, .env, YAML) and report."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
JWT_SECRET = "test-secret-key-for-jwt-signing"
|
||||||
|
API_KEY = "test-api-key-12345"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _isolate_settings(tmp_path):
    """Ensure tests use a temp SQLite DB and no Redis."""
    # Per-test environment: SQLite file under pytest's tmp_path, empty Redis
    # URL, and the fixed JWT secret / API key the tests authenticate with.
    env = {
        "DATABASE_URL": f"sqlite:///{tmp_path / 'test.db'}",
        "REDIS_URL": "",
        "DATA_DIR": str(tmp_path),
        "JWT_SECRET": JWT_SECRET,
        "API_KEY": API_KEY,
    }
    with patch.dict(os.environ, env, clear=False):
        import config
        # Build settings from the patched environment and install them on the
        # config module (presumably _env_file=None skips any .env file — confirm).
        new_settings = config.Settings(_env_file=None)
        config.settings = new_settings

        import main
        # main holds its own reference to settings, so it is replaced as well
        # before the DB and Redis handles are re-initialised.
        main.settings = new_settings
        main._init_db()
        main._init_redis()

        from models import Base
        # Create all tables on the engine main just initialised.
        Base.metadata.create_all(bind=main.engine)

        import auth
        auth.settings = new_settings

        yield
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def db_session():
    """Yield a session obtained from the app's ``get_db`` generator.

    After the test, the generator is advanced once more so that whatever
    ``get_db`` does after its ``yield`` gets a chance to run.
    """
    from main import get_db

    session_source = get_db()
    yield next(session_source)
    # Exhaust the generator; its StopIteration is expected and ignored.
    next(session_source, None)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def admin_user(db_session):
    """Persist and return an admin user named "admin"."""
    from auth import hash_password
    from models import User

    admin = User(
        username="admin",
        password_hash=hash_password("adminpass"),
        is_admin=True,
    )
    db_session.add(admin)
    db_session.commit()
    db_session.refresh(admin)
    return admin
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def project(db_session, admin_user):
    """Persist and return a project owned by the admin user."""
    from models import Project

    created = Project(
        name="Test Project",
        description="A test project",
        owner_id=admin_user.id,
    )
    db_session.add(created)
    db_session.commit()
    db_session.refresh(created)
    return created
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def experiment(db_session, project):
    """Persist and return an experiment with scoring weights and a parameter space."""
    from models import Experiment

    fields = {
        "name": "Test Experiment",
        "description": "An experiment for testing exports",
        "project_id": project.id,
        "scoring_config": {"weights": {"accuracy": 0.7, "fluency": 0.3}},
        "parameter_space": {"temperature": [0.1, 0.5, 0.9], "model": ["gpt-4", "gpt-3.5"]},
    }
    created = Experiment(**fields)
    db_session.add(created)
    db_session.commit()
    db_session.refresh(created)
    return created
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def completed_runs(db_session, experiment):
    """Create 3 completed runs with scores."""
    from models import Run, RunStatus, Score

    # (config, [(scorer_name, value), ...]) per run; the first entry has the
    # highest accuracy and is the one the export tests expect to win.
    specs = [
        ({"model": "gpt-4", "temperature": 0.1}, [("accuracy", 0.95), ("fluency", 0.80)]),
        ({"model": "gpt-4", "temperature": 0.5}, [("accuracy", 0.85), ("fluency", 0.90)]),
        ({"model": "gpt-3.5", "temperature": 0.9}, [("accuracy", 0.70), ("fluency", 0.60)]),
    ]

    created = []
    for index, (cfg, scorer_values) in enumerate(specs):
        run = Run(
            experiment_id=experiment.id,
            config=cfg,
            config_hash=f"hash_{index:03d}",
            status=RunStatus.completed,
            duration_ms=1000 + index * 500,
            tokens_in=100 + index * 50,
            tokens_out=200 + index * 100,
        )
        db_session.add(run)
        # Flush so run.id is available for the score rows below.
        db_session.flush()

        for scorer_name, value in scorer_values:
            db_session.add(Score(run_id=run.id, scorer_name=scorer_name, value=value))

        created.append(run)

    db_session.commit()
    for run in created:
        db_session.refresh(run)
    return created
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def auth_header():
    """Request headers carrying the test API key."""
    headers = {"X-Api-Key": API_KEY}
    return headers
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client():
    """FastAPI test client bound to the application instance in ``main``."""
    import main

    return TestClient(main.app)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Export Best — JSON
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExportBest:
    """Tests for GET /api/export/experiments/{id}/best (JSON export)."""

    @staticmethod
    def _url(experiment_id) -> str:
        """Build the endpoint URL for the given experiment id."""
        return f"/api/export/experiments/{experiment_id}/best"

    def test_returns_best_config_json(self, client, auth_header, experiment, completed_runs):
        resp = client.get(self._url(experiment.id), headers=auth_header)
        assert resp.status_code == 200

        data = resp.json()
        assert data["experiment_name"] == "Test Experiment"
        assert data["config"]["model"] == "gpt-4"
        assert data["config"]["temperature"] == 0.1
        assert data["weighted_score"] > 0
        for field in ("run_id", "config_hash", "exported_at"):
            assert field in data

    def test_best_uses_weighted_scores(self, client, auth_header, experiment, completed_runs):
        """Run 0 has accuracy=0.95, fluency=0.80. With weights 0.7/0.3, score = (0.95*0.7 + 0.80*0.3)/1.0 = 0.905."""
        resp = client.get(self._url(experiment.id), headers=auth_header)
        weighted = resp.json()["weighted_score"]
        assert abs(weighted - 0.905) < 0.001

    def test_best_404_no_experiment(self, client, auth_header, admin_user):
        resp = client.get(self._url(uuid.uuid4()), headers=auth_header)
        assert resp.status_code == 404

    def test_best_404_no_completed_runs(self, client, auth_header, experiment):
        resp = client.get(self._url(experiment.id), headers=auth_header)
        assert resp.status_code == 404
        assert "No completed runs" in resp.json()["detail"]

    def test_best_requires_auth(self, client, experiment):
        resp = client.get(self._url(experiment.id))
        assert resp.status_code in (401, 403)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Export Best — .env
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExportEnv:
    """Tests for GET /api/export/experiments/{id}/env (.env export)."""

    @staticmethod
    def _url(experiment_id) -> str:
        """Build the endpoint URL for the given experiment id."""
        return f"/api/export/experiments/{experiment_id}/env"

    def test_returns_env_format(self, client, auth_header, experiment, completed_runs):
        resp = client.get(self._url(experiment.id), headers=auth_header)
        assert resp.status_code == 200
        assert resp.headers["content-type"] == "text/plain; charset=utf-8"

        body = resp.text
        for fragment in ("# PromptLooper", "MODEL=", "TEMPERATURE="):
            assert fragment in body

    def test_env_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        body = client.get(self._url(experiment.id), headers=auth_header).text
        assert "Test Experiment" in body
        assert "Weighted score" in body

    def test_env_404_no_experiment(self, client, auth_header, admin_user):
        resp = client.get(self._url(uuid.uuid4()), headers=auth_header)
        assert resp.status_code == 404

    def test_env_404_no_runs(self, client, auth_header, experiment):
        resp = client.get(self._url(experiment.id), headers=auth_header)
        assert resp.status_code == 404

    def test_env_requires_auth(self, client, experiment):
        resp = client.get(self._url(experiment.id))
        assert resp.status_code in (401, 403)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Export Best — YAML
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExportYaml:
    """Tests for GET /api/export/experiments/{id}/yaml (YAML export)."""

    @staticmethod
    def _url(experiment_id) -> str:
        """Build the endpoint URL for the given experiment id."""
        return f"/api/export/experiments/{experiment_id}/yaml"

    def test_returns_yaml_format(self, client, auth_header, experiment, completed_runs):
        resp = client.get(self._url(experiment.id), headers=auth_header)
        assert resp.status_code == 200
        assert "text/yaml" in resp.headers["content-type"]

        body = resp.text
        assert "experiment_name: Test Experiment" in body
        assert "config:" in body

    def test_yaml_has_metadata_comments(self, client, auth_header, experiment, completed_runs):
        body = client.get(self._url(experiment.id), headers=auth_header).text
        assert "# PromptLooper" in body
        assert "# Weighted score" in body

    def test_yaml_404_no_experiment(self, client, auth_header, admin_user):
        resp = client.get(self._url(uuid.uuid4()), headers=auth_header)
        assert resp.status_code == 404

    def test_yaml_404_no_runs(self, client, auth_header, experiment):
        resp = client.get(self._url(experiment.id), headers=auth_header)
        assert resp.status_code == 404

    def test_yaml_requires_auth(self, client, experiment):
        resp = client.get(self._url(experiment.id))
        assert resp.status_code in (401, 403)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Export Report — Markdown
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExportReport:
    """Tests for the markdown report endpoint, /api/export/experiments/{id}/report."""

    def test_returns_markdown_report(self, client, auth_header, experiment, completed_runs):
        """The report responds 200 with a markdown content type and the title heading."""
        url = f"/api/export/experiments/{experiment.id}/report"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        assert "text/markdown" in response.headers["content-type"]
        markdown = response.text
        assert "# Experiment Report: Test Experiment" in markdown

    def test_report_contains_config_space(self, client, auth_header, experiment, completed_runs):
        """The report contains the configuration-space heading and mentions temperature."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("## Configuration Space", "temperature"):
            assert fragment in markdown

    def test_report_contains_top_configs(self, client, auth_header, experiment, completed_runs):
        """The report contains a "Top" section and a weighted-score label."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("## Top", "Weighted Score"):
            assert fragment in markdown

    def test_report_contains_score_distributions(self, client, auth_header, experiment, completed_runs):
        """The report contains the score-distribution heading plus both metric names."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("## Score Distributions", "accuracy", "fluency"):
            assert fragment in markdown

    def test_report_contains_token_usage(self, client, auth_header, experiment, completed_runs):
        """The report contains the token-usage heading and the input-token total row."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("## Token Usage", "Total tokens in"):
            assert fragment in markdown

    def test_report_contains_timing(self, client, auth_header, experiment, completed_runs):
        """The report contains the timing heading and the fastest-run row."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("## Timing", "Fastest run"):
            assert fragment in markdown

    def test_report_run_summary(self, client, auth_header, experiment, completed_runs):
        """The report contains the run-summary heading with total and completed rows."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("## Run Summary", "Total runs", "Completed"):
            assert fragment in markdown

    def test_report_custom_top_n(self, client, auth_header, experiment, completed_runs):
        """Passing top_n=2 in the query string is reflected in the section heading."""
        url = f"/api/export/experiments/{experiment.id}/report?top_n=2"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        markdown = response.text
        assert "## Top 2 Configurations" in markdown

    def test_report_empty_experiment(self, client, auth_header, experiment):
        """An experiment without runs still yields a 200 report with placeholder text."""
        url = f"/api/export/experiments/{experiment.id}/report"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 200
        markdown = response.text
        for fragment in ("Total runs | 0", "_No scored runs available._"):
            assert fragment in markdown

    def test_report_404_no_experiment(self, client, auth_header, admin_user):
        """An authenticated report request for a random experiment id returns 404."""
        fake_id = uuid.uuid4()
        url = f"/api/export/experiments/{fake_id}/report"
        response = client.get(url, headers=auth_header)
        assert response.status_code == 404

    def test_report_requires_auth(self, client, experiment):
        """A report request sent without auth headers is rejected with 401 or 403."""
        url = f"/api/export/experiments/{experiment.id}/report"
        response = client.get(url)
        assert response.status_code in (401, 403)

    def test_report_with_failed_runs(self, client, auth_header, experiment, completed_runs, db_session):
        """A failed run is added to the total and listed in its own row."""
        from models import Run, RunStatus

        # Persist one extra failed run alongside the completed_runs fixture.
        failed_run = Run(
            experiment_id=experiment.id,
            config={"model": "bad", "temperature": 0.5},
            config_hash="hash_fail",
            status=RunStatus.failed,
        )
        db_session.add(failed_run)
        db_session.commit()

        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        for fragment in ("Total runs | 4", "Failed | 1"):
            assert fragment in markdown

    def test_report_description_shown(self, client, auth_header, experiment, completed_runs):
        """The report body includes the experiment description text."""
        url = f"/api/export/experiments/{experiment.id}/report"
        markdown = client.get(url, headers=auth_header).text
        assert "An experiment for testing exports" in markdown
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helper function tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestHelpers:
    """Unit tests for the private helpers _flatten_dict and _dict_to_yaml in routers.export."""

    def test_flatten_dict_simple(self):
        """Flat keys are upper-cased and values are returned as strings."""
        from routers.export import _flatten_dict

        expected = {"MODEL": "gpt-4", "TEMPERATURE": "0.5"}
        flattened = _flatten_dict({"model": "gpt-4", "temperature": 0.5})
        assert flattened == expected

    def test_flatten_dict_nested(self):
        """Nested keys are joined with an underscore under the parent key."""
        from routers.export import _flatten_dict

        expected = {"LLM_MODEL": "gpt-4", "LLM_TEMP": "0.1"}
        flattened = _flatten_dict({"llm": {"model": "gpt-4", "temp": 0.1}})
        assert flattened == expected

    def test_flatten_dict_list(self):
        """A list value is rendered as a JSON-style array string."""
        from routers.export import _flatten_dict

        expected = {"TAGS": '["a", "b"]'}
        flattened = _flatten_dict({"tags": ["a", "b"]})
        assert flattened == expected

    def test_dict_to_yaml_simple(self):
        """Scalar entries are emitted as "key: value" lines."""
        from routers.export import _dict_to_yaml

        rendered = _dict_to_yaml({"name": "test", "value": 42})
        for line in ("name: test", "value: 42"):
            assert line in rendered

    def test_dict_to_yaml_nested(self):
        """A nested mapping is emitted under its parent key with its own entries."""
        from routers.export import _dict_to_yaml

        rendered = _dict_to_yaml({"config": {"model": "gpt-4"}})
        for line in ("config:", " model: gpt-4"):
            assert line in rendered

    def test_dict_to_yaml_bool_and_none(self):
        """True, False and None are rendered as true, false and null."""
        from routers.export import _dict_to_yaml

        rendered = _dict_to_yaml({"enabled": True, "disabled": False, "empty": None})
        for line in ("enabled: true", "disabled: false", "empty: null"):
            assert line in rendered
|
||||||
|
|
@ -174,22 +174,22 @@ def test_endpoints_test(client):
|
||||||
|
|
||||||
def test_export_best(client):
    """An unauthenticated request to the best-config export is rejected with 401."""
    # The endpoint is no longer a 501 stub; it is implemented and requires
    # authentication, so only the 401 expectation is kept.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/best")
    assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_export_env(client):
    """An unauthenticated request to the .env export is rejected with 401."""
    # The endpoint is no longer a 501 stub; it is implemented and requires
    # authentication, so only the 401 expectation is kept.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/env")
    assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_export_yaml(client):
    """An unauthenticated request to the YAML export is rejected with 401."""
    # The endpoint is no longer a 501 stub; it is implemented and requires
    # authentication, so only the 401 expectation is kept.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/yaml")
    assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
def test_export_report(client):
    """An unauthenticated request to the markdown report export is rejected with 401."""
    # The endpoint is no longer a 501 stub; it is implemented and requires
    # authentication, so only the 401 expectation is kept.
    resp = client.get("/api/export/experiments/00000000-0000-0000-0000-000000000001/report")
    assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
# ---- Webhooks router (/api/webhooks) ----
|
# ---- Webhooks router (/api/webhooks) ----
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue