-
Compare
-
Compare results across runs and experiments.
+
+
+ {label}
+
+
+
+
+ );
+}
+/**
+ * Renders the config diff between two runs, one row per diff entry.
+ *
+ * With an empty `entries` array it returns a "No config to compare."
+ * placeholder. Otherwise each entry shows its key followed by its value(s),
+ * serialized with JSON.stringify, according to the entry type:
+ *   - "changed": the left value, an arrow, then the right value
+ *   - "added":   a plus sign followed by the right value
+ *   - "removed": a minus sign followed by the left value
+ *   - any other type: the left value only
+ *
+ * NOTE(review): the JSX element tags are not visible in this view of the
+ * file, so the exact markup, and the element that receives `bg`, cannot be
+ * confirmed from here.
+ *
+ * @param entries Diff entries to display; the code reads `key`, `type`,
+ *   `leftValue` and `rightValue` from each one.
+ */
+function ConfigDiffView({ entries }: { entries: ConfigDiffEntry[] }) {
+ if (entries.length === 0) // nothing to diff: show the placeholder instead of an empty list
+ return (
+
+ No config to compare.
+
+ );
+ // Non-empty case: build one row per entry.
+ return (
+
+ {entries.map((entry) => {
+ const bg = // Tailwind background/border classes chosen by diff type: yellow = changed, green = added, red = removed, neutral otherwise
+ entry.type === "changed"
+ ? "bg-yellow-50 dark:bg-yellow-900/20 border-yellow-200 dark:border-yellow-800"
+ : entry.type === "added"
+ ? "bg-green-50 dark:bg-green-900/20 border-green-200 dark:border-green-800"
+ : entry.type === "removed"
+ ? "bg-red-50 dark:bg-red-900/20 border-red-200 dark:border-red-800"
+ : "bg-white dark:bg-slate-800 border-slate-200 dark:border-slate-700";
+ // presumably `bg` is applied as the row's className; the element using it is not visible here
+ return (
+
+
+ {entry.key}
+
+
+ {entry.type === "changed" ? (
+ <>
+
+ {JSON.stringify(entry.leftValue)}
+
+ →
+
+ {JSON.stringify(entry.rightValue)}
+
+ >
+ ) : entry.type === "added" ? (
+
+ + {JSON.stringify(entry.rightValue)}
+
+ ) : entry.type === "removed" ? (
+
+ - {JSON.stringify(entry.leftValue)}
+
+ ) : (
+
+ {JSON.stringify(entry.leftValue)}
+
+ )}
+
+
+ );
+ })}
+
+ );
+}
+/**
+ * Renders a line-by-line diff of two responses.
+ *
+ * With an empty `lines` array it returns a "No responses to compare."
+ * placeholder. Otherwise every line is output as a prefix followed by the
+ * line text; the prefix and the colour classes both depend on `line.type`
+ * ("added", "removed", or anything else for an unchanged line).
+ *
+ * NOTE(review): the JSX element tags are not visible in this view of the
+ * file, so where `cls` and the map index `idx` are used cannot be confirmed
+ * (presumably as className and React key respectively).
+ *
+ * @param lines Diff lines to display; the code reads `type` and `text` from
+ *   each one.
+ */
+function ResponseDiffView({ lines }: { lines: DiffLine[] }) {
+ if (lines.length === 0) // nothing to diff: show the placeholder instead of an empty list
+ return (
+
+ No responses to compare.
+
+ );
+ // Non-empty case: build one row per diff line.
+ return (
+
+ {lines.map((line, idx) => {
+ const cls = // Tailwind colour classes: green for added lines, red for removed lines, plain slate text otherwise
+ line.type === "added"
+ ? "bg-green-100 dark:bg-green-900/30 text-green-800 dark:text-green-300"
+ : line.type === "removed"
+ ? "bg-red-100 dark:bg-red-900/30 text-red-800 dark:text-red-300"
+ : "text-slate-700 dark:text-slate-300";
+ const prefix = // diff-style marker placed in front of the line text
+ line.type === "added" ? "+ " : line.type === "removed" ? "- " : " ";
+
+ return (
+
+ {prefix}
+ {line.text}
+
+ );
+ })}
+
+ );
+}
+/**
+ * Shows one scorer's values for the two runs being compared.
+ *
+ * The scorer name is rendered together with both raw values, each formatted
+ * to three decimal places and separated by "vs". Each value is also turned
+ * into a percentage clamped to the range 0 to 100.
+ *
+ * NOTE(review): the JSX element tags are not visible in this view of the
+ * file, so the use of `leftPct` / `rightPct` cannot be confirmed (presumably
+ * they set the widths of two bars). The multiplication by 100 suggests
+ * scores are expected to lie between 0 and 1; verify against the scorers.
+ *
+ * @param name Scorer name shown as the row label.
+ * @param leftValue Score of the left run.
+ * @param rightValue Score of the right run.
+ */
+function ScoreComparisonBar({
+ name,
+ leftValue,
+ rightValue,
+}: {
+ name: string;
+ leftValue: number;
+ rightValue: number;
+}) {
+ const leftPct = Math.max(0, Math.min(100, leftValue * 100)); // scale to percent, then clamp to [0, 100]
+ const rightPct = Math.max(0, Math.min(100, rightValue * 100)); // same scaling and clamping for the right run
+
+ return (
+
+
+
+ {name}
+
+
+
+ {leftValue.toFixed(3)}
+
+ vs
+
+ {rightValue.toFixed(3)}
+
+
+
+
+
+ );
+}
+
+// ---------------------------------------------------------------------------
+// ComparePage Component
+// ---------------------------------------------------------------------------
+/**
+ * Converts a run detail API response into the data shape used by the run
+ * card.
+ *
+ * Optional parts of the response are given defaults: a missing config hash
+ * becomes the summary "unknown", a missing config becomes an empty object,
+ * and missing score or stage-result lists become empty arrays. Scores and
+ * stage results are copied field by field, so only the fields listed below
+ * are carried over.
+ *
+ * @param run Run detail response to convert.
+ * @returns The equivalent RunCardData object.
+ */
+function toRunCardData(run: RunDetailResponse): RunCardData {
+ return {
+ run_id: run.id,
+ config_summary: run.config_hash?.slice(0, 12) ?? "unknown", // first 12 characters of the hash, or "unknown" when there is no hash
+ config: run.config ?? {},
+ status: run.status,
+ cached: run.status === "cached", // derived flag: true only for the exact status "cached"
+ duration_ms: run.duration_ms,
+ tokens_in: run.tokens_in,
+ tokens_out: run.tokens_out,
+ scores: (run.scores ?? []).map((s) => ({ // keep only the scorer name and value of each score
+ scorer_name: s.scorer_name,
+ value: s.value,
+ })),
+ stage_results: (run.stage_results ?? []).map((sr) => ({ // copy the listed per-stage fields one by one
+ stage_index: sr.stage_index,
+ prompt_sent: sr.prompt_sent,
+ response_raw: sr.response_raw,
+ model_used: sr.model_used,
+ parameters: sr.parameters,
+ tokens_in: sr.tokens_in,
+ tokens_out: sr.tokens_out,
+ latency_ms: sr.latency_ms,
+ })),
+ };
+}
+/**
+ * Page component for comparing two runs ("left" and "right") side by side.
+ *
+ * Behaviour visible in this block:
+ *   - The selected experiment and run ids are initialised from the URL search
+ *     params `leftExp`, `leftRun`, `rightExp` and `rightRun`, and written back
+ *     by an effect that depends on those four ids.
+ *   - The experiment list is fetched once on mount. Each side's run list is
+ *     fetched again when its experiment id changes, and each side's run detail
+ *     when its run id changes. Every fetch effect uses a `cancelled` flag so a
+ *     response that arrives after the effect's cleanup is ignored.
+ *   - Once both run details are loaded, three derived values are computed:
+ *     the config diff, a line diff of the first stage's raw response, and a
+ *     per-scorer score comparison.
+ *   - Picking a winner submits a `human_preference` score of 1.0 for the
+ *     winning run.
+ *
+ * NOTE(review): JSX element tags and generic type arguments are not visible
+ * in this view of the file, so the rendered markup in the second half of the
+ * function cannot be confirmed from here.
+ */
+export default function ComparePage() {
+ const [searchParams, setSearchParams] = useSearchParams();
+
+ // State: the experiment list plus its loading and error flags
+ const [allExperiments, setAllExperiments] = useState
(
+ [],
+ );
+ const [loadingExperiments, setLoadingExperiments] = useState(true);
+ const [error, setError] = useState(null);
+
+ // Left side: selected experiment/run ids (seeded from the URL), that experiment's runs, and the selected run's detail
+ const [leftExpId, setLeftExpId] = useState(
+ searchParams.get("leftExp") ?? "",
+ );
+ const [leftRunId, setLeftRunId] = useState(
+ searchParams.get("leftRun") ?? "",
+ );
+ const [leftRuns, setLeftRuns] = useState([]);
+ const [leftRunDetail, setLeftRunDetail] = useState(
+ null,
+ );
+ const [loadingLeftRuns, setLoadingLeftRuns] = useState(false);
+
+ // Right side: same set of state as the left side
+ const [rightExpId, setRightExpId] = useState(
+ searchParams.get("rightExp") ?? "",
+ );
+ const [rightRunId, setRightRunId] = useState(
+ searchParams.get("rightRun") ?? "",
+ );
+ const [rightRuns, setRightRuns] = useState([]);
+ const [rightRunDetail, setRightRunDetail] =
+ useState(null);
+ const [loadingRightRuns, setLoadingRightRuns] = useState(false);
+
+ // Winner pick: which side was chosen, and whether the "Saved!" confirmation is currently shown
+ const [winner, setWinner] = useState<"left" | "right" | null>(null);
+ const [pickSaved, setPickSaved] = useState(false);
+
+ // Load experiments on mount
+ useEffect(() => {
+ let cancelled = false; // set by the cleanup so late responses are ignored
+ setLoadingExperiments(true);
+ experimentsApi
+ .list()
+ .then((resp: ExperimentListResponse) => {
+ if (!cancelled) setAllExperiments(resp.items);
+ })
+ .catch((err: Error) => { // a failure here is surfaced through the `error` state
+ if (!cancelled) setError(err.message);
+ })
+ .finally(() => {
+ if (!cancelled) setLoadingExperiments(false);
+ });
+ return () => {
+ cancelled = true;
+ };
+ }, []);
+
+ // Load runs when experiment changes (left); with no experiment selected, clear the run list, run id and run detail instead
+ useEffect(() => {
+ if (!leftExpId) {
+ setLeftRuns([]);
+ setLeftRunId("");
+ setLeftRunDetail(null);
+ return;
+ }
+ let cancelled = false;
+ setLoadingLeftRuns(true);
+ runsApi
+ .list(leftExpId)
+ .then((resp: RunListResponse) => {
+ if (!cancelled) setLeftRuns(resp.items);
+ })
+ .catch(() => { // NOTE(review): the failure is swallowed; the list is just emptied and nothing is reported
+ if (!cancelled) setLeftRuns([]);
+ })
+ .finally(() => {
+ if (!cancelled) setLoadingLeftRuns(false);
+ });
+ return () => {
+ cancelled = true;
+ };
+ }, [leftExpId]);
+
+ // Load runs when experiment changes (right); mirrors the left-side effect above
+ useEffect(() => {
+ if (!rightExpId) {
+ setRightRuns([]);
+ setRightRunId("");
+ setRightRunDetail(null);
+ return;
+ }
+ let cancelled = false;
+ setLoadingRightRuns(true);
+ runsApi
+ .list(rightExpId)
+ .then((resp: RunListResponse) => {
+ if (!cancelled) setRightRuns(resp.items);
+ })
+ .catch(() => { // NOTE(review): the failure is swallowed; the list is just emptied and nothing is reported
+ if (!cancelled) setRightRuns([]);
+ })
+ .finally(() => {
+ if (!cancelled) setLoadingRightRuns(false);
+ });
+ return () => {
+ cancelled = true;
+ };
+ }, [rightExpId]);
+
+ // Load run detail when run selected (left); the detail is cleared when no run is selected
+ useEffect(() => {
+ if (!leftRunId) {
+ setLeftRunDetail(null);
+ return;
+ }
+ let cancelled = false;
+ runsApi
+ .get(leftRunId)
+ .then((resp: RunDetailResponse) => {
+ if (!cancelled) setLeftRunDetail(resp);
+ })
+ .catch(() => { // NOTE(review): fetch errors are not reported; the detail is just reset to null
+ if (!cancelled) setLeftRunDetail(null);
+ });
+ return () => {
+ cancelled = true;
+ };
+ }, [leftRunId]);
+
+ // Load run detail when run selected (right); mirrors the left-side effect above
+ useEffect(() => {
+ if (!rightRunId) {
+ setRightRunDetail(null);
+ return;
+ }
+ let cancelled = false;
+ runsApi
+ .get(rightRunId)
+ .then((resp: RunDetailResponse) => {
+ if (!cancelled) setRightRunDetail(resp);
+ })
+ .catch(() => { // NOTE(review): fetch errors are not reported; the detail is just reset to null
+ if (!cancelled) setRightRunDetail(null);
+ });
+ return () => {
+ cancelled = true;
+ };
+ }, [rightRunId]);
+
+ // Sync URL params: only non-empty ids are written, and the update is made with `replace: true`
+ useEffect(() => {
+ const params: Record = {};
+ if (leftExpId) params.leftExp = leftExpId;
+ if (leftRunId) params.leftRun = leftRunId;
+ if (rightExpId) params.rightExp = rightExpId;
+ if (rightRunId) params.rightRun = rightRunId;
+ setSearchParams(params, { replace: true });
+ }, [leftExpId, leftRunId, rightExpId, rightRunId, setSearchParams]);
+
+ // Config diff: empty until both run details are loaded; a missing config is treated as {}
+ const configDiff = useMemo(() => {
+ if (!leftRunDetail || !rightRunDetail) return [];
+ return computeConfigDiff(
+ leftRunDetail.config ?? {},
+ rightRunDetail.config ?? {},
+ );
+ }, [leftRunDetail, rightRunDetail]);
+
+ // Response diff (first stage response); a missing response is treated as an empty string
+ const responseDiff = useMemo(() => {
+ if (!leftRunDetail || !rightRunDetail) return [];
+ const leftResponse =
+ leftRunDetail.stage_results?.[0]?.response_raw ?? "";
+ const rightResponse =
+ rightRunDetail.stage_results?.[0]?.response_raw ?? "";
+ return computeLineDiff(leftResponse, rightResponse);
+ }, [leftRunDetail, rightRunDetail]);
+
+ // Score comparison: one row per scorer name found on either side, sorted by name
+ const scoreComparison = useMemo(() => {
+ if (!leftRunDetail || !rightRunDetail) return [];
+ const leftScores = new Map(
+ (leftRunDetail.scores ?? []).map((s) => [s.scorer_name, s.value]),
+ );
+ const rightScores = new Map(
+ (rightRunDetail.scores ?? []).map((s) => [s.scorer_name, s.value]),
+ );
+ const allNames = Array.from(
+ new Set([...leftScores.keys(), ...rightScores.keys()]),
+ ).sort();
+ return allNames.map((name) => ({
+ name,
+ left: leftScores.get(name) ?? 0, // NOTE(review): a scorer missing on this side shows as 0, the same as a real 0 score
+ right: rightScores.get(name) ?? 0, // NOTE(review): same fallback to 0 for the right side
+ }));
+ }, [leftRunDetail, rightRunDetail]);
+ // Records the user's preferred side; does nothing unless both run ids are set.
+ const handlePickWinner = useCallback(
+ (side: "left" | "right") => {
+ const winnerRunId = side === "left" ? leftRunId : rightRunId;
+ const loserRunId = side === "left" ? rightRunId : leftRunId;
+ if (!winnerRunId || !loserRunId) return;
+
+ // Submit a human_preference score for the winner
+ runsApi
+ .score(winnerRunId, {
+ scorer_name: "human_preference",
+ value: 1.0,
+ metadata: { compared_against: loserRunId, comparison_winner: true },
+ })
+ .then(() => {
+ setWinner(side);
+ setPickSaved(true);
+ setTimeout(() => setPickSaved(false), 2000); // hide the confirmation after 2 seconds; NOTE(review): this timer is never cleared
+ })
+ .catch(() => {
+ // Still show the pick locally even if API fails (pickSaved stays false, so no "Saved!" confirmation appears)
+ setWinner(side);
+ });
+ },
+ [leftRunId, rightRunId],
+ );
+ // True once both run details have loaded; gates the comparison sections below.
+ const bothSelected = leftRunDetail !== null && rightRunDetail !== null;
+
+ if (loadingExperiments) { // early return while the experiment list is still loading
+ return (
+
+
+ Loading experiments…
+
+
+ );
+ }
+ // NOTE(review): because of the early return above, the `!loadingExperiments` test in the empty-state condition further down is always true.
+ if (error) {
+ return (
+
+ );
+ }
+
+ return (
+
+
+ Compare Runs
+
+
+ {/* Run selectors — two columns */}
+
+
+ {
+ setLeftExpId(id);
+ setLeftRunId("");
+ setLeftRunDetail(null);
+ setWinner(null);
+ }}
+ onRunChange={(id) => {
+ setLeftRunId(id);
+ setWinner(null);
+ }}
+ allExperiments={allExperiments}
+ availableRuns={leftRuns}
+ loadingRuns={loadingLeftRuns}
+ />
+
+
+ {
+ setRightExpId(id);
+ setRightRunId("");
+ setRightRunDetail(null);
+ setWinner(null);
+ }}
+ onRunChange={(id) => {
+ setRightRunId(id);
+ setWinner(null);
+ }}
+ allExperiments={allExperiments}
+ availableRuns={rightRuns}
+ loadingRuns={loadingRightRuns}
+ />
+
+
+
+ {/* Comparison sections */}
+ {bothSelected && (
+
+ {/* Config diff */}
+
+
+ {/* Score comparison */}
+ {scoreComparison.length > 0 && (
+
+
+ Score Comparison
+
+
+
+ Run A
+
+
+ Run B
+
+
+
+ {scoreComparison.map((sc) => (
+
+ ))}
+
+
+ )}
+
+ {/* Response diff */}
+
+
+ {/* Run detail cards side by side */}
+
+
+ Full Run Details
+
+
+
+
+
+
+
+ {/* Pick winner */}
+
+
+ or
+
+ {pickSaved && (
+
+ Saved!
+
+ )}
+
+
+ )}
+
+ {/* Empty state when not both selected */}
+ {!bothSelected && !loadingExperiments && (
+
+
+ Select an experiment and run on each side to compare.
+
+
+ )}
 );
 }