MAESTRO: Implement Compare page with side-by-side run comparison, config/response diffs, and score overlay
- Two-column run selectors with experiment→run cascading dropdowns and URL state sync
- Config diff with color-coded same/changed/added/removed entries using key-level comparison
- Line-level LCS response diff with added/removed/same highlighting
- Score comparison with overlaid indigo/emerald bars per scorer
- Pick Winner buttons submit human_preference score via API
- Full RunCard detail view for each run side by side
- 26 tests added (10 diff helper unit tests + 16 component integration tests)
- App.test.tsx updated to mock experiments.list for ComparePage route
This commit is contained in:
parent
b3fb8e3063
commit
1d3917a44e
4 changed files with 1531 additions and 6 deletions
|
|
@ -38,7 +38,8 @@ Build the React frontend: setup wizard, experiment builder, real-time observabil
|
|||
- [x] Build the Run Card component (frontend/src/components/RunCard.tsx). Expandable card showing: config summary, all scores with visual bars, prompt sent (collapsible), raw response (collapsible with copy button), timing breakdown per stage, cache status badge. Used in both the leaderboard detail view and the Compare page.
|
||||
<!-- Implemented RunCard as expandable card with: header showing config_summary + CacheStatusBadge + duration; expandable detail with scores (ScoreBar visual bars per scorer), config JSON display, stage timing breakdown (model + latency + tokens per stage), collapsible prompt sections per stage, collapsible response sections per stage with copy-to-clipboard button, and metadata footer (run_id + total tokens). Uses CollapsibleSection sub-component for prompt/response sections. Supports defaultExpanded prop for use in Compare page. 26 tests added. -->
|
||||
|
||||
- [ ] Implement the Compare page (frontend/src/pages/Compare.tsx). Side-by-side comparison of any two runs. Two columns, each with a run selector (dropdown or search). Show: config diff (highlight what changed), response diff (inline text diff with highlights), score comparison (bar chart overlay), and a "pick winner" button for human rating.
|
||||
- [x] Implement the Compare page (frontend/src/pages/Compare.tsx). Side-by-side comparison of any two runs. Two columns, each with a run selector (dropdown or search). Show: config diff (highlight what changed), response diff (inline text diff with highlights), score comparison (bar chart overlay), and a "pick winner" button for human rating.
|
||||
<!-- Implemented in ComparePage.tsx. Two-column run selectors with experiment→run cascading dropdowns (URL state synced via searchParams). Config diff with color-coded entries (same/changed/added/removed). Line-level LCS response diff with added/removed/same highlighting. Score comparison with overlaid indigo/emerald bars per scorer. Full RunCard detail view for each run side by side. "Pick Winner" buttons submit a human_preference score via the runs.score() API with metadata. Winner state resets on run change. App.test.tsx updated for the new page behavior. 26 tests added (10 unit tests for diff helpers + 16 component integration tests). -->
|
||||
|
||||
- [ ] Build the Score Chart component (frontend/src/components/ScoreChart.tsx). Multiple chart types: (1) scatter plot of score vs parameter value (e.g. score vs temperature), (2) bar chart comparing top N configs, (3) line chart showing score progression over time as sweep runs. Use a lightweight charting library (recharts via CDN).
|
||||
|
||||
|
|
|
|||
|
|
@ -83,9 +83,13 @@ describe("App routing", () => {
|
|||
});
|
||||
|
||||
it("renders ComparePage at /compare", async () => {
|
||||
vi.spyOn(client.experiments, "list").mockResolvedValue({
|
||||
items: [],
|
||||
total: 0,
|
||||
});
|
||||
renderWithRouter("/compare");
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText("Compare")).toBeInTheDocument();
|
||||
expect(screen.getByText("Compare Runs")).toBeInTheDocument();
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
|||
759
frontend/src/pages/ComparePage.test.tsx
Normal file
759
frontend/src/pages/ComparePage.test.tsx
Normal file
|
|
@ -0,0 +1,759 @@
|
|||
import { render, screen, waitFor } from "@testing-library/react";
|
||||
import userEvent from "@testing-library/user-event";
|
||||
import { MemoryRouter } from "react-router-dom";
|
||||
import { describe, it, expect, vi, beforeEach } from "vitest";
|
||||
import ComparePage, {
|
||||
computeLineDiff,
|
||||
computeConfigDiff,
|
||||
} from "./ComparePage";
|
||||
import * as client from "../api/client";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mocks
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Spy handed out by the mocked useNavigate hook below.
const mockNavigate = vi.fn();

// Replace only useNavigate; every other react-router-dom export stays real.
vi.mock("react-router-dom", async () => {
  const realRouterDom = await vi.importActual("react-router-dom");
  const useNavigate = () => mockNavigate;
  return { ...realRouterDom, useNavigate };
});
|
||||
|
||||
// Fields that are identical for every experiment fixture.
const SHARED_EXPERIMENT_FIELDS: Omit<
  client.ExperimentResponse,
  "id" | "name" | "created_at" | "updated_at"
> = {
  project_id: "proj-1",
  description: null,
  sample_data: null,
  pipeline_stages: null,
  scoring_config: null,
  parameter_space: null,
  status: "completed",
};

// Two completed experiments in the same project, returned by experiments.list.
const MOCK_EXPERIMENTS: client.ExperimentResponse[] = [
  {
    ...SHARED_EXPERIMENT_FIELDS,
    id: "exp-1",
    name: "Experiment Alpha",
    created_at: "2026-04-01T10:00:00Z",
    updated_at: "2026-04-07T10:00:00Z",
  },
  {
    ...SHARED_EXPERIMENT_FIELDS,
    id: "exp-2",
    name: "Experiment Beta",
    created_at: "2026-04-02T10:00:00Z",
    updated_at: "2026-04-07T12:00:00Z",
  },
];
|
||||
|
||||
// Both run fixtures belong to exp-1 and have finished.
const SHARED_RUN_FIELDS: Pick<client.RunResponse, "experiment_id" | "status"> = {
  experiment_id: "exp-1",
  status: "completed",
};

// Two runs that share a model but differ in temperature (0.7 vs 0.3).
const MOCK_RUNS: client.RunResponse[] = [
  {
    ...SHARED_RUN_FIELDS,
    id: "run-1",
    config_hash: "abc12345deadbeef",
    config: { model: "gpt-4", temperature: 0.7 },
    started_at: "2026-04-07T10:01:00Z",
    completed_at: "2026-04-07T10:01:05Z",
    duration_ms: 5000,
    tokens_in: 100,
    tokens_out: 200,
    cost_estimate: 0.01,
  },
  {
    ...SHARED_RUN_FIELDS,
    id: "run-2",
    config_hash: "def67890cafebabe",
    config: { model: "gpt-4", temperature: 0.3 },
    started_at: "2026-04-07T10:02:00Z",
    completed_at: "2026-04-07T10:02:03Z",
    duration_ms: 3000,
    tokens_in: 80,
    tokens_out: 150,
    cost_estimate: 0.008,
  },
];
|
||||
|
||||
/** Builds one score row fixture; scorer_metadata is always null in these mocks. */
function buildMockScore(
  id: string,
  runId: string,
  scorerName: string,
  value: number,
  createdAt: string,
) {
  return {
    id,
    run_id: runId,
    scorer_name: scorerName,
    value,
    scorer_metadata: null,
    created_at: createdAt,
  };
}

// Detail payload for run-1: one stage whose response is three lines long.
const MOCK_RUN_DETAIL_1: client.RunDetailResponse = {
  ...MOCK_RUNS[0],
  stage_results: [
    {
      id: "sr-1",
      run_id: "run-1",
      stage_index: 0,
      prompt_sent: "Summarize the text.",
      response_raw: "This is the summary\nfrom run one.\nEnd.",
      model_used: "gpt-4",
      parameters: { temperature: 0.7 },
      tokens_in: 100,
      tokens_out: 200,
      latency_ms: 4500,
    },
  ],
  scores: [
    buildMockScore("sc-1", "run-1", "coherence", 0.85, "2026-04-07T10:01:05Z"),
    buildMockScore("sc-2", "run-1", "relevance", 0.72, "2026-04-07T10:01:05Z"),
  ],
};

// Detail payload for run-2: its response shares only the final "End." line with run-1.
const MOCK_RUN_DETAIL_2: client.RunDetailResponse = {
  ...MOCK_RUNS[1],
  stage_results: [
    {
      id: "sr-2",
      run_id: "run-2",
      stage_index: 0,
      prompt_sent: "Summarize the text briefly.",
      response_raw: "This is a different summary\nfrom run two.\nEnd.",
      model_used: "gpt-4",
      parameters: { temperature: 0.3 },
      tokens_in: 80,
      tokens_out: 150,
      latency_ms: 2800,
    },
  ],
  scores: [
    buildMockScore("sc-3", "run-2", "coherence", 0.91, "2026-04-07T10:02:03Z"),
    buildMockScore("sc-4", "run-2", "relevance", 0.65, "2026-04-07T10:02:03Z"),
  ],
};
|
||||
|
||||
/** Mounts ComparePage inside an in-memory router whose only entry is /compare. */
function renderCompare() {
  const page = (
    <MemoryRouter initialEntries={["/compare"]}>
      <ComparePage />
    </MemoryRouter>
  );
  return render(page);
}
|
||||
|
||||
/**
 * Installs the happy-path API spies used by most tests:
 * experiments.list and runs.list resolve with the fixtures, runs.get resolves
 * the matching detail fixture (rejecting with "Not found" for any other id),
 * and runs.score resolves with a human_preference score row.
 */
function setupDefaultMocks() {
  const detailByRunId = new Map<string, client.RunDetailResponse>([
    ["run-1", MOCK_RUN_DETAIL_1],
    ["run-2", MOCK_RUN_DETAIL_2],
  ]);

  vi.spyOn(client.experiments, "list").mockResolvedValue({
    items: MOCK_EXPERIMENTS,
    total: 2,
  });

  vi.spyOn(client.runs, "list").mockResolvedValue({
    items: MOCK_RUNS,
    total: 2,
  });

  vi.spyOn(client.runs, "get").mockImplementation(async (runId: string) => {
    const detail = detailByRunId.get(runId);
    if (detail === undefined) {
      throw new Error("Not found");
    }
    return detail;
  });

  vi.spyOn(client.runs, "score").mockResolvedValue({
    id: "score-new",
    run_id: "run-1",
    scorer_name: "human_preference",
    value: 1.0,
    scorer_metadata: null,
    created_at: "2026-04-07T12:00:00Z",
  });
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unit tests for diff helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe("computeLineDiff", () => {
  it("returns same lines for identical strings", () => {
    expect(computeLineDiff("hello\nworld", "hello\nworld")).toEqual([
      { type: "same", text: "hello" },
      { type: "same", text: "world" },
    ]);
  });

  it("detects added lines", () => {
    expect(computeLineDiff("hello", "hello\nworld")).toEqual([
      { type: "same", text: "hello" },
      { type: "added", text: "world" },
    ]);
  });

  it("detects removed lines", () => {
    expect(computeLineDiff("hello\nworld", "hello")).toEqual([
      { type: "same", text: "hello" },
      { type: "removed", text: "world" },
    ]);
  });

  it("detects changed lines", () => {
    // A replaced line shows up as one removal plus one addition.
    const diff = computeLineDiff("hello\nfoo", "hello\nbar");
    const countOf = (kind: string) =>
      diff.filter((line) => line.type === kind).length;

    expect(countOf("same")).toBe(1);
    expect(countOf("removed")).toBe(1);
    expect(countOf("added")).toBe(1);
  });

  it("handles empty strings", () => {
    // Splitting "" on newlines yields a single empty line on each side.
    expect(computeLineDiff("", "")).toEqual([{ type: "same", text: "" }]);
  });
});
|
||||
|
||||
describe("computeConfigDiff", () => {
  it("returns same for identical configs", () => {
    const entries = computeConfigDiff({ a: 1, b: 2 }, { a: 1, b: 2 });
    const hasDifference = entries.some((entry) => entry.type !== "same");
    expect(hasDifference).toBe(false);
  });

  it("detects changed values", () => {
    expect(computeConfigDiff({ a: 1 }, { a: 2 })).toEqual([
      { key: "a", type: "changed", leftValue: 1, rightValue: 2 },
    ]);
  });

  it("detects added keys", () => {
    expect(computeConfigDiff({}, { a: 1 })).toEqual([
      { key: "a", type: "added", rightValue: 1 },
    ]);
  });

  it("detects removed keys", () => {
    expect(computeConfigDiff({ a: 1 }, {})).toEqual([
      { key: "a", type: "removed", leftValue: 1 },
    ]);
  });

  it("handles mixed changes", () => {
    const labels = computeConfigDiff(
      { a: 1, b: 2, c: 3 },
      { a: 1, b: 99, d: 4 },
    ).map((entry) => `${entry.key}:${entry.type}`);

    expect(labels).toEqual(
      expect.arrayContaining(["a:same", "b:changed", "c:removed", "d:added"]),
    );
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Component tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe("ComparePage", () => {
  type User = ReturnType<typeof userEvent.setup>;
  type Side = "left" | "right";

  beforeEach(() => {
    vi.restoreAllMocks();
    mockNavigate.mockReset();
  });

  /** Resolves once the experiment dropdowns have rendered (experiments loaded). */
  async function waitForSelectors(): Promise<void> {
    await waitFor(() => {
      expect(
        screen.getByTestId("left-experiment-select"),
      ).toBeInTheDocument();
    });
  }

  /** Picks an experiment on one side and waits until its run dropdown has real options. */
  async function chooseExperiment(
    user: User,
    side: Side,
    experimentId: string,
  ): Promise<void> {
    await user.selectOptions(
      screen.getByTestId(`${side}-experiment-select`),
      experimentId,
    );
    await waitFor(() => {
      const select = screen.getByTestId(
        `${side}-run-select`,
      ) as HTMLSelectElement;
      // Should have options beyond the placeholder
      expect(select.options.length).toBeGreaterThan(1);
    });
  }

  /** Picks an experiment and then a run on one side of the comparison. */
  async function chooseRun(
    user: User,
    side: Side,
    experimentId: string,
    runId: string,
  ): Promise<void> {
    await chooseExperiment(user, side, experimentId);
    await user.selectOptions(screen.getByTestId(`${side}-run-select`), runId);
  }

  /**
   * Renders the page with the default mocks and selects run-1 on the left and
   * run-2 on the right (both from exp-1). Returns the user-event session so the
   * caller can keep interacting.
   */
  async function renderWithBothRunsSelected(): Promise<User> {
    setupDefaultMocks();
    const user = userEvent.setup();
    renderCompare();

    await waitForSelectors();
    await chooseRun(user, "left", "exp-1", "run-1");
    await chooseRun(user, "right", "exp-1", "run-2");
    return user;
  }

  it("shows loading state initially", () => {
    // A promise that never settles keeps the page in its loading state.
    vi.spyOn(client.experiments, "list").mockImplementation(
      () => new Promise(() => {}),
    );
    renderCompare();
    expect(screen.getByText("Loading experiments…")).toBeInTheDocument();
  });

  it("shows error state when experiments fail to load", async () => {
    vi.spyOn(client.experiments, "list").mockRejectedValue(
      new Error("Network error"),
    );
    renderCompare();
    await waitFor(() => {
      expect(screen.getByTestId("compare-error")).toHaveTextContent(
        "Network error",
      );
    });
  });

  it("renders experiment dropdowns after loading", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitForSelectors();
    expect(screen.getByTestId("right-experiment-select")).toBeInTheDocument();
  });

  it("shows empty state before runs are selected", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitFor(() => {
      expect(screen.getByTestId("compare-empty")).toBeInTheDocument();
    });
  });

  it("populates run dropdown when experiment is selected", async () => {
    setupDefaultMocks();
    const user = userEvent.setup();
    renderCompare();

    await waitForSelectors();
    // chooseExperiment itself asserts that run options appeared.
    await chooseExperiment(user, "left", "exp-1");
  });

  it("shows config diff when both runs selected", async () => {
    await renderWithBothRunsSelected();

    // Config diff should appear
    await waitFor(() => {
      expect(screen.getByTestId("config-diff")).toBeInTheDocument();
    });
  });

  it("shows score comparison when both runs are selected", async () => {
    await renderWithBothRunsSelected();

    await waitFor(() => {
      expect(screen.getByTestId("score-comparison")).toBeInTheDocument();
    });
    expect(
      screen.getByTestId("score-compare-coherence"),
    ).toBeInTheDocument();
    expect(
      screen.getByTestId("score-compare-relevance"),
    ).toBeInTheDocument();
  });

  it("shows response diff when both runs are selected", async () => {
    await renderWithBothRunsSelected();

    await waitFor(() => {
      expect(screen.getByTestId("response-diff")).toBeInTheDocument();
    });
  });

  it("highlights changed config values", async () => {
    await renderWithBothRunsSelected();

    // temperature differs (0.7 vs 0.3) so should show changed
    await waitFor(() => {
      expect(screen.getByTestId("config-diff")).toBeInTheDocument();
    });
    // model is same, temperature changed
    const changedEntries = screen.getAllByTestId("config-diff-changed");
    expect(changedEntries.length).toBeGreaterThanOrEqual(1);
    const sameEntries = screen.getAllByTestId("config-diff-same");
    expect(sameEntries.length).toBeGreaterThanOrEqual(1);
  });

  it("submits human_preference score when picking a winner", async () => {
    const user = await renderWithBothRunsSelected();

    await waitFor(() => {
      expect(screen.getByTestId("pick-left")).toBeInTheDocument();
    });

    await user.click(screen.getByTestId("pick-left"));

    // client.runs.score is the spy installed by setupDefaultMocks.
    await waitFor(() => {
      expect(client.runs.score).toHaveBeenCalledWith("run-1", {
        scorer_name: "human_preference",
        value: 1.0,
        metadata: { compared_against: "run-2", comparison_winner: true },
      });
    });

    await waitFor(() => {
      expect(screen.getByTestId("pick-left")).toHaveTextContent(
        "Run A Wins",
      );
    });
  });

  it("can pick Run B as winner", async () => {
    const user = await renderWithBothRunsSelected();

    await waitFor(() => {
      expect(screen.getByTestId("pick-right")).toBeInTheDocument();
    });

    await user.click(screen.getByTestId("pick-right"));

    // client.runs.score is the spy installed by setupDefaultMocks.
    await waitFor(() => {
      expect(client.runs.score).toHaveBeenCalledWith("run-2", {
        scorer_name: "human_preference",
        value: 1.0,
        metadata: { compared_against: "run-1", comparison_winner: true },
      });
    });

    await waitFor(() => {
      expect(screen.getByTestId("pick-right")).toHaveTextContent(
        "Run B Wins",
      );
    });
  });

  it("renders run detail cards when both runs selected", async () => {
    await renderWithBothRunsSelected();

    await waitFor(() => {
      const cards = screen.getAllByTestId("run-card");
      expect(cards.length).toBe(2);
    });
  });

  it("resets winner when changing run selection", async () => {
    const user = await renderWithBothRunsSelected();

    // Pick a winner
    await waitFor(() => {
      expect(screen.getByTestId("pick-left")).toBeInTheDocument();
    });
    await user.click(screen.getByTestId("pick-left"));
    await waitFor(() => {
      expect(screen.getByTestId("pick-left")).toHaveTextContent(
        "Run A Wins",
      );
    });

    // Change left run — winner should reset
    await user.selectOptions(screen.getByTestId("left-run-select"), "run-2");
    await waitFor(() => {
      expect(screen.getByTestId("pick-left")).toHaveTextContent("Pick Run A");
    });
  });

  it("disables run select when no experiment is chosen", async () => {
    setupDefaultMocks();
    renderCompare();

    await waitForSelectors();

    const leftRunSelect = screen.getByTestId(
      "left-run-select",
    ) as HTMLSelectElement;
    expect(leftRunSelect.disabled).toBe(true);
  });

  it("shows diff lines with correct types in response diff", async () => {
    await renderWithBothRunsSelected();

    await waitFor(() => {
      expect(screen.getByTestId("response-diff")).toBeInTheDocument();
    });

    // The responses differ so we should have some diff lines
    const sameLines = screen.getAllByTestId("diff-line-same");
    expect(sameLines.length).toBeGreaterThan(0);
    // At least one removed and one added line expected
    const removedLines = screen.getAllByTestId("diff-line-removed");
    expect(removedLines.length).toBeGreaterThan(0);
    const addedLines = screen.getAllByTestId("diff-line-added");
    expect(addedLines.length).toBeGreaterThan(0);
  });

  it("renders page title", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitFor(() => {
      expect(screen.getByText("Compare Runs")).toBeInTheDocument();
    });
  });
});
|
||||
|
|
@ -1,8 +1,769 @@
|
|||
export default function ComparePage() {
|
||||
import { useState, useEffect, useMemo, useCallback } from "react";
|
||||
import { useSearchParams } from "react-router-dom";
|
||||
import {
|
||||
experiments as experimentsApi,
|
||||
runs as runsApi,
|
||||
} from "../api/client";
|
||||
import type {
|
||||
ExperimentResponse,
|
||||
ExperimentListResponse,
|
||||
RunDetailResponse,
|
||||
RunResponse,
|
||||
RunListResponse,
|
||||
} from "../api/client";
|
||||
import RunCard from "../components/RunCard";
|
||||
import type { RunCardData } from "../components/RunCard";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Diff helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** One line of a line-level diff, tagged with how it relates to the two inputs. */
export interface DiffLine {
  /** "same": in both inputs; "added": only in `b`; "removed": only in `a`. */
  type: "same" | "added" | "removed";
  /** The line's text, without its trailing newline. */
  text: string;
}

/**
 * Line-level diff between two strings.
 *
 * Both inputs are split on "\n" and aligned with a dynamic-programming
 * longest-common-subsequence (LCS) table, which is then walked backwards to
 * emit the diff. Time and memory are O(m * n) in the two line counts.
 *
 * @param a - The "before" text; lines found only here are reported as "removed".
 * @param b - The "after" text; lines found only here are reported as "added".
 * @returns Diff lines in reading order. Where a line was replaced, the removal
 *   precedes the addition. An empty string counts as one empty line, so
 *   diffing "" against "" yields a single "same" entry.
 */
export function computeLineDiff(a: string, b: string): DiffLine[] {
  // Identical inputs align line-for-line; skip building the O(m * n) table.
  if (a === b) {
    return a.split("\n").map((text): DiffLine => ({ type: "same", text }));
  }

  const aLines = a.split("\n");
  const bLines = b.split("\n");
  const m = aLines.length;
  const n = bLines.length;

  // dp[i][j] = LCS length of the first i lines of `a` and the first j lines of `b`.
  const dp: number[][] = Array.from({ length: m + 1 }, () =>
    new Array<number>(n + 1).fill(0),
  );

  for (let i = 1; i <= m; i++) {
    for (let j = 1; j <= n; j++) {
      if (aLines[i - 1] === bLines[j - 1]) {
        dp[i][j] = dp[i - 1][j - 1] + 1;
      } else {
        dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
      }
    }
  }

  // Walk back from the bottom-right corner; entries come out last-line-first.
  const reversed: DiffLine[] = [];
  let i = m;
  let j = n;

  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && aLines[i - 1] === bLines[j - 1]) {
      reversed.push({ type: "same", text: aLines[i - 1] });
      i--;
      j--;
    } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
      // On ties take the addition first here so that, once reversed, the
      // removal is listed before the addition.
      reversed.push({ type: "added", text: bLines[j - 1] });
      j--;
    } else {
      reversed.push({ type: "removed", text: aLines[i - 1] });
      i--;
    }
  }

  return reversed.reverse();
}
|
||||
|
||||
/** One top-level key of a config comparison, with its change status. */
export interface ConfigDiffEntry {
  key: string;
  /**
   * "same"/"changed": key present on both sides; "added": only on the right;
   * "removed": only on the left.
   */
  type: "same" | "changed" | "added" | "removed";
  /** Value on the left side; omitted for "added" entries. */
  leftValue?: unknown;
  /** Value on the right side; omitted for "removed" entries. */
  rightValue?: unknown;
}

/** True when `key` is an own property of `obj` (inherited properties do not count). */
function hasOwnKey(obj: Record<string, unknown>, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * JSON-serializes a value with object keys sorted at every nesting level, so
 * two values that differ only in property order produce the same string.
 * Array order is preserved. Returns undefined for values JSON cannot
 * represent (e.g. `undefined` or a function), matching JSON.stringify.
 */
function canonicalJson(value: unknown): string | undefined {
  return JSON.stringify(value, (_key: string, nested: unknown): unknown => {
    if (nested === null || typeof nested !== "object" || Array.isArray(nested)) {
      return nested;
    }
    const source = nested as Record<string, unknown>;
    // Null prototype so a key such as "__proto__" is stored as plain data.
    const ordered: Record<string, unknown> = Object.create(null);
    for (const name of Object.keys(source).sort()) {
      ordered[name] = source[name];
    }
    return ordered;
  });
}

/**
 * Compute key-level diff between two config objects.
 *
 * Only top-level keys are compared. Values are compared by their canonical
 * JSON form, so nested objects are equal regardless of property order.
 *
 * @param left - The "before" config.
 * @param right - The "after" config.
 * @returns One entry per key found on either side, sorted by key.
 */
export function computeConfigDiff(
  left: Record<string, unknown>,
  right: Record<string, unknown>,
): ConfigDiffEntry[] {
  const allKeys = Array.from(
    new Set([...Object.keys(left), ...Object.keys(right)]),
  ).sort();

  return allKeys.map((key): ConfigDiffEntry => {
    // Own-property checks: `key in obj` would also match inherited names such
    // as "toString" or "constructor" and misreport one-sided keys as changed.
    const inLeft = hasOwnKey(left, key);
    const inRight = hasOwnKey(right, key);

    if (inLeft && inRight) {
      const same = canonicalJson(left[key]) === canonicalJson(right[key]);
      return {
        key,
        type: same ? "same" : "changed",
        leftValue: left[key],
        rightValue: right[key],
      };
    }
    if (inLeft) {
      return { key, type: "removed", leftValue: left[key] };
    }
    return { key, type: "added", rightValue: right[key] };
  });
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sub-components
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Cascading experiment → run selector for one side of the comparison.
 *
 * Renders a heading with `label`, an experiment dropdown, and a run dropdown.
 * The run dropdown is disabled until an experiment is chosen and while runs
 * are loading. Both selects are controlled; changes are reported through the
 * `onExperimentChange` / `onRunChange` callbacks with the selected option value
 * (an empty string for the placeholder option).
 */
function RunSelector({
  label,
  testIdPrefix,
  experimentId,
  selectedRunId,
  onExperimentChange,
  onRunChange,
  allExperiments,
  availableRuns,
  loadingRuns,
}: {
  label: string;
  testIdPrefix: string;
  experimentId: string;
  selectedRunId: string;
  onExperimentChange: (id: string) => void;
  onRunChange: (id: string) => void;
  allExperiments: ExperimentResponse[];
  availableRuns: RunResponse[];
  loadingRuns: boolean;
}) {
  // The stray page-level wrapper/heading markup that used to open here was
  // never closed (unbalanced JSX) and does not belong in a per-side selector.
  return (
    <div className="space-y-2">
      <h3 className="text-sm font-semibold text-slate-600 dark:text-slate-400">
        {label}
      </h3>
      <select
        data-testid={`${testIdPrefix}-experiment-select`}
        value={experimentId}
        onChange={(e) => onExperimentChange(e.target.value)}
        className="w-full rounded-lg border border-slate-300 dark:border-slate-600 bg-white dark:bg-slate-800 px-3 py-2 text-sm text-slate-900 dark:text-white"
      >
        <option value="">Select experiment…</option>
        {allExperiments.map((exp) => (
          <option key={exp.id} value={exp.id}>
            {exp.name}
          </option>
        ))}
      </select>
      <select
        data-testid={`${testIdPrefix}-run-select`}
        value={selectedRunId}
        onChange={(e) => onRunChange(e.target.value)}
        disabled={!experimentId || loadingRuns}
        className="w-full rounded-lg border border-slate-300 dark:border-slate-600 bg-white dark:bg-slate-800 px-3 py-2 text-sm text-slate-900 dark:text-white disabled:opacity-50"
      >
        <option value="">
          {loadingRuns ? "Loading runs…" : "Select run…"}
        </option>
        {availableRuns.map((run) => (
          <option key={run.id} value={run.id}>
            {/* Short config hash, status, and duration in seconds when known */}
            {run.config_hash.slice(0, 8)} — {run.status}
            {run.duration_ms != null
              ? ` (${(run.duration_ms / 1000).toFixed(1)}s)`
              : ""}
          </option>
        ))}
      </select>
    </div>
  );
}
|
||||
|
||||
/**
 * Renders config diff entries as a list of color-coded rows: yellow for
 * changed, green for added, red for removed, neutral for unchanged keys.
 * Shows a placeholder message when there are no entries.
 */
function ConfigDiffView({ entries }: { entries: ConfigDiffEntry[] }) {
  if (entries.length === 0) {
    return (
      <p className="text-sm text-slate-400 dark:text-slate-500">
        No config to compare.
      </p>
    );
  }

  // Row background/border classes per entry type.
  const rowClasses: Record<ConfigDiffEntry["type"], string> = {
    changed:
      "bg-yellow-50 dark:bg-yellow-900/20 border-yellow-200 dark:border-yellow-800",
    added:
      "bg-green-50 dark:bg-green-900/20 border-green-200 dark:border-green-800",
    removed: "bg-red-50 dark:bg-red-900/20 border-red-200 dark:border-red-800",
    same: "bg-white dark:bg-slate-800 border-slate-200 dark:border-slate-700",
  };

  // Right-hand value cell: old → new for changes, signed value for one-sided
  // keys, plain value otherwise.
  const renderValues = (item: ConfigDiffEntry) => {
    switch (item.type) {
      case "changed":
        return (
          <>
            <span className="text-red-600 dark:text-red-400 line-through">
              {JSON.stringify(item.leftValue)}
            </span>
            <span className="text-slate-400">→</span>
            <span className="text-green-600 dark:text-green-400">
              {JSON.stringify(item.rightValue)}
            </span>
          </>
        );
      case "added":
        return (
          <span className="text-green-600 dark:text-green-400">
            + {JSON.stringify(item.rightValue)}
          </span>
        );
      case "removed":
        return (
          <span className="text-red-600 dark:text-red-400">
            - {JSON.stringify(item.leftValue)}
          </span>
        );
      default:
        return (
          <span className="text-slate-500 dark:text-slate-400">
            {JSON.stringify(item.leftValue)}
          </span>
        );
    }
  };

  const rows = entries.map((item) => (
    <div
      key={item.key}
      data-testid={`config-diff-${item.type}`}
      className={`flex items-center justify-between rounded border px-3 py-1.5 text-sm ${rowClasses[item.type]}`}
    >
      <span className="font-mono text-slate-700 dark:text-slate-300">
        {item.key}
      </span>
      <div className="flex items-center gap-3 text-xs font-mono">
        {renderValues(item)}
      </div>
    </div>
  ));

  return (
    <div className="space-y-1" data-testid="config-diff">
      {rows}
    </div>
  );
}
|
||||
|
||||
/**
 * Renders a line diff inside a scrollable <pre>: added lines are green with a
 * "+ " prefix, removed lines red with a "- " prefix, unchanged lines neutral.
 * Shows a placeholder message when there are no lines.
 */
function ResponseDiffView({ lines }: { lines: DiffLine[] }) {
  if (lines.length === 0) {
    return (
      <p className="text-sm text-slate-400 dark:text-slate-500">
        No responses to compare.
      </p>
    );
  }

  // Styling and gutter prefix for a diff line of the given type.
  const presentation = (kind: DiffLine["type"]) => {
    switch (kind) {
      case "added":
        return {
          cls: "bg-green-100 dark:bg-green-900/30 text-green-800 dark:text-green-300",
          prefix: "+ ",
        };
      case "removed":
        return {
          cls: "bg-red-100 dark:bg-red-900/30 text-red-800 dark:text-red-300",
          prefix: "- ",
        };
      default:
        return { cls: "text-slate-700 dark:text-slate-300", prefix: " " };
    }
  };

  const renderedLines = lines.map((entry, position) => {
    const { cls, prefix } = presentation(entry.type);
    return (
      <div
        key={position}
        data-testid={`diff-line-${entry.type}`}
        className={`${cls} whitespace-pre-wrap`}
      >
        {prefix}
        {entry.text}
      </div>
    );
  });

  return (
    <pre
      data-testid="response-diff"
      className="text-xs bg-slate-50 dark:bg-slate-900 rounded-lg p-3 overflow-x-auto max-h-96 overflow-y-auto"
    >
      {renderedLines}
    </pre>
  );
}
|
||||
|
||||
/**
 * One scorer's comparison row: the scorer name, both numeric values (three
 * decimals), and two bars overlaid on a shared track — indigo for the left
 * run, emerald for the right run. Bar widths are the score × 100, clamped to
 * the 0–100% range.
 */
function ScoreComparisonBar({
  name,
  leftValue,
  rightValue,
}: {
  name: string;
  leftValue: number;
  rightValue: number;
}) {
  // Convert a score to a CSS width, clamped so the bar never leaves the track.
  const toWidth = (score: number) =>
    `${Math.max(0, Math.min(100, score * 100))}%`;
  const leftWidth = toWidth(leftValue);
  const rightWidth = toWidth(rightValue);

  const header = (
    <div className="flex items-center justify-between text-sm">
      <span className="capitalize text-slate-600 dark:text-slate-400">
        {name}
      </span>
      <div className="flex items-center gap-3 tabular-nums text-xs">
        <span className="text-indigo-600 dark:text-indigo-400">
          {leftValue.toFixed(3)}
        </span>
        <span className="text-slate-400">vs</span>
        <span className="text-emerald-600 dark:text-emerald-400">
          {rightValue.toFixed(3)}
        </span>
      </div>
    </div>
  );

  return (
    <div className="space-y-1" data-testid={`score-compare-${name}`}>
      {header}
      <div className="relative h-3 rounded-full bg-slate-200 dark:bg-slate-700 overflow-hidden">
        <div
          data-testid={`score-bar-left-${name}`}
          className="absolute inset-y-0 left-0 bg-indigo-500/70 rounded-full transition-all duration-300"
          style={{ width: leftWidth }}
        />
        <div
          data-testid={`score-bar-right-${name}`}
          className="absolute inset-y-0 left-0 bg-emerald-500/50 rounded-full transition-all duration-300"
          style={{ width: rightWidth }}
        />
      </div>
    </div>
  );
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ComparePage Component
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Adapts a run detail API response to the shape RunCard consumes.
 *
 * Missing `config`, `scores`, and `stage_results` fall back to empty values,
 * the summary is the first 12 characters of the config hash (or "unknown"),
 * and `cached` is derived from the run status.
 */
function toRunCardData(run: RunDetailResponse): RunCardData {
  const summary = run.config_hash?.slice(0, 12) ?? "unknown";

  const scores = (run.scores ?? []).map(({ scorer_name, value }) => ({
    scorer_name,
    value,
  }));

  const stageResults = (run.stage_results ?? []).map((stage) => {
    const {
      stage_index,
      prompt_sent,
      response_raw,
      model_used,
      parameters,
      tokens_in,
      tokens_out,
      latency_ms,
    } = stage;
    return {
      stage_index,
      prompt_sent,
      response_raw,
      model_used,
      parameters,
      tokens_in,
      tokens_out,
      latency_ms,
    };
  });

  return {
    run_id: run.id,
    config_summary: summary,
    config: run.config ?? {},
    status: run.status,
    cached: run.status === "cached",
    duration_ms: run.duration_ms,
    tokens_in: run.tokens_in,
    tokens_out: run.tokens_out,
    scores,
    stage_results: stageResults,
  };
}
|
||||
|
||||
/**
 * Compare page: side-by-side comparison of two runs.
 *
 * Each side has an experiment → run selector. Selections are initialised from
 * and mirrored back to the `leftExp` / `leftRun` / `rightExp` / `rightRun`
 * URL search params. Once both run details are loaded the page shows the
 * config diff, per-scorer score overlay, a line diff of the first stage's raw
 * response, both full run cards, and "pick winner" buttons that submit a
 * `human_preference` score for the chosen run.
 */
export default function ComparePage() {
  const [searchParams, setSearchParams] = useSearchParams();

  // State
  const [allExperiments, setAllExperiments] = useState<ExperimentResponse[]>(
    [],
  );
  const [loadingExperiments, setLoadingExperiments] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Left side
  const [leftExpId, setLeftExpId] = useState(
    searchParams.get("leftExp") ?? "",
  );
  const [leftRunId, setLeftRunId] = useState(
    searchParams.get("leftRun") ?? "",
  );
  const [leftRuns, setLeftRuns] = useState<RunResponse[]>([]);
  const [leftRunDetail, setLeftRunDetail] = useState<RunDetailResponse | null>(
    null,
  );
  const [loadingLeftRuns, setLoadingLeftRuns] = useState(false);

  // Right side
  const [rightExpId, setRightExpId] = useState(
    searchParams.get("rightExp") ?? "",
  );
  const [rightRunId, setRightRunId] = useState(
    searchParams.get("rightRun") ?? "",
  );
  const [rightRuns, setRightRuns] = useState<RunResponse[]>([]);
  const [rightRunDetail, setRightRunDetail] =
    useState<RunDetailResponse | null>(null);
  const [loadingRightRuns, setLoadingRightRuns] = useState(false);

  // Winner pick
  const [winner, setWinner] = useState<"left" | "right" | null>(null);
  const [pickSaved, setPickSaved] = useState(false);

  // Load experiments on mount
  useEffect(() => {
    let cancelled = false;
    setLoadingExperiments(true);
    experimentsApi
      .list()
      .then((resp: ExperimentListResponse) => {
        if (!cancelled) setAllExperiments(resp.items);
      })
      .catch((err: Error) => {
        if (!cancelled) setError(err.message);
      })
      .finally(() => {
        if (!cancelled) setLoadingExperiments(false);
      });
    return () => {
      cancelled = true;
    };
  }, []);

  // Load runs when experiment changes (left)
  useEffect(() => {
    if (!leftExpId) {
      setLeftRuns([]);
      setLeftRunId("");
      setLeftRunDetail(null);
      return;
    }
    let cancelled = false;
    setLoadingLeftRuns(true);
    runsApi
      .list(leftExpId)
      .then((resp: RunListResponse) => {
        if (!cancelled) setLeftRuns(resp.items);
      })
      .catch(() => {
        if (!cancelled) setLeftRuns([]);
      })
      .finally(() => {
        if (!cancelled) setLoadingLeftRuns(false);
      });
    return () => {
      cancelled = true;
    };
  }, [leftExpId]);

  // Load runs when experiment changes (right)
  useEffect(() => {
    if (!rightExpId) {
      setRightRuns([]);
      setRightRunId("");
      setRightRunDetail(null);
      return;
    }
    let cancelled = false;
    setLoadingRightRuns(true);
    runsApi
      .list(rightExpId)
      .then((resp: RunListResponse) => {
        if (!cancelled) setRightRuns(resp.items);
      })
      .catch(() => {
        if (!cancelled) setRightRuns([]);
      })
      .finally(() => {
        if (!cancelled) setLoadingRightRuns(false);
      });
    return () => {
      cancelled = true;
    };
  }, [rightExpId]);

  // Load run detail when run selected (left)
  useEffect(() => {
    // Drop the previous detail immediately: otherwise the old run stays on
    // screen while the new one loads, and "Pick" would score a run the user
    // is not actually looking at.
    setLeftRunDetail(null);
    if (!leftRunId) return;
    let cancelled = false;
    runsApi
      .get(leftRunId)
      .then((resp: RunDetailResponse) => {
        if (!cancelled) setLeftRunDetail(resp);
      })
      .catch(() => {
        if (!cancelled) setLeftRunDetail(null);
      });
    return () => {
      cancelled = true;
    };
  }, [leftRunId]);

  // Load run detail when run selected (right)
  useEffect(() => {
    // Same stale-detail guard as the left side.
    setRightRunDetail(null);
    if (!rightRunId) return;
    let cancelled = false;
    runsApi
      .get(rightRunId)
      .then((resp: RunDetailResponse) => {
        if (!cancelled) setRightRunDetail(resp);
      })
      .catch(() => {
        if (!cancelled) setRightRunDetail(null);
      });
    return () => {
      cancelled = true;
    };
  }, [rightRunId]);

  // Sync URL params
  useEffect(() => {
    const params: Record<string, string> = {};
    if (leftExpId) params.leftExp = leftExpId;
    if (leftRunId) params.leftRun = leftRunId;
    if (rightExpId) params.rightExp = rightExpId;
    if (rightRunId) params.rightRun = rightRunId;
    setSearchParams(params, { replace: true });
  }, [leftExpId, leftRunId, rightExpId, rightRunId, setSearchParams]);

  // Hide the "Saved!" indicator 2s after it appears. Living in an effect lets
  // the timer be cleared on unmount instead of firing on a dead component.
  useEffect(() => {
    if (!pickSaved) return;
    const timer = setTimeout(() => setPickSaved(false), 2000);
    return () => {
      clearTimeout(timer);
    };
  }, [pickSaved]);

  // Config diff
  const configDiff = useMemo(() => {
    if (!leftRunDetail || !rightRunDetail) return [];
    return computeConfigDiff(
      leftRunDetail.config ?? {},
      rightRunDetail.config ?? {},
    );
  }, [leftRunDetail, rightRunDetail]);

  // Response diff (first stage response)
  const responseDiff = useMemo(() => {
    if (!leftRunDetail || !rightRunDetail) return [];
    const leftResponse =
      leftRunDetail.stage_results?.[0]?.response_raw ?? "";
    const rightResponse =
      rightRunDetail.stage_results?.[0]?.response_raw ?? "";
    return computeLineDiff(leftResponse, rightResponse);
  }, [leftRunDetail, rightRunDetail]);

  // Score comparison: union of scorer names from both runs, sorted by name.
  // A scorer missing on one side is shown as 0 for that side.
  const scoreComparison = useMemo(() => {
    if (!leftRunDetail || !rightRunDetail) return [];
    const leftScores = new Map(
      (leftRunDetail.scores ?? []).map((s) => [s.scorer_name, s.value]),
    );
    const rightScores = new Map(
      (rightRunDetail.scores ?? []).map((s) => [s.scorer_name, s.value]),
    );
    const allNames = Array.from(
      new Set([...leftScores.keys(), ...rightScores.keys()]),
    ).sort();
    return allNames.map((name) => ({
      name,
      left: leftScores.get(name) ?? 0,
      right: rightScores.get(name) ?? 0,
    }));
  }, [leftRunDetail, rightRunDetail]);

  const handlePickWinner = useCallback(
    (side: "left" | "right") => {
      const winnerRunId = side === "left" ? leftRunId : rightRunId;
      const loserRunId = side === "left" ? rightRunId : leftRunId;
      if (!winnerRunId || !loserRunId) return;

      // Submit a human_preference score for the winner
      runsApi
        .score(winnerRunId, {
          scorer_name: "human_preference",
          value: 1.0,
          metadata: { compared_against: loserRunId, comparison_winner: true },
        })
        .then(() => {
          setWinner(side);
          setPickSaved(true);
        })
        .catch(() => {
          // Still show the pick locally even if API fails, but make sure a
          // "Saved!" left over from an earlier pick is not shown for it.
          setWinner(side);
          setPickSaved(false);
        });
    },
    [leftRunId, rightRunId],
  );

  const bothSelected = leftRunDetail !== null && rightRunDetail !== null;

  if (loadingExperiments) {
    return (
      <div className="flex items-center justify-center min-h-[50vh]">
        <p className="text-slate-500 dark:text-slate-400">
          Loading experiments…
        </p>
      </div>
    );
  }

  if (error) {
    return (
      <div className="flex items-center justify-center min-h-[50vh]">
        <p className="text-red-500" data-testid="compare-error">
          {error}
        </p>
      </div>
    );
  }

  return (
    <div className="p-6 max-w-7xl mx-auto space-y-6">
      <h1 className="text-2xl font-bold text-slate-900 dark:text-white">
        Compare Runs
      </h1>

      {/* Run selectors — two columns */}
      <div className="grid grid-cols-1 md:grid-cols-2 gap-6">
        <div className="rounded-xl border border-indigo-200 dark:border-indigo-800 bg-indigo-50/50 dark:bg-indigo-950/20 p-4">
          <RunSelector
            label="Run A"
            testIdPrefix="left"
            experimentId={leftExpId}
            selectedRunId={leftRunId}
            onExperimentChange={(id) => {
              setLeftExpId(id);
              setLeftRunId("");
              setLeftRunDetail(null);
              setWinner(null);
            }}
            onRunChange={(id) => {
              setLeftRunId(id);
              setWinner(null);
            }}
            allExperiments={allExperiments}
            availableRuns={leftRuns}
            loadingRuns={loadingLeftRuns}
          />
        </div>
        <div className="rounded-xl border border-emerald-200 dark:border-emerald-800 bg-emerald-50/50 dark:bg-emerald-950/20 p-4">
          <RunSelector
            label="Run B"
            testIdPrefix="right"
            experimentId={rightExpId}
            selectedRunId={rightRunId}
            onExperimentChange={(id) => {
              setRightExpId(id);
              setRightRunId("");
              setRightRunDetail(null);
              setWinner(null);
            }}
            onRunChange={(id) => {
              setRightRunId(id);
              setWinner(null);
            }}
            allExperiments={allExperiments}
            availableRuns={rightRuns}
            loadingRuns={loadingRightRuns}
          />
        </div>
      </div>

      {/* Comparison sections — the explicit null checks narrow both details */}
      {leftRunDetail && rightRunDetail && (
        <div className="space-y-6">
          {/* Config diff */}
          <section>
            <h2 className="text-lg font-semibold text-slate-800 dark:text-slate-200 mb-3">
              Config Diff
            </h2>
            <ConfigDiffView entries={configDiff} />
          </section>

          {/* Score comparison */}
          {scoreComparison.length > 0 && (
            <section>
              <h2 className="text-lg font-semibold text-slate-800 dark:text-slate-200 mb-3">
                Score Comparison
              </h2>
              <div className="flex items-center gap-4 mb-2 text-xs text-slate-500 dark:text-slate-400">
                <span className="flex items-center gap-1.5">
                  <span className="w-3 h-3 rounded bg-indigo-500/70" /> Run A
                </span>
                <span className="flex items-center gap-1.5">
                  <span className="w-3 h-3 rounded bg-emerald-500/50" /> Run B
                </span>
              </div>
              <div
                className="space-y-3"
                data-testid="score-comparison"
              >
                {scoreComparison.map((sc) => (
                  <ScoreComparisonBar
                    key={sc.name}
                    name={sc.name}
                    leftValue={sc.left}
                    rightValue={sc.right}
                  />
                ))}
              </div>
            </section>
          )}

          {/* Response diff */}
          <section>
            <h2 className="text-lg font-semibold text-slate-800 dark:text-slate-200 mb-3">
              Response Diff
            </h2>
            <ResponseDiffView lines={responseDiff} />
          </section>

          {/* Run detail cards side by side */}
          <section>
            <h2 className="text-lg font-semibold text-slate-800 dark:text-slate-200 mb-3">
              Full Run Details
            </h2>
            <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
              <RunCard run={toRunCardData(leftRunDetail)} defaultExpanded />
              <RunCard run={toRunCardData(rightRunDetail)} defaultExpanded />
            </div>
          </section>

          {/* Pick winner */}
          <section className="flex items-center justify-center gap-4 py-4">
            <button
              type="button"
              data-testid="pick-left"
              onClick={() => handlePickWinner("left")}
              className={`px-6 py-2.5 rounded-lg font-medium text-sm transition-all ${
                winner === "left"
                  ? "bg-indigo-600 text-white ring-2 ring-indigo-400 shadow-lg"
                  : "bg-indigo-100 dark:bg-indigo-900/40 text-indigo-700 dark:text-indigo-300 hover:bg-indigo-200 dark:hover:bg-indigo-900/60"
              }`}
            >
              {winner === "left" ? "✓ Run A Wins" : "Pick Run A"}
            </button>
            <span className="text-slate-400 text-sm">or</span>
            <button
              type="button"
              data-testid="pick-right"
              onClick={() => handlePickWinner("right")}
              className={`px-6 py-2.5 rounded-lg font-medium text-sm transition-all ${
                winner === "right"
                  ? "bg-emerald-600 text-white ring-2 ring-emerald-400 shadow-lg"
                  : "bg-emerald-100 dark:bg-emerald-900/40 text-emerald-700 dark:text-emerald-300 hover:bg-emerald-200 dark:hover:bg-emerald-900/60"
              }`}
            >
              {winner === "right" ? "✓ Run B Wins" : "Pick Run B"}
            </button>
            {pickSaved && (
              <span
                data-testid="pick-saved"
                className="text-sm text-green-600 dark:text-green-400 animate-pulse"
              >
                Saved!
              </span>
            )}
          </section>
        </div>
      )}

      {/* Empty state when not both selected (loading already returned above) */}
      {!bothSelected && (
        <div className="flex items-center justify-center min-h-[30vh]">
          <p
            className="text-slate-400 dark:text-slate-500 text-center"
            data-testid="compare-empty"
          >
            Select an experiment and run on each side to compare.
          </p>
        </div>
      )}
    </div>
  );
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue