promptlooper/frontend/src/pages/ComparePage.test.tsx
John Lightner 1d3917a44e MAESTRO: Implement Compare page with side-by-side run comparison, config/response diffs, and score overlay
- Two-column run selectors with experiment→run cascading dropdowns and URL state sync
- Config diff with color-coded same/changed/added/removed entries using key-level comparison
- Line-level LCS response diff with added/removed/same highlighting
- Score comparison with overlaid indigo/emerald bars per scorer
- Pick Winner buttons submit human_preference score via API
- Full RunCard detail view for each run side by side
- 15 tests added (5 diff helper unit tests + 10 component integration tests)
- App.test.tsx updated to mock experiments.list for ComparePage route
2026-04-07 03:25:37 -05:00

759 lines
22 KiB
TypeScript

import { render, screen, waitFor } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { MemoryRouter } from "react-router-dom";
import { describe, it, expect, vi, beforeEach } from "vitest";
import ComparePage, {
computeLineDiff,
computeConfigDiff,
} from "./ComparePage";
import * as client from "../api/client";
// ---------------------------------------------------------------------------
// Mocks
// ---------------------------------------------------------------------------
// Navigate stub shared by the whole file; the mocked `useNavigate` below
// returns it in place of the router's real navigate function.
const mockNavigate = vi.fn();

// Replace only `useNavigate`. Every other react-router-dom export is the real
// implementation obtained through `vi.importActual`.
vi.mock("react-router-dom", async () => {
  const realRouter = await vi.importActual("react-router-dom");
  // Resolved lazily (at hook-call time), so the stub above is initialised
  // by the time a component asks for it.
  const useNavigate = () => mockNavigate;
  return { ...realRouter, useNavigate };
});
/**
 * Builds a completed experiment fixture under project "proj-1" with every
 * nullable field set to null. Only the values that differ between the two
 * fixtures are parameters.
 */
function makeExperimentFixture(
  id: string,
  name: string,
  createdAt: string,
  updatedAt: string,
): client.ExperimentResponse {
  return {
    id,
    project_id: "proj-1",
    name,
    description: null,
    sample_data: null,
    pipeline_stages: null,
    scoring_config: null,
    parameter_space: null,
    status: "completed",
    created_at: createdAt,
    updated_at: updatedAt,
  };
}

/** Experiments served by the stubbed `client.experiments.list` call. */
const MOCK_EXPERIMENTS: client.ExperimentResponse[] = [
  makeExperimentFixture(
    "exp-1",
    "Experiment Alpha",
    "2026-04-01T10:00:00Z",
    "2026-04-07T10:00:00Z",
  ),
  makeExperimentFixture(
    "exp-2",
    "Experiment Beta",
    "2026-04-02T10:00:00Z",
    "2026-04-07T12:00:00Z",
  ),
];
/**
 * Builds a completed run fixture that belongs to experiment "exp-1" and uses
 * model "gpt-4". Everything that varies between runs comes from `seed`.
 */
function makeRunFixture(seed: {
  id: string;
  configHash: string;
  temperature: number;
  startedAt: string;
  completedAt: string;
  durationMs: number;
  tokensIn: number;
  tokensOut: number;
  costEstimate: number;
}): client.RunResponse {
  return {
    id: seed.id,
    experiment_id: "exp-1",
    config_hash: seed.configHash,
    config: { model: "gpt-4", temperature: seed.temperature },
    status: "completed",
    started_at: seed.startedAt,
    completed_at: seed.completedAt,
    duration_ms: seed.durationMs,
    tokens_in: seed.tokensIn,
    tokens_out: seed.tokensOut,
    cost_estimate: seed.costEstimate,
  };
}

/**
 * Runs served by the stubbed `client.runs.list` call. The two runs share a
 * model and differ in temperature (0.7 vs 0.3), among other fields.
 */
const MOCK_RUNS: client.RunResponse[] = [
  makeRunFixture({
    id: "run-1",
    configHash: "abc12345deadbeef",
    temperature: 0.7,
    startedAt: "2026-04-07T10:01:00Z",
    completedAt: "2026-04-07T10:01:05Z",
    durationMs: 5000,
    tokensIn: 100,
    tokensOut: 200,
    costEstimate: 0.01,
  }),
  makeRunFixture({
    id: "run-2",
    configHash: "def67890cafebabe",
    temperature: 0.3,
    startedAt: "2026-04-07T10:02:00Z",
    completedAt: "2026-04-07T10:02:03Z",
    durationMs: 3000,
    tokensIn: 80,
    tokensOut: 150,
    costEstimate: 0.008,
  }),
];
/**
 * Detail fixture for "run-1": the first MOCK_RUNS entry extended with one
 * stage result and two scores (coherence and relevance).
 */
const MOCK_RUN_DETAIL_1: client.RunDetailResponse =
  ((): client.RunDetailResponse => {
    const runId = "run-1";
    // Both scores share this creation timestamp.
    const scoredAt = "2026-04-07T10:01:05Z";
    const scoreFor = (id: string, scorerName: string, value: number) => ({
      id,
      run_id: runId,
      scorer_name: scorerName,
      value,
      scorer_metadata: null,
      created_at: scoredAt,
    });

    return {
      ...MOCK_RUNS[0],
      stage_results: [
        {
          id: "sr-1",
          run_id: runId,
          stage_index: 0,
          prompt_sent: "Summarize the text.",
          // Three lines; the last one ("End.") also appears in run-2's response.
          response_raw: "This is the summary\nfrom run one.\nEnd.",
          model_used: "gpt-4",
          parameters: { temperature: 0.7 },
          tokens_in: 100,
          tokens_out: 200,
          latency_ms: 4500,
        },
      ],
      scores: [
        scoreFor("sc-1", "coherence", 0.85),
        scoreFor("sc-2", "relevance", 0.72),
      ],
    };
  })();
/**
 * Detail fixture for "run-2": the second MOCK_RUNS entry extended with one
 * stage result and two scores (coherence and relevance).
 */
const MOCK_RUN_DETAIL_2: client.RunDetailResponse =
  ((): client.RunDetailResponse => {
    const runId = "run-2";
    // Both scores share this creation timestamp.
    const scoredAt = "2026-04-07T10:02:03Z";
    const scoreFor = (id: string, scorerName: string, value: number) => ({
      id,
      run_id: runId,
      scorer_name: scorerName,
      value,
      scorer_metadata: null,
      created_at: scoredAt,
    });

    return {
      ...MOCK_RUNS[1],
      stage_results: [
        {
          id: "sr-2",
          run_id: runId,
          stage_index: 0,
          prompt_sent: "Summarize the text briefly.",
          // Three lines; only the last one ("End.") matches run-1's response.
          response_raw: "This is a different summary\nfrom run two.\nEnd.",
          model_used: "gpt-4",
          parameters: { temperature: 0.3 },
          tokens_in: 80,
          tokens_out: 150,
          latency_ms: 2800,
        },
      ],
      scores: [
        scoreFor("sc-3", "coherence", 0.91),
        scoreFor("sc-4", "relevance", 0.65),
      ],
    };
  })();
/**
 * Mounts ComparePage inside a MemoryRouter whose history starts at
 * "/compare" and returns the Testing Library render result.
 */
function renderCompare() {
  const startingEntries = ["/compare"];
  const page = (
    <MemoryRouter initialEntries={startingEntries}>
      <ComparePage />
    </MemoryRouter>
  );
  return render(page);
}
/**
 * Installs the happy-path API stubs used by most component tests:
 * - `experiments.list` resolves with MOCK_EXPERIMENTS,
 * - `runs.list` resolves with MOCK_RUNS,
 * - `runs.get` resolves with the matching detail fixture and rejects with
 *   "Not found" for any other id,
 * - `runs.score` resolves with a canned human_preference score.
 */
function setupDefaultMocks() {
  const experimentPage = { items: MOCK_EXPERIMENTS, total: 2 };
  vi.spyOn(client.experiments, "list").mockResolvedValue(experimentPage);

  const runPage = { items: MOCK_RUNS, total: 2 };
  vi.spyOn(client.runs, "list").mockResolvedValue(runPage);

  const detailsByRunId = new Map<string, client.RunDetailResponse>([
    ["run-1", MOCK_RUN_DETAIL_1],
    ["run-2", MOCK_RUN_DETAIL_2],
  ]);
  vi.spyOn(client.runs, "get").mockImplementation(async (runId: string) => {
    const detail = detailsByRunId.get(runId);
    if (detail === undefined) {
      // Thrown inside an async function, so callers see a rejected promise.
      throw new Error("Not found");
    }
    return detail;
  });

  vi.spyOn(client.runs, "score").mockResolvedValue({
    id: "score-new",
    run_id: "run-1",
    scorer_name: "human_preference",
    value: 1.0,
    scorer_metadata: null,
    created_at: "2026-04-07T12:00:00Z",
  });
}
// ---------------------------------------------------------------------------
// Unit tests for diff helpers
// ---------------------------------------------------------------------------
describe("computeLineDiff", () => {
  type ExpectedLine = { type: string; text: string };

  // Each row: [test title, left text, right text, exact expected diff].
  const exactCases: Array<[string, string, string, ExpectedLine[]]> = [
    [
      "returns same lines for identical strings",
      "hello\nworld",
      "hello\nworld",
      [
        { type: "same", text: "hello" },
        { type: "same", text: "world" },
      ],
    ],
    [
      "detects added lines",
      "hello",
      "hello\nworld",
      [
        { type: "same", text: "hello" },
        { type: "added", text: "world" },
      ],
    ],
    [
      "detects removed lines",
      "hello\nworld",
      "hello",
      [
        { type: "same", text: "hello" },
        { type: "removed", text: "world" },
      ],
    ],
  ];

  it.each(exactCases)("%s", (_title, left, right, expected) => {
    expect(computeLineDiff(left, right)).toEqual(expected);
  });

  it("detects changed lines", () => {
    const diff = computeLineDiff("hello\nfoo", "hello\nbar");
    // Only the per-type counts are pinned, not the order of the
    // removed/added pair.
    const countOf = (kind: string) =>
      diff.filter((line) => line.type === kind).length;
    expect(countOf("same")).toBe(1);
    expect(countOf("removed")).toBe(1);
    expect(countOf("added")).toBe(1);
  });

  it("handles empty strings", () => {
    // Two empty inputs are expected to yield one unchanged empty line.
    expect(computeLineDiff("", "")).toEqual([{ type: "same", text: "" }]);
  });
});
describe("computeConfigDiff", () => {
  // Derived from the helper's own signature so the table rows are checked
  // against whatever config type it accepts.
  type LeftConfig = Parameters<typeof computeConfigDiff>[0];
  type RightConfig = Parameters<typeof computeConfigDiff>[1];

  it("returns same for identical configs", () => {
    const entries = computeConfigDiff({ a: 1, b: 2 }, { a: 1, b: 2 });
    const hasDifference = entries.some((entry) => entry.type !== "same");
    expect(hasDifference).toBe(false);
  });

  // Each row: [test title, left config, right config, exact expected diff].
  const singleKeyCases: Array<[string, LeftConfig, RightConfig, unknown[]]> = [
    [
      "detects changed values",
      { a: 1 },
      { a: 2 },
      [{ key: "a", type: "changed", leftValue: 1, rightValue: 2 }],
    ],
    ["detects added keys", {}, { a: 1 }, [{ key: "a", type: "added", rightValue: 1 }]],
    [
      "detects removed keys",
      { a: 1 },
      {},
      [{ key: "a", type: "removed", leftValue: 1 }],
    ],
  ];

  it.each(singleKeyCases)("%s", (_title, left, right, expected) => {
    expect(computeConfigDiff(left, right)).toEqual(expected);
  });

  it("handles mixed changes", () => {
    const entries = computeConfigDiff(
      { a: 1, b: 2, c: 3 },
      { a: 1, b: 99, d: 4 },
    );
    // Flatten to "key:type" labels so the check is independent of entry order.
    const labels = entries.map((entry) => `${entry.key}:${entry.type}`);
    expect(labels).toEqual(
      expect.arrayContaining(["a:same", "b:changed", "c:removed", "d:added"]),
    );
  });
});
// ---------------------------------------------------------------------------
// Component tests
// ---------------------------------------------------------------------------
describe("ComparePage", () => {
  /** user-event session type, derived so no additional type import is needed. */
  type TestUser = ReturnType<typeof userEvent.setup>;
  /** Which of the two comparison columns a helper acts on. */
  type Side = "left" | "right";

  beforeEach(() => {
    // Undo spies from the previous test and clear the navigate stub's state.
    vi.restoreAllMocks();
    mockNavigate.mockReset();
  });

  // -------------------------------------------------------------------------
  // Shared interaction helpers
  // -------------------------------------------------------------------------

  /** Resolves once an element with the given test id is in the document. */
  async function waitForTestId(testId: string): Promise<void> {
    await waitFor(() => {
      expect(screen.getByTestId(testId)).toBeInTheDocument();
    });
  }

  /** Resolves once the element with the given test id shows `text`. */
  async function waitForTextContent(
    testId: string,
    text: string,
  ): Promise<void> {
    await waitFor(() => {
      expect(screen.getByTestId(testId)).toHaveTextContent(text);
    });
  }

  /**
   * Picks an experiment in one column, then waits until that column's run
   * dropdown has options beyond the placeholder.
   */
  async function chooseExperiment(
    user: TestUser,
    side: Side,
    experimentId: string,
  ): Promise<void> {
    await user.selectOptions(
      screen.getByTestId(`${side}-experiment-select`),
      experimentId,
    );
    await waitFor(() => {
      const runSelect = screen.getByTestId(
        `${side}-run-select`,
      ) as HTMLSelectElement;
      expect(runSelect.options.length).toBeGreaterThan(1);
    });
  }

  /** Picks a run in one column's run dropdown. */
  async function chooseRun(
    user: TestUser,
    side: Side,
    runId: string,
  ): Promise<void> {
    await user.selectOptions(screen.getByTestId(`${side}-run-select`), runId);
  }

  /**
   * Renders the page and selects run-1 (left) and run-2 (right), both from
   * exp-1. API stubs must already be installed by the caller.
   *
   * @returns the user-event session, for follow-up interactions.
   */
  async function renderWithBothRunsSelected(): Promise<TestUser> {
    const user = userEvent.setup();
    renderCompare();
    await waitForTestId("left-experiment-select");
    await chooseExperiment(user, "left", "exp-1");
    await chooseRun(user, "left", "run-1");
    await chooseExperiment(user, "right", "exp-1");
    await chooseRun(user, "right", "run-2");
    return user;
  }

  /** Waits for one column's "pick winner" button, then clicks it. */
  async function pickWinner(user: TestUser, side: Side): Promise<void> {
    const buttonId = `pick-${side}`;
    await waitForTestId(buttonId);
    await user.click(screen.getByTestId(buttonId));
  }

  // -------------------------------------------------------------------------
  // Loading / error / initial states
  // -------------------------------------------------------------------------

  it("shows loading state initially", () => {
    // A promise that never settles keeps the page in its loading state.
    vi.spyOn(client.experiments, "list").mockImplementation(
      () => new Promise(() => {}),
    );
    renderCompare();
    expect(screen.getByText("Loading experiments…")).toBeInTheDocument();
  });

  it("shows error state when experiments fail to load", async () => {
    vi.spyOn(client.experiments, "list").mockRejectedValue(
      new Error("Network error"),
    );
    renderCompare();
    await waitForTextContent("compare-error", "Network error");
  });

  it("renders experiment dropdowns after loading", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitForTestId("left-experiment-select");
    expect(screen.getByTestId("right-experiment-select")).toBeInTheDocument();
  });

  it("shows empty state before runs are selected", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitForTestId("compare-empty");
  });

  it("populates run dropdown when experiment is selected", async () => {
    setupDefaultMocks();
    const user = userEvent.setup();
    renderCompare();
    await waitForTestId("left-experiment-select");
    // chooseExperiment asserts that run options beyond the placeholder appear.
    await chooseExperiment(user, "left", "exp-1");
  });

  // -------------------------------------------------------------------------
  // Comparison sections shown once both runs are selected
  // -------------------------------------------------------------------------

  it("shows config diff when both runs selected", async () => {
    setupDefaultMocks();
    await renderWithBothRunsSelected();
    await waitForTestId("config-diff");
  });

  it("shows score comparison when both runs are selected", async () => {
    setupDefaultMocks();
    await renderWithBothRunsSelected();
    await waitForTestId("score-comparison");
    expect(
      screen.getByTestId("score-compare-coherence"),
    ).toBeInTheDocument();
    expect(
      screen.getByTestId("score-compare-relevance"),
    ).toBeInTheDocument();
  });

  it("shows response diff when both runs are selected", async () => {
    setupDefaultMocks();
    await renderWithBothRunsSelected();
    await waitForTestId("response-diff");
  });

  it("highlights changed config values", async () => {
    setupDefaultMocks();
    await renderWithBothRunsSelected();
    await waitForTestId("config-diff");
    // In the fixtures, model is the same and temperature differs (0.7 vs 0.3).
    const changedEntries = screen.getAllByTestId("config-diff-changed");
    expect(changedEntries.length).toBeGreaterThanOrEqual(1);
    const sameEntries = screen.getAllByTestId("config-diff-same");
    expect(sameEntries.length).toBeGreaterThanOrEqual(1);
  });

  // -------------------------------------------------------------------------
  // Picking a winner
  // -------------------------------------------------------------------------

  it("submits human_preference score when picking a winner", async () => {
    setupDefaultMocks();
    // Spy handle used to assert on the submitted score payload.
    const scoreSpy = vi.spyOn(client.runs, "score");
    const user = await renderWithBothRunsSelected();
    await pickWinner(user, "left");
    await waitFor(() => {
      expect(scoreSpy).toHaveBeenCalledWith("run-1", {
        scorer_name: "human_preference",
        value: 1.0,
        metadata: { compared_against: "run-2", comparison_winner: true },
      });
    });
    await waitForTextContent("pick-left", "Run A Wins");
  });

  it("can pick Run B as winner", async () => {
    setupDefaultMocks();
    // Spy handle used to assert on the submitted score payload.
    const scoreSpy = vi.spyOn(client.runs, "score");
    const user = await renderWithBothRunsSelected();
    await pickWinner(user, "right");
    await waitFor(() => {
      expect(scoreSpy).toHaveBeenCalledWith("run-2", {
        scorer_name: "human_preference",
        value: 1.0,
        metadata: { compared_against: "run-1", comparison_winner: true },
      });
    });
    await waitForTextContent("pick-right", "Run B Wins");
  });

  it("renders run detail cards when both runs selected", async () => {
    setupDefaultMocks();
    await renderWithBothRunsSelected();
    await waitFor(() => {
      const cards = screen.getAllByTestId("run-card");
      expect(cards.length).toBe(2);
    });
  });

  it("resets winner when changing run selection", async () => {
    setupDefaultMocks();
    const user = await renderWithBothRunsSelected();
    await pickWinner(user, "left");
    await waitForTextContent("pick-left", "Run A Wins");
    // Changing the left run should put the button back to its unpicked label.
    await chooseRun(user, "left", "run-2");
    await waitForTextContent("pick-left", "Pick Run A");
  });

  // -------------------------------------------------------------------------
  // Miscellaneous
  // -------------------------------------------------------------------------

  it("disables run select when no experiment is chosen", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitForTestId("left-experiment-select");
    const leftRunSelect = screen.getByTestId(
      "left-run-select",
    ) as HTMLSelectElement;
    expect(leftRunSelect.disabled).toBe(true);
  });

  it("shows diff lines with correct types in response diff", async () => {
    setupDefaultMocks();
    await renderWithBothRunsSelected();
    await waitForTestId("response-diff");
    // The fixture responses share only their last line, so all three line
    // kinds are expected.
    const sameLines = screen.getAllByTestId("diff-line-same");
    expect(sameLines.length).toBeGreaterThan(0);
    const removedLines = screen.getAllByTestId("diff-line-removed");
    expect(removedLines.length).toBeGreaterThan(0);
    const addedLines = screen.getAllByTestId("diff-line-added");
    expect(addedLines.length).toBeGreaterThan(0);
  });

  it("renders page title", async () => {
    setupDefaultMocks();
    renderCompare();
    await waitFor(() => {
      expect(screen.getByText("Compare Runs")).toBeInTheDocument();
    });
  });
});