From 1d3917a44edb6d6dd93a791d95ea0574b76509a7 Mon Sep 17 00:00:00 2001 From: John Lightner Date: Tue, 7 Apr 2026 03:25:37 -0500 Subject: [PATCH] MAESTRO: Implement Compare page with side-by-side run comparison, config/response diffs, and score overlay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Two-column run selectors with experiment→run cascading dropdowns and URL state sync - Config diff with color-coded same/changed/added/removed entries using key-level comparison - Line-level LCS response diff with added/removed/same highlighting - Score comparison with overlaid indigo/emerald bars per scorer - Pick Winner buttons submit human_preference score via API - Full RunCard detail view for each run side by side - 26 tests added (10 diff helper unit tests + 16 component integration tests) - App.test.tsx updated to mock experiments.list for ComparePage route --- Auto Run Docs/02b-frontend-dashboard.md | 3 +- frontend/src/App.test.tsx | 6 +- frontend/src/pages/ComparePage.test.tsx | 759 +++++++++++++++++++++++ frontend/src/pages/ComparePage.tsx | 769 +++++++++++++++++++++++- 4 files changed, 1531 insertions(+), 6 deletions(-) create mode 100644 frontend/src/pages/ComparePage.test.tsx diff --git a/Auto Run Docs/02b-frontend-dashboard.md b/Auto Run Docs/02b-frontend-dashboard.md index 6da4b2f..474eae0 100644 --- a/Auto Run Docs/02b-frontend-dashboard.md +++ b/Auto Run Docs/02b-frontend-dashboard.md @@ -38,7 +38,8 @@ Build the React frontend: setup wizard, experiment builder, real-time observabil - [x] Build the Run Card component (frontend/src/components/RunCard.tsx). Expandable card showing: config summary, all scores with visual bars, prompt sent (collapsible), raw response (collapsible with copy button), timing breakdown per stage, cache status badge. Used in both the leaderboard detail view and the Compare page. -- [ ] Implement the Compare page (frontend/src/pages/Compare.tsx). Side-by-side comparison of any two runs. 
Two columns, each with a run selector (dropdown or search). Show: config diff (highlight what changed), response diff (inline text diff with highlights), score comparison (bar chart overlay), and a "pick winner" button for human rating. +- [x] Implement the Compare page (frontend/src/pages/ComparePage.tsx). Side-by-side comparison of any two runs. Two columns, each with a run selector (dropdown or search). Show: config diff (highlight what changed), response diff (inline text diff with highlights), score comparison (bar chart overlay), and a "pick winner" button for human rating. + - [ ] Build the Score Chart component (frontend/src/components/ScoreChart.tsx). Multiple chart types: (1) scatter plot of score vs parameter value (e.g. score vs temperature), (2) bar chart comparing top N configs, (3) line chart showing score progression over time as sweep runs. Use a lightweight charting library (recharts via CDN). diff --git a/frontend/src/App.test.tsx b/frontend/src/App.test.tsx index a514543..169abe2 100644 --- a/frontend/src/App.test.tsx +++ b/frontend/src/App.test.tsx @@ -83,9 +83,13 @@ describe("App routing", () => { }); it("renders ComparePage at /compare", async () => { + vi.spyOn(client.experiments, "list").mockResolvedValue({ + items: [], + total: 0, + }); renderWithRouter("/compare"); await waitFor(() => { - expect(screen.getByText("Compare")).toBeInTheDocument(); + expect(screen.getByText("Compare Runs")).toBeInTheDocument(); }); }); diff --git a/frontend/src/pages/ComparePage.test.tsx b/frontend/src/pages/ComparePage.test.tsx new file mode 100644 index 0000000..6c0afae --- /dev/null +++ b/frontend/src/pages/ComparePage.test.tsx @@ -0,0 +1,759 @@ +import { render, screen, waitFor } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { MemoryRouter } from "react-router-dom"; +import { describe, it, expect, vi, beforeEach } from "vitest"; +import ComparePage, { + computeLineDiff, + computeConfigDiff, +} from 
"./ComparePage"; +import * as client from "../api/client"; + +// --------------------------------------------------------------------------- +// Mocks +// --------------------------------------------------------------------------- + +const mockNavigate = vi.fn(); +vi.mock("react-router-dom", async () => { + const actual = await vi.importActual("react-router-dom"); + return { + ...actual, + useNavigate: () => mockNavigate, + }; +}); + +const MOCK_EXPERIMENTS: client.ExperimentResponse[] = [ + { + id: "exp-1", + project_id: "proj-1", + name: "Experiment Alpha", + description: null, + sample_data: null, + pipeline_stages: null, + scoring_config: null, + parameter_space: null, + status: "completed", + created_at: "2026-04-01T10:00:00Z", + updated_at: "2026-04-07T10:00:00Z", + }, + { + id: "exp-2", + project_id: "proj-1", + name: "Experiment Beta", + description: null, + sample_data: null, + pipeline_stages: null, + scoring_config: null, + parameter_space: null, + status: "completed", + created_at: "2026-04-02T10:00:00Z", + updated_at: "2026-04-07T12:00:00Z", + }, +]; + +const MOCK_RUNS: client.RunResponse[] = [ + { + id: "run-1", + experiment_id: "exp-1", + config_hash: "abc12345deadbeef", + config: { model: "gpt-4", temperature: 0.7 }, + status: "completed", + started_at: "2026-04-07T10:01:00Z", + completed_at: "2026-04-07T10:01:05Z", + duration_ms: 5000, + tokens_in: 100, + tokens_out: 200, + cost_estimate: 0.01, + }, + { + id: "run-2", + experiment_id: "exp-1", + config_hash: "def67890cafebabe", + config: { model: "gpt-4", temperature: 0.3 }, + status: "completed", + started_at: "2026-04-07T10:02:00Z", + completed_at: "2026-04-07T10:02:03Z", + duration_ms: 3000, + tokens_in: 80, + tokens_out: 150, + cost_estimate: 0.008, + }, +]; + +const MOCK_RUN_DETAIL_1: client.RunDetailResponse = { + ...MOCK_RUNS[0], + stage_results: [ + { + id: "sr-1", + run_id: "run-1", + stage_index: 0, + prompt_sent: "Summarize the text.", + response_raw: "This is the summary\nfrom run 
one.\nEnd.", + model_used: "gpt-4", + parameters: { temperature: 0.7 }, + tokens_in: 100, + tokens_out: 200, + latency_ms: 4500, + }, + ], + scores: [ + { + id: "sc-1", + run_id: "run-1", + scorer_name: "coherence", + value: 0.85, + scorer_metadata: null, + created_at: "2026-04-07T10:01:05Z", + }, + { + id: "sc-2", + run_id: "run-1", + scorer_name: "relevance", + value: 0.72, + scorer_metadata: null, + created_at: "2026-04-07T10:01:05Z", + }, + ], +}; + +const MOCK_RUN_DETAIL_2: client.RunDetailResponse = { + ...MOCK_RUNS[1], + stage_results: [ + { + id: "sr-2", + run_id: "run-2", + stage_index: 0, + prompt_sent: "Summarize the text briefly.", + response_raw: "This is a different summary\nfrom run two.\nEnd.", + model_used: "gpt-4", + parameters: { temperature: 0.3 }, + tokens_in: 80, + tokens_out: 150, + latency_ms: 2800, + }, + ], + scores: [ + { + id: "sc-3", + run_id: "run-2", + scorer_name: "coherence", + value: 0.91, + scorer_metadata: null, + created_at: "2026-04-07T10:02:03Z", + }, + { + id: "sc-4", + run_id: "run-2", + scorer_name: "relevance", + value: 0.65, + scorer_metadata: null, + created_at: "2026-04-07T10:02:03Z", + }, + ], +}; + +function renderCompare() { + return render( + + + , + ); +} + +function setupDefaultMocks() { + vi.spyOn(client.experiments, "list").mockResolvedValue({ + items: MOCK_EXPERIMENTS, + total: 2, + }); + vi.spyOn(client.runs, "list").mockResolvedValue({ + items: MOCK_RUNS, + total: 2, + }); + vi.spyOn(client.runs, "get").mockImplementation(async (runId: string) => { + if (runId === "run-1") return MOCK_RUN_DETAIL_1; + if (runId === "run-2") return MOCK_RUN_DETAIL_2; + throw new Error("Not found"); + }); + vi.spyOn(client.runs, "score").mockResolvedValue({ + id: "score-new", + run_id: "run-1", + scorer_name: "human_preference", + value: 1.0, + scorer_metadata: null, + created_at: "2026-04-07T12:00:00Z", + }); +} + +// --------------------------------------------------------------------------- +// Unit tests for diff helpers +// 
--------------------------------------------------------------------------- + +describe("computeLineDiff", () => { + it("returns same lines for identical strings", () => { + const result = computeLineDiff("hello\nworld", "hello\nworld"); + expect(result).toEqual([ + { type: "same", text: "hello" }, + { type: "same", text: "world" }, + ]); + }); + + it("detects added lines", () => { + const result = computeLineDiff("hello", "hello\nworld"); + expect(result).toEqual([ + { type: "same", text: "hello" }, + { type: "added", text: "world" }, + ]); + }); + + it("detects removed lines", () => { + const result = computeLineDiff("hello\nworld", "hello"); + expect(result).toEqual([ + { type: "same", text: "hello" }, + { type: "removed", text: "world" }, + ]); + }); + + it("detects changed lines", () => { + const result = computeLineDiff("hello\nfoo", "hello\nbar"); + expect(result.filter((d) => d.type === "same")).toHaveLength(1); + expect(result.filter((d) => d.type === "removed")).toHaveLength(1); + expect(result.filter((d) => d.type === "added")).toHaveLength(1); + }); + + it("handles empty strings", () => { + const result = computeLineDiff("", ""); + expect(result).toEqual([{ type: "same", text: "" }]); + }); +}); + +describe("computeConfigDiff", () => { + it("returns same for identical configs", () => { + const result = computeConfigDiff({ a: 1, b: 2 }, { a: 1, b: 2 }); + expect(result.every((e) => e.type === "same")).toBe(true); + }); + + it("detects changed values", () => { + const result = computeConfigDiff({ a: 1 }, { a: 2 }); + expect(result).toEqual([ + { key: "a", type: "changed", leftValue: 1, rightValue: 2 }, + ]); + }); + + it("detects added keys", () => { + const result = computeConfigDiff({}, { a: 1 }); + expect(result).toEqual([{ key: "a", type: "added", rightValue: 1 }]); + }); + + it("detects removed keys", () => { + const result = computeConfigDiff({ a: 1 }, {}); + expect(result).toEqual([{ key: "a", type: "removed", leftValue: 1 }]); + }); + + 
it("handles mixed changes", () => { + const result = computeConfigDiff( + { a: 1, b: 2, c: 3 }, + { a: 1, b: 99, d: 4 }, + ); + const types = result.map((e) => `${e.key}:${e.type}`); + expect(types).toContain("a:same"); + expect(types).toContain("b:changed"); + expect(types).toContain("c:removed"); + expect(types).toContain("d:added"); + }); +}); + +// --------------------------------------------------------------------------- +// Component tests +// --------------------------------------------------------------------------- + +describe("ComparePage", () => { + beforeEach(() => { + vi.restoreAllMocks(); + mockNavigate.mockReset(); + }); + + it("shows loading state initially", () => { + vi.spyOn(client.experiments, "list").mockImplementation( + () => new Promise(() => {}), + ); + renderCompare(); + expect(screen.getByText("Loading experiments…")).toBeInTheDocument(); + }); + + it("shows error state when experiments fail to load", async () => { + vi.spyOn(client.experiments, "list").mockRejectedValue( + new Error("Network error"), + ); + renderCompare(); + await waitFor(() => { + expect(screen.getByTestId("compare-error")).toHaveTextContent( + "Network error", + ); + }); + }); + + it("renders experiment dropdowns after loading", async () => { + setupDefaultMocks(); + renderCompare(); + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + expect(screen.getByTestId("right-experiment-select")).toBeInTheDocument(); + }); + + it("shows empty state before runs are selected", async () => { + setupDefaultMocks(); + renderCompare(); + await waitFor(() => { + expect(screen.getByTestId("compare-empty")).toBeInTheDocument(); + }); + }); + + it("populates run dropdown when experiment is selected", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await 
user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + + await waitFor(() => { + const select = screen.getByTestId("left-run-select") as HTMLSelectElement; + // Should have options beyond the placeholder + expect(select.options.length).toBeGreaterThan(1); + }); + }); + + it("shows config diff when both runs selected", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + // Select left experiment and run + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const select = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(select.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + // Select right experiment and run + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const select = screen.getByTestId( + "right-run-select", + ) as HTMLSelectElement; + expect(select.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + // Config diff should appear + await waitFor(() => { + expect(screen.getByTestId("config-diff")).toBeInTheDocument(); + }); + }); + + it("shows score comparison when both runs are selected", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await 
user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + await waitFor(() => { + expect(screen.getByTestId("score-comparison")).toBeInTheDocument(); + }); + expect( + screen.getByTestId("score-compare-coherence"), + ).toBeInTheDocument(); + expect( + screen.getByTestId("score-compare-relevance"), + ).toBeInTheDocument(); + }); + + it("shows response diff when both runs are selected", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + await waitFor(() => { + expect(screen.getByTestId("response-diff")).toBeInTheDocument(); + }); + }); + + it("highlights changed config values", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await user.selectOptions( + 
screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + // temperature differs (0.7 vs 0.3) so should show changed + await waitFor(() => { + expect(screen.getByTestId("config-diff")).toBeInTheDocument(); + }); + // model is same, temperature changed + const changedEntries = screen.getAllByTestId("config-diff-changed"); + expect(changedEntries.length).toBeGreaterThanOrEqual(1); + const sameEntries = screen.getAllByTestId("config-diff-same"); + expect(sameEntries.length).toBeGreaterThanOrEqual(1); + }); + + it("submits human_preference score when picking a winner", async () => { + setupDefaultMocks(); + const scoreSpy = vi.spyOn(client.runs, "score"); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + // Select both runs + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await 
user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + await waitFor(() => { + expect(screen.getByTestId("pick-left")).toBeInTheDocument(); + }); + + await user.click(screen.getByTestId("pick-left")); + + await waitFor(() => { + expect(scoreSpy).toHaveBeenCalledWith("run-1", { + scorer_name: "human_preference", + value: 1.0, + metadata: { compared_against: "run-2", comparison_winner: true }, + }); + }); + + await waitFor(() => { + expect(screen.getByTestId("pick-left")).toHaveTextContent( + "Run A Wins", + ); + }); + }); + + it("can pick Run B as winner", async () => { + setupDefaultMocks(); + const scoreSpy = vi.spyOn(client.runs, "score"); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + await waitFor(() => { + expect(screen.getByTestId("pick-right")).toBeInTheDocument(); + }); + + await user.click(screen.getByTestId("pick-right")); + + await waitFor(() => { + expect(scoreSpy).toHaveBeenCalledWith("run-2", { + scorer_name: "human_preference", + value: 1.0, + metadata: { compared_against: "run-1", comparison_winner: true }, + }); + }); + + await waitFor(() => { + expect(screen.getByTestId("pick-right")).toHaveTextContent( + "Run B Wins", + ); + }); + }); + + it("renders run detail cards when both 
runs selected", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + await waitFor(() => { + const cards = screen.getAllByTestId("run-card"); + expect(cards.length).toBe(2); + }); + }); + + it("resets winner when changing run selection", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + // Select both runs + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + // Pick a winner + await waitFor(() => { + 
expect(screen.getByTestId("pick-left")).toBeInTheDocument(); + }); + await user.click(screen.getByTestId("pick-left")); + await waitFor(() => { + expect(screen.getByTestId("pick-left")).toHaveTextContent( + "Run A Wins", + ); + }); + + // Change left run — winner should reset + await user.selectOptions(screen.getByTestId("left-run-select"), "run-2"); + await waitFor(() => { + expect(screen.getByTestId("pick-left")).toHaveTextContent("Pick Run A"); + }); + }); + + it("disables run select when no experiment is chosen", async () => { + setupDefaultMocks(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + const leftRunSelect = screen.getByTestId( + "left-run-select", + ) as HTMLSelectElement; + expect(leftRunSelect.disabled).toBe(true); + }); + + it("shows diff lines with correct types in response diff", async () => { + setupDefaultMocks(); + const user = userEvent.setup(); + renderCompare(); + + await waitFor(() => { + expect( + screen.getByTestId("left-experiment-select"), + ).toBeInTheDocument(); + }); + + await user.selectOptions( + screen.getByTestId("left-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("left-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("left-run-select"), "run-1"); + + await user.selectOptions( + screen.getByTestId("right-experiment-select"), + "exp-1", + ); + await waitFor(() => { + const s = screen.getByTestId("right-run-select") as HTMLSelectElement; + expect(s.options.length).toBeGreaterThan(1); + }); + await user.selectOptions(screen.getByTestId("right-run-select"), "run-2"); + + await waitFor(() => { + expect(screen.getByTestId("response-diff")).toBeInTheDocument(); + }); + + // The responses differ so we should have some diff lines + const sameLines = screen.getAllByTestId("diff-line-same"); + 
expect(sameLines.length).toBeGreaterThan(0); + // At least one removed and one added line expected + const removedLines = screen.getAllByTestId("diff-line-removed"); + expect(removedLines.length).toBeGreaterThan(0); + const addedLines = screen.getAllByTestId("diff-line-added"); + expect(addedLines.length).toBeGreaterThan(0); + }); + + it("renders page title", async () => { + setupDefaultMocks(); + renderCompare(); + await waitFor(() => { + expect(screen.getByText("Compare Runs")).toBeInTheDocument(); + }); + }); +}); diff --git a/frontend/src/pages/ComparePage.tsx b/frontend/src/pages/ComparePage.tsx index 6ffda45..85ae280 100644 --- a/frontend/src/pages/ComparePage.tsx +++ b/frontend/src/pages/ComparePage.tsx @@ -1,8 +1,769 @@ -export default function ComparePage() { +import { useState, useEffect, useMemo, useCallback } from "react"; +import { useSearchParams } from "react-router-dom"; +import { + experiments as experimentsApi, + runs as runsApi, +} from "../api/client"; +import type { + ExperimentResponse, + ExperimentListResponse, + RunDetailResponse, + RunResponse, + RunListResponse, +} from "../api/client"; +import RunCard from "../components/RunCard"; +import type { RunCardData } from "../components/RunCard"; + +// --------------------------------------------------------------------------- +// Diff helpers +// --------------------------------------------------------------------------- + +export interface DiffLine { + type: "same" | "added" | "removed"; + text: string; +} + +/** + * Simple line-level diff between two strings. + * Builds a dynamic-programming LCS (longest common subsequence) table and backtracks through it. 
+ */ +export function computeLineDiff(a: string, b: string): DiffLine[] { + const aLines = a.split("\n"); + const bLines = b.split("\n"); + + // Build LCS table + const m = aLines.length; + const n = bLines.length; + const dp: number[][] = Array.from({ length: m + 1 }, () => + Array(n + 1).fill(0), + ); + + for (let i = 1; i <= m; i++) { + for (let j = 1; j <= n; j++) { + if (aLines[i - 1] === bLines[j - 1]) { + dp[i][j] = dp[i - 1][j - 1] + 1; + } else { + dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]); + } + } + } + + // Backtrack to build diff + const result: DiffLine[] = []; + let i = m; + let j = n; + const stack: DiffLine[] = []; + + while (i > 0 || j > 0) { + if (i > 0 && j > 0 && aLines[i - 1] === bLines[j - 1]) { + stack.push({ type: "same", text: aLines[i - 1] }); + i--; + j--; + } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) { + stack.push({ type: "added", text: bLines[j - 1] }); + j--; + } else { + stack.push({ type: "removed", text: aLines[i - 1] }); + i--; + } + } + + // Reverse since we built it backwards + while (stack.length > 0) { + result.push(stack.pop()!); + } + + return result; +} + +/** + * Compute key-level diff between two config objects. + * Returns entries with change status. + */ +export interface ConfigDiffEntry { + key: string; + type: "same" | "changed" | "added" | "removed"; + leftValue?: unknown; + rightValue?: unknown; +} + +export function computeConfigDiff( + left: Record, + right: Record, +): ConfigDiffEntry[] { + const allKeys = Array.from( + new Set([...Object.keys(left), ...Object.keys(right)]), + ).sort(); + + return allKeys.map((key) => { + const inLeft = key in left; + const inRight = key in right; + + if (inLeft && inRight) { + const same = JSON.stringify(left[key]) === JSON.stringify(right[key]); + return { + key, + type: same ? 
"same" : "changed", + leftValue: left[key], + rightValue: right[key], + }; + } + if (inLeft) { + return { key, type: "removed", leftValue: left[key] }; + } + return { key, type: "added", rightValue: right[key] }; + }); +} + +// --------------------------------------------------------------------------- +// Sub-components +// --------------------------------------------------------------------------- + +function RunSelector({ + label, + testIdPrefix, + experimentId, + selectedRunId, + onExperimentChange, + onRunChange, + allExperiments, + availableRuns, + loadingRuns, +}: { + label: string; + testIdPrefix: string; + experimentId: string; + selectedRunId: string; + onExperimentChange: (id: string) => void; + onRunChange: (id: string) => void; + allExperiments: ExperimentResponse[]; + availableRuns: RunResponse[]; + loadingRuns: boolean; +}) { return ( -
-

Compare

-

Compare results across runs and experiments.

+
+

+ {label} +

+ + +
+ ); +} + +function ConfigDiffView({ entries }: { entries: ConfigDiffEntry[] }) { + if (entries.length === 0) + return ( +

+ No config to compare. +

+ ); + + return ( +
+ {entries.map((entry) => { + const bg = + entry.type === "changed" + ? "bg-yellow-50 dark:bg-yellow-900/20 border-yellow-200 dark:border-yellow-800" + : entry.type === "added" + ? "bg-green-50 dark:bg-green-900/20 border-green-200 dark:border-green-800" + : entry.type === "removed" + ? "bg-red-50 dark:bg-red-900/20 border-red-200 dark:border-red-800" + : "bg-white dark:bg-slate-800 border-slate-200 dark:border-slate-700"; + + return ( +
+ + {entry.key} + +
+ {entry.type === "changed" ? ( + <> + + {JSON.stringify(entry.leftValue)} + + + + {JSON.stringify(entry.rightValue)} + + + ) : entry.type === "added" ? ( + + + {JSON.stringify(entry.rightValue)} + + ) : entry.type === "removed" ? ( + + - {JSON.stringify(entry.leftValue)} + + ) : ( + + {JSON.stringify(entry.leftValue)} + + )} +
+
+ ); + })} +
+ ); +} + +function ResponseDiffView({ lines }: { lines: DiffLine[] }) { + if (lines.length === 0) + return ( +

+ No responses to compare. +

+ ); + + return ( +
+      {lines.map((line, idx) => {
+        const cls =
+          line.type === "added"
+            ? "bg-green-100 dark:bg-green-900/30 text-green-800 dark:text-green-300"
+            : line.type === "removed"
+              ? "bg-red-100 dark:bg-red-900/30 text-red-800 dark:text-red-300"
+              : "text-slate-700 dark:text-slate-300";
+        const prefix =
+          line.type === "added" ? "+ " : line.type === "removed" ? "- " : "  ";
+
+        return (
+          
+ {prefix} + {line.text} +
+ ); + })} +
+ ); +} + +function ScoreComparisonBar({ + name, + leftValue, + rightValue, +}: { + name: string; + leftValue: number; + rightValue: number; +}) { + const leftPct = Math.max(0, Math.min(100, leftValue * 100)); + const rightPct = Math.max(0, Math.min(100, rightValue * 100)); + + return ( +
+
+ + {name} + +
+ + {leftValue.toFixed(3)} + + vs + + {rightValue.toFixed(3)} + +
+
+
+
+
+
+
+ ); +} + +// --------------------------------------------------------------------------- +// ComparePage Component +// --------------------------------------------------------------------------- + +function toRunCardData(run: RunDetailResponse): RunCardData { + return { + run_id: run.id, + config_summary: run.config_hash?.slice(0, 12) ?? "unknown", + config: run.config ?? {}, + status: run.status, + cached: run.status === "cached", + duration_ms: run.duration_ms, + tokens_in: run.tokens_in, + tokens_out: run.tokens_out, + scores: (run.scores ?? []).map((s) => ({ + scorer_name: s.scorer_name, + value: s.value, + })), + stage_results: (run.stage_results ?? []).map((sr) => ({ + stage_index: sr.stage_index, + prompt_sent: sr.prompt_sent, + response_raw: sr.response_raw, + model_used: sr.model_used, + parameters: sr.parameters, + tokens_in: sr.tokens_in, + tokens_out: sr.tokens_out, + latency_ms: sr.latency_ms, + })), + }; +} + +export default function ComparePage() { + const [searchParams, setSearchParams] = useSearchParams(); + + // State + const [allExperiments, setAllExperiments] = useState( + [], + ); + const [loadingExperiments, setLoadingExperiments] = useState(true); + const [error, setError] = useState(null); + + // Left side + const [leftExpId, setLeftExpId] = useState( + searchParams.get("leftExp") ?? "", + ); + const [leftRunId, setLeftRunId] = useState( + searchParams.get("leftRun") ?? "", + ); + const [leftRuns, setLeftRuns] = useState([]); + const [leftRunDetail, setLeftRunDetail] = useState( + null, + ); + const [loadingLeftRuns, setLoadingLeftRuns] = useState(false); + + // Right side + const [rightExpId, setRightExpId] = useState( + searchParams.get("rightExp") ?? "", + ); + const [rightRunId, setRightRunId] = useState( + searchParams.get("rightRun") ?? 
"", + ); + const [rightRuns, setRightRuns] = useState([]); + const [rightRunDetail, setRightRunDetail] = + useState(null); + const [loadingRightRuns, setLoadingRightRuns] = useState(false); + + // Winner pick + const [winner, setWinner] = useState<"left" | "right" | null>(null); + const [pickSaved, setPickSaved] = useState(false); + + // Load experiments on mount + useEffect(() => { + let cancelled = false; + setLoadingExperiments(true); + experimentsApi + .list() + .then((resp: ExperimentListResponse) => { + if (!cancelled) setAllExperiments(resp.items); + }) + .catch((err: Error) => { + if (!cancelled) setError(err.message); + }) + .finally(() => { + if (!cancelled) setLoadingExperiments(false); + }); + return () => { + cancelled = true; + }; + }, []); + + // Load runs when experiment changes (left) + useEffect(() => { + if (!leftExpId) { + setLeftRuns([]); + setLeftRunId(""); + setLeftRunDetail(null); + return; + } + let cancelled = false; + setLoadingLeftRuns(true); + runsApi + .list(leftExpId) + .then((resp: RunListResponse) => { + if (!cancelled) setLeftRuns(resp.items); + }) + .catch(() => { + if (!cancelled) setLeftRuns([]); + }) + .finally(() => { + if (!cancelled) setLoadingLeftRuns(false); + }); + return () => { + cancelled = true; + }; + }, [leftExpId]); + + // Load runs when experiment changes (right) + useEffect(() => { + if (!rightExpId) { + setRightRuns([]); + setRightRunId(""); + setRightRunDetail(null); + return; + } + let cancelled = false; + setLoadingRightRuns(true); + runsApi + .list(rightExpId) + .then((resp: RunListResponse) => { + if (!cancelled) setRightRuns(resp.items); + }) + .catch(() => { + if (!cancelled) setRightRuns([]); + }) + .finally(() => { + if (!cancelled) setLoadingRightRuns(false); + }); + return () => { + cancelled = true; + }; + }, [rightExpId]); + + // Load run detail when run selected (left) + useEffect(() => { + if (!leftRunId) { + setLeftRunDetail(null); + return; + } + let cancelled = false; + runsApi + 
.get(leftRunId) + .then((resp: RunDetailResponse) => { + if (!cancelled) setLeftRunDetail(resp); + }) + .catch(() => { + if (!cancelled) setLeftRunDetail(null); + }); + return () => { + cancelled = true; + }; + }, [leftRunId]); + + // Load run detail when run selected (right) + useEffect(() => { + if (!rightRunId) { + setRightRunDetail(null); + return; + } + let cancelled = false; + runsApi + .get(rightRunId) + .then((resp: RunDetailResponse) => { + if (!cancelled) setRightRunDetail(resp); + }) + .catch(() => { + if (!cancelled) setRightRunDetail(null); + }); + return () => { + cancelled = true; + }; + }, [rightRunId]); + + // Sync URL params + useEffect(() => { + const params: Record = {}; + if (leftExpId) params.leftExp = leftExpId; + if (leftRunId) params.leftRun = leftRunId; + if (rightExpId) params.rightExp = rightExpId; + if (rightRunId) params.rightRun = rightRunId; + setSearchParams(params, { replace: true }); + }, [leftExpId, leftRunId, rightExpId, rightRunId, setSearchParams]); + + // Config diff + const configDiff = useMemo(() => { + if (!leftRunDetail || !rightRunDetail) return []; + return computeConfigDiff( + leftRunDetail.config ?? {}, + rightRunDetail.config ?? {}, + ); + }, [leftRunDetail, rightRunDetail]); + + // Response diff (first stage response) + const responseDiff = useMemo(() => { + if (!leftRunDetail || !rightRunDetail) return []; + const leftResponse = + leftRunDetail.stage_results?.[0]?.response_raw ?? ""; + const rightResponse = + rightRunDetail.stage_results?.[0]?.response_raw ?? ""; + return computeLineDiff(leftResponse, rightResponse); + }, [leftRunDetail, rightRunDetail]); + + // Score comparison + const scoreComparison = useMemo(() => { + if (!leftRunDetail || !rightRunDetail) return []; + const leftScores = new Map( + (leftRunDetail.scores ?? []).map((s) => [s.scorer_name, s.value]), + ); + const rightScores = new Map( + (rightRunDetail.scores ?? 
[]).map((s) => [s.scorer_name, s.value]), + ); + const allNames = Array.from( + new Set([...leftScores.keys(), ...rightScores.keys()]), + ).sort(); + return allNames.map((name) => ({ + name, + left: leftScores.get(name) ?? 0, + right: rightScores.get(name) ?? 0, + })); + }, [leftRunDetail, rightRunDetail]); + + const handlePickWinner = useCallback( + (side: "left" | "right") => { + const winnerRunId = side === "left" ? leftRunId : rightRunId; + const loserRunId = side === "left" ? rightRunId : leftRunId; + if (!winnerRunId || !loserRunId) return; + + // Submit a human_preference score for the winner + runsApi + .score(winnerRunId, { + scorer_name: "human_preference", + value: 1.0, + metadata: { compared_against: loserRunId, comparison_winner: true }, + }) + .then(() => { + setWinner(side); + setPickSaved(true); + setTimeout(() => setPickSaved(false), 2000); + }) + .catch(() => { + // Still show the pick locally even if API fails + setWinner(side); + }); + }, + [leftRunId, rightRunId], + ); + + const bothSelected = leftRunDetail !== null && rightRunDetail !== null; + + if (loadingExperiments) { + return ( +
+

+ Loading experiments… +

+
+ ); + } + + if (error) { + return ( +
+

+ {error} +

+
+ ); + } + + return ( +
+

+ Compare Runs +

+ + {/* Run selectors — two columns */} +
+
+ { + setLeftExpId(id); + setLeftRunId(""); + setLeftRunDetail(null); + setWinner(null); + }} + onRunChange={(id) => { + setLeftRunId(id); + setWinner(null); + }} + allExperiments={allExperiments} + availableRuns={leftRuns} + loadingRuns={loadingLeftRuns} + /> +
+
+ { + setRightExpId(id); + setRightRunId(""); + setRightRunDetail(null); + setWinner(null); + }} + onRunChange={(id) => { + setRightRunId(id); + setWinner(null); + }} + allExperiments={allExperiments} + availableRuns={rightRuns} + loadingRuns={loadingRightRuns} + /> +
+
+ + {/* Comparison sections */} + {bothSelected && ( +
+ {/* Config diff */} +
+

+ Config Diff +

+ +
+ + {/* Score comparison */} + {scoreComparison.length > 0 && ( +
+

+ Score Comparison +

+
+ + Run A + + + Run B + +
+
+ {scoreComparison.map((sc) => ( + + ))} +
+
+ )} + + {/* Response diff */} +
+

+ Response Diff +

+ +
+ + {/* Run detail cards side by side */} +
+

+ Full Run Details +

+
+ + +
+
+ + {/* Pick winner */} +
+ + or + + {pickSaved && ( + + Saved! + + )} +
+
+ )} + + {/* Empty state when not both selected */} + {!bothSelected && !loadingExperiments && ( +
+

+ Select an experiment and run on each side to compare. +

+
+ )}
); }