commit 5669f7044babf34cfed2b866f7d284476fd42bbf
parent a25191cd2a25892e07b8dd4a14baa6f7c6035e42
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 09:26:37 +0200
Add Explore page with 6 interactive visualizations
New /explore page with:
1. CorrelationMatrix - which variables matter for which outcomes
(rows=config axes, columns=score dimensions, cell=effect size)
2. EfficiencyFrontier - Pareto frontier on cost vs score scatter
3. BumpChart - model rank changes across conditions, crossing highlights
4. HeatmapMatrix - configurable 2-axis heatmap (any axis x any axis)
5. RadarComparison - spider chart comparing two configs across all
quality dimensions
6. ConfigTreemap - size=runs, color=score, grouped by model
Navigation updated: Grid > Insights > Explore > Compare
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
9 files changed, 2309 insertions(+), 0 deletions(-)
diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx
@@ -0,0 +1,559 @@
+import { useState, useMemo } from "react";
+import {
+ LineChart,
+ Line,
+ XAxis,
+ YAxis,
+ CartesianGrid,
+ Tooltip,
+ ResponsiveContainer,
+ ReferenceDot,
+} from "recharts";
+import type { Run } from "../lib/data";
+import { AXIS_NAMES, type AxisName } from "../lib/data";
+
+/** Props for the BumpChart component. */
+interface BumpChartProps {
+ // Runs to aggregate; only runs with a non-null eval_results.score contribute to rankings.
+ runs: Run[];
+}
+
+// Line and legend colors, looked up by exact model name.
+const MODEL_COLORS: Record<string, string> = {
+ haiku: "hsl(193 44% 67%)",
+ sonnet: "hsl(40 71% 73%)",
+ opus: "hsl(311 24% 63%)",
+};
+
+// Color used for any model name that has no entry in MODEL_COLORS.
+const FALLBACK_COLOR = "hsl(213 14% 65%)";
+
+// Human-readable label for every axis name; shown in the condition selector
+// and in the "need at least 2 values" message.
+const AXIS_LABELS: Record<AxisName, string> = {
+ model: "Model",
+ effort: "Effort",
+ prompt_style: "Prompt Style",
+ language: "Language",
+ human_language: "Human Language",
+ tool_read: "Read Tool",
+ tool_write: "Write Tool",
+ tool_edit: "Edit Tool",
+ tool_glob: "Glob Tool",
+ tool_grep: "Grep Tool",
+ linter: "Linter",
+ playwright: "Playwright",
+ context_file: "Context File",
+ sub_agents: "Sub-agents",
+ web_search: "Web Search",
+ max_budget: "Budget",
+};
+
+// All axes except "model" since we rank by model
+const CONDITION_AXES = AXIS_NAMES.filter((a) => a !== "model");
+
+/** One model's standing within a single condition value. */
+interface RankedPoint {
+ // Stringified value of the selected axis.
+ conditionValue: string;
+ // 1-based position among the models that have scores for this condition (1 = highest average).
+ rank: number;
+ // Mean eval score of the model's scored runs for this condition.
+ avgScore: number;
+ model: string;
+ // Number of scored runs behind avgScore.
+ n: number;
+}
+
+/** Marks a pair of models whose relative rank order flipped compared with their previously recorded ranks. */
+interface CrossingPoint {
+ // Condition value at which the flipped order was observed.
+ conditionValue: string;
+ // Index of that condition value in the sorted list of condition values.
+ x: number;
+ // Midpoint of the two models' ranks at this condition (may be fractional).
+ rank: number;
+ models: [string, string];
+}
+
+/**
+ * Ranks models against each other for every distinct value of `axis`.
+ *
+ * For each condition value (stringified, default-sorted) the mean
+ * `eval_results.score` of every model is computed from its scored runs; models
+ * are then ordered from highest to lowest mean and numbered from 1. Models
+ * without any scored run for a condition get no point for that condition.
+ *
+ * @param runs - Runs to aggregate.
+ * @param axis - Metadata axis whose values form the conditions.
+ * @returns `ranked`: per-model list of points in condition order (every model
+ *   gets a key, possibly with an empty list); `crossings`: one entry per model
+ *   pair whose relative order differs from the pair's most recently recorded
+ *   ranks.
+ */
+function computeRankings(
+  runs: Run[],
+  axis: AxisName
+): { ranked: Record<string, RankedPoint[]>; crossings: CrossingPoint[] } {
+  const conditionOf = (run: Run): string => String(run.meta[axis]);
+
+  // Distinct condition values and model names, both in default string order.
+  const conditionValues = Array.from(new Set(runs.map(conditionOf))).sort();
+  const models = Array.from(new Set(runs.map((run) => run.meta.model))).sort();
+
+  const ranked: Record<string, RankedPoint[]> = {};
+  models.forEach((name) => {
+    ranked[name] = [];
+  });
+
+  const crossings: CrossingPoint[] = [];
+
+  // Latest rank recorded per model. Entries are only ever overwritten, so a
+  // model that is missing from one condition keeps its older rank here.
+  const lastRanks: Record<string, number> = {};
+
+  conditionValues.forEach((cv, ci) => {
+    const conditionRuns = runs.filter((run) => conditionOf(run) === cv);
+
+    // Mean score per model, skipping models with no scored run.
+    const standings: Array<{ model: string; avgScore: number; n: number }> = [];
+    for (const model of models) {
+      let total = 0;
+      let n = 0;
+      for (const run of conditionRuns) {
+        if (run.meta.model !== model) continue;
+        const score = run.eval_results?.score;
+        if (score === null || score === undefined) continue;
+        total += score;
+        n += 1;
+      }
+      if (n > 0) {
+        standings.push({ model, avgScore: total / n, n });
+      }
+    }
+
+    // Best average first, so array position + 1 is the rank.
+    standings.sort((a, b) => b.avgScore - a.avgScore);
+
+    const ranksNow: Record<string, number> = {};
+    standings.forEach((entry, position) => {
+      const rank = position + 1;
+      ranksNow[entry.model] = rank;
+      ranked[entry.model].push({
+        conditionValue: cv,
+        rank,
+        avgScore: entry.avgScore,
+        model: entry.model,
+        n: entry.n,
+      });
+    });
+
+    // From the second condition on, compare every model pair with its
+    // previously recorded ranks.
+    if (ci > 0) {
+      for (let i = 0; i < models.length; i++) {
+        for (let j = i + 1; j < models.length; j++) {
+          const first = models[i];
+          const second = models[j];
+          const before1 = lastRanks[first];
+          const before2 = lastRanks[second];
+          const now1 = ranksNow[first];
+          const now2 = ranksNow[second];
+
+          const incomplete =
+            before1 === undefined ||
+            before2 === undefined ||
+            now1 === undefined ||
+            now2 === undefined;
+          if (incomplete) continue;
+
+          // Ranks are integers, so a negative product means the sign of the
+          // rank difference flipped, i.e. the two models swapped order.
+          if ((before1 - before2) * (now1 - now2) < 0) {
+            crossings.push({
+              conditionValue: cv,
+              x: ci,
+              // Approximate marker height: midway between the two new ranks.
+              rank: (now1 + now2) / 2,
+              models: [first, second],
+            });
+          }
+        }
+      }
+    }
+
+    Object.assign(lastRanks, ranksNow);
+  });
+
+  return { ranked, crossings };
+}
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+type DotProps = { cx?: number; cy?: number; payload?: any; stroke?: string };
+
+/**
+ * Builds the `dot` renderer for one model's line.
+ *
+ * The returned function draws a filled circle at the point plus the model's
+ * average score (as a whole-number percentage) just above and to the right.
+ * It renders nothing when the coordinates or payload are missing, or when
+ * `lookup` has no ranking for this model at the payload's condition value.
+ *
+ * @param model - Model whose line the dots belong to.
+ * @param color - Fill color of the circle.
+ * @param lookup - model -> conditionValue -> RankedPoint.
+ */
+function makeRankDot(
+  model: string,
+  color: string,
+  lookup: Record<string, Record<string, RankedPoint>>
+) {
+  return function RankDot({ cx, cy, payload }: DotProps) {
+    if (cx === undefined || cy === undefined || !payload) return null;
+    const point = lookup[model]?.[payload.conditionValue];
+    if (!point) return null;
+    return (
+      // The renderer is invoked once per data point and the results end up as
+      // sibling elements, so each one carries a stable key (condition values
+      // are unique within a line) to keep React's list reconciliation quiet.
+      <g key={`${model}-${payload.conditionValue}`}>
+        <circle
+          cx={cx}
+          cy={cy}
+          r={5}
+          fill={color}
+          stroke="var(--surface-1)"
+          strokeWidth={2}
+        />
+        <text
+          x={cx + 10}
+          y={cy - 8}
+          fill="var(--text)"
+          fontSize={10}
+          fontFamily="'JetBrains Mono', monospace"
+          textAnchor="start"
+        >
+          {(point.avgScore * 100).toFixed(0)}%
+        </text>
+      </g>
+    );
+  };
+}
+
+/**
+ * Tooltip body for the bump chart: lists every model that has a ranking at
+ * the hovered condition value, ordered from rank 1 downwards.
+ *
+ * Rank, score and sample size come from `lookup` (model -> conditionValue ->
+ * RankedPoint); the chart payload is only used for the hovered condition
+ * value, each line's `dataKey` and its stroke color.
+ */
+function CustomTooltipContent({
+  active,
+  payload,
+  lookup,
+}: {
+  active?: boolean;
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  payload?: Array<{ dataKey?: string; payload?: any; stroke: string }>;
+  label?: string;
+  lookup: Record<string, Record<string, RankedPoint>>;
+}) {
+  if (!active || !payload || payload.length === 0) return null;
+
+  const conditionValue = payload[0]?.payload?.conditionValue;
+  if (!conditionValue) return null;
+
+  // Collect the ranking behind each hovered line, dropping lines that have no
+  // lookup entry or no point at this condition value.
+  const rows: Array<{ point: RankedPoint; stroke: string }> = [];
+  for (const { dataKey, stroke } of payload) {
+    if (!dataKey || !lookup[dataKey]) continue;
+    const point = lookup[dataKey]?.[conditionValue];
+    if (point) rows.push({ point, stroke });
+  }
+  rows.sort((a, b) => a.point.rank - b.point.rank);
+
+  const mutedText = { color: "var(--text-muted)" };
+
+  return (
+    <div
+      style={{
+        background: "var(--surface-1)",
+        border: "1px solid var(--border)",
+        padding: "8px 12px",
+        fontFamily: "'JetBrains Mono', monospace",
+        fontSize: "11px",
+      }}
+    >
+      <div
+        style={{
+          color: "var(--text)",
+          fontWeight: 600,
+          marginBottom: "6px",
+        }}
+      >
+        {conditionValue}
+      </div>
+      {rows.map(({ point, stroke }) => (
+        <div
+          key={point.model}
+          style={{
+            display: "flex",
+            alignItems: "center",
+            gap: "8px",
+            marginBottom: "2px",
+          }}
+        >
+          {/* color swatch matching the model's line */}
+          <span
+            style={{
+              display: "inline-block",
+              width: 8,
+              height: 8,
+              background: stroke,
+              flexShrink: 0,
+            }}
+          />
+          <span style={{ ...mutedText, width: "16px" }}>
+            #{point.rank}
+          </span>
+          <span style={{ color: "var(--text)" }}>
+            {point.model}
+          </span>
+          <span style={{ ...mutedText, marginLeft: "auto" }}>
+            {(point.avgScore * 100).toFixed(1)}% (n=
+            {point.n})
+          </span>
+        </div>
+      ))}
+    </div>
+  );
+}
+
+/**
+ * Bump chart of model rankings across the values of a selectable condition
+ * axis.
+ *
+ * For the chosen axis every model is ranked by mean eval score within each
+ * axis value (rank 1 = best); one line per model connects its ranks, and a
+ * dashed ring marks each detected rank swap. Renders a placeholder card when
+ * no run has a score, and an explanatory message when the chosen axis has
+ * fewer than two distinct values.
+ *
+ * @param runs - Runs to rank.
+ */
+export default function BumpChart({ runs }: BumpChartProps) {
+  const [selectedAxis, setSelectedAxis] = useState<AxisName>("prompt_style");
+
+  // All hooks run before any early return so their order is stable across
+  // renders.
+  const { ranked, crossings, conditionValues, models } = useMemo(() => {
+    const { ranked, crossings } = computeRankings(runs, selectedAxis);
+    // Same derivation and ordering as computeRankings uses internally.
+    const conditionValues = Array.from(
+      new Set(runs.map((r) => String(r.meta[selectedAxis])))
+    ).sort();
+    // Only models that were ranked at least once get a line.
+    const models = Object.keys(ranked).filter(
+      (m) => ranked[m].length > 0
+    );
+    return { ranked, crossings, conditionValues, models };
+  }, [runs, selectedAxis]);
+
+  // Build a lookup: model -> conditionValue -> RankedPoint
+  const pointLookup = useMemo(() => {
+    const lookup: Record<string, Record<string, RankedPoint>> = {};
+    for (const model of models) {
+      lookup[model] = {};
+      for (const point of ranked[model]) {
+        lookup[model][point.conditionValue] = point;
+      }
+    }
+    return lookup;
+  }, [models, ranked]);
+
+  // Build recharts data: one entry per condition value, with a `<model>: rank`
+  // field for every model ranked at that value.
+  const chartData = useMemo(() => {
+    return conditionValues.map((cv) => {
+      const entry: Record<string, unknown> = { conditionValue: cv };
+      for (const model of models) {
+        const point = pointLookup[model]?.[cv];
+        if (point) {
+          entry[model] = point.rank;
+        }
+      }
+      return entry;
+    });
+  }, [conditionValues, models, pointLookup]);
+
+  const maxRank = models.length;
+
+  // Only existence matters here, so stop at the first scored run.
+  const hasScoredRuns = runs.some(
+    (r) =>
+      r.eval_results?.score !== null && r.eval_results?.score !== undefined
+  );
+
+  if (!hasScoredRuns) {
+    return (
+      <div
+        className="card"
+        style={{
+          textAlign: "center",
+          padding: "40px",
+          color: "var(--text-muted)",
+        }}
+      >
+        No scored runs available for ranking.
+      </div>
+    );
+  }
+
+  return (
+    <div className="card">
+      <div
+        style={{
+          display: "flex",
+          alignItems: "center",
+          justifyContent: "space-between",
+          marginBottom: "16px",
+          flexWrap: "wrap",
+          gap: "12px",
+        }}
+      >
+        <div>
+          <h3 style={{ margin: 0 }}>Model Rankings by Condition</h3>
+          <p
+            style={{
+              color: "var(--text-muted)",
+              fontSize: "0.75rem",
+              margin: "4px 0 0",
+            }}
+          >
+            Rank 1 = best average score. Crossings indicate rank swaps.
+          </p>
+        </div>
+        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
+          {/* htmlFor/id tie the label to the select for assistive tech and click-to-focus */}
+          <label
+            htmlFor="bump-chart-axis"
+            style={{
+              fontSize: "0.75rem",
+              color: "var(--text-muted)",
+            }}
+          >
+            Condition:
+          </label>
+          <select
+            id="bump-chart-axis"
+            value={selectedAxis}
+            onChange={(e) => setSelectedAxis(e.target.value as AxisName)}
+            style={{
+              background: "var(--surface-2)",
+              color: "var(--text)",
+              border: "1px solid var(--border)",
+              padding: "4px 8px",
+              fontFamily: "'JetBrains Mono', monospace",
+              fontSize: "0.75rem",
+              borderRadius: "0",
+              cursor: "pointer",
+            }}
+          >
+            {CONDITION_AXES.map((axis) => (
+              <option key={axis} value={axis}>
+                {AXIS_LABELS[axis]}
+              </option>
+            ))}
+          </select>
+        </div>
+      </div>
+
+      {conditionValues.length < 2 ? (
+        <div
+          style={{
+            textAlign: "center",
+            padding: "40px",
+            color: "var(--text-muted)",
+            fontSize: "0.8rem",
+          }}
+        >
+          Need at least 2 values for "{AXIS_LABELS[selectedAxis]}" to show
+          rankings. Currently only: {conditionValues.join(", ") || "none"}
+        </div>
+      ) : (
+        <>
+          <ResponsiveContainer width="100%" height={300}>
+            <LineChart
+              data={chartData}
+              margin={{ top: 20, right: 60, bottom: 10, left: 10 }}
+            >
+              <CartesianGrid
+                strokeDasharray="3 3"
+                stroke="var(--border)"
+                vertical={false}
+              />
+              <XAxis
+                dataKey="conditionValue"
+                stroke="var(--text-muted)"
+                fontSize={11}
+                fontFamily="'JetBrains Mono', monospace"
+                tickLine={false}
+                axisLine={{ stroke: "var(--border)" }}
+              />
+              {/* reversed so rank 1 sits at the top; half-unit padding keeps end ranks off the edges */}
+              <YAxis
+                domain={[0.5, maxRank + 0.5]}
+                ticks={Array.from({ length: maxRank }, (_, i) => i + 1)}
+                reversed
+                stroke="var(--text-muted)"
+                fontSize={11}
+                fontFamily="'JetBrains Mono', monospace"
+                tickLine={false}
+                axisLine={{ stroke: "var(--border)" }}
+                label={{
+                  value: "Rank",
+                  angle: -90,
+                  position: "insideLeft",
+                  fill: "var(--text-muted)",
+                  fontSize: 11,
+                  fontFamily: "'JetBrains Mono', monospace",
+                }}
+                tickFormatter={(v: number) => `#${v}`}
+              />
+              <Tooltip
+                content={<CustomTooltipContent lookup={pointLookup} />}
+                cursor={{ stroke: "var(--border)", strokeDasharray: "3 3" }}
+              />
+              {models.map((model) => (
+                <Line
+                  key={model}
+                  type="linear"
+                  dataKey={model}
+                  stroke={MODEL_COLORS[model] || FALLBACK_COLOR}
+                  strokeWidth={2.5}
+                  dot={makeRankDot(
+                    model,
+                    MODEL_COLORS[model] || FALLBACK_COLOR,
+                    pointLookup
+                  )}
+                  activeDot={false}
+                  name={model}
+                  connectNulls
+                />
+              ))}
+              {crossings.map((crossing, i) => (
+                <ReferenceDot
+                  key={`crossing-${i}`}
+                  x={crossing.conditionValue}
+                  y={crossing.rank}
+                  r={10}
+                  fill="none"
+                  stroke="var(--yellow)"
+                  strokeWidth={1.5}
+                  strokeDasharray="3 2"
+                />
+              ))}
+            </LineChart>
+          </ResponsiveContainer>
+
+          {/* Legend */}
+          <div
+            style={{
+              display: "flex",
+              alignItems: "center",
+              justifyContent: "center",
+              gap: "20px",
+              marginTop: "12px",
+              flexWrap: "wrap",
+            }}
+          >
+            {models.map((model) => (
+              <div
+                key={model}
+                style={{
+                  display: "flex",
+                  alignItems: "center",
+                  gap: "6px",
+                  fontSize: "0.75rem",
+                  fontFamily: "'JetBrains Mono', monospace",
+                }}
+              >
+                <span
+                  style={{
+                    display: "inline-block",
+                    width: 12,
+                    height: 3,
+                    background: MODEL_COLORS[model] || FALLBACK_COLOR,
+                  }}
+                />
+                <span style={{ color: "var(--text)" }}>{model}</span>
+              </div>
+            ))}
+            {crossings.length > 0 && (
+              <div
+                style={{
+                  display: "flex",
+                  alignItems: "center",
+                  gap: "6px",
+                  fontSize: "0.75rem",
+                  fontFamily: "'JetBrains Mono', monospace",
+                }}
+              >
+                <span
+                  style={{
+                    display: "inline-block",
+                    width: 12,
+                    height: 12,
+                    borderRadius: "50%",
+                    border: "1.5px dashed var(--yellow)",
+                  }}
+                />
+                <span style={{ color: "var(--text-muted)" }}>
+                  rank swap
+                </span>
+              </div>
+            )}
+          </div>
+        </>
+      )}
+    </div>
+  );
+}
diff --git a/dashboard/src/components/ConfigTreemap.tsx b/dashboard/src/components/ConfigTreemap.tsx
@@ -0,0 +1,344 @@
+import React, { useState, useCallback } from "react";
+import { Treemap, ResponsiveContainer, Tooltip } from "recharts";
+import type { TreemapNode } from "recharts/types/chart/Treemap";
+import type { Run, AxisName } from "../lib/data";
+
+/** Props for the ConfigTreemap component. */
+interface ConfigTreemapProps {
+ // Runs to group into treemap tiles.
+ runs: Run[];
+}
+
+// Axes offered in the "Group by" selector. Tiles are always grouped by model
+// first; the selected axis splits each model into leaves.
+const SECONDARY_AXES: AxisName[] = [
+ "prompt_style",
+ "effort",
+ "language",
+ "human_language",
+ "linter",
+ "playwright",
+ "context_file",
+ "sub_agents",
+ "web_search",
+ "max_budget",
+];
+
+/**
+ * Maps an average score (a fraction, compared as a percentage) to a tile
+ * color: grey when there is no score, green above 60%, amber from 30% up to
+ * and including 60%, red otherwise.
+ */
+function scoreColor(avgScore: number | null): string {
+  if (avgScore === null) {
+    return "hsl(213 14% 30%)";
+  }
+  const percent = avgScore * 100;
+  return percent > 60
+    ? "hsl(92 28% 45%)"
+    : percent >= 30
+      ? "hsl(40 71% 50%)"
+      : "hsl(355 52% 48%)";
+}
+
+/** One treemap tile: all runs of a single model with a single value of the secondary axis. */
+interface LeafData {
+ // "<model> / <configValue>"; name and displayName are built from the same template.
+ name: string;
+ displayName: string;
+ // Number of runs in the tile (drives tile area via dataKey="size").
+ size: number;
+ // Mean eval score of the tile's scored runs, or null when none are scored.
+ avgScore: number | null;
+ // avgScore as a whole-number percentage string, or "--" when avgScore is null.
+ avgScorePct: string;
+ model: string;
+ // Stringified value of the secondary axis.
+ configValue: string;
+ // Fill color derived from avgScore via scoreColor.
+ color: string;
+ [key: string]: unknown;
+}
+
+/** Top-level treemap group: one per model, holding that model's leaves. */
+interface GroupData {
+ // Model name.
+ name: string;
+ children: LeafData[];
+ [key: string]: unknown;
+}
+
+/**
+ * Groups runs into the two-level structure the treemap consumes: one group
+ * per model, one leaf per value of `secondaryAxis` within that model.
+ *
+ * Groups and leaves are ordered with `localeCompare`. Each leaf carries its
+ * run count, the mean eval score of its scored runs (null when none are
+ * scored), a display string for that mean and the matching fill color.
+ */
+function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] {
+  // model -> stringified axis value -> runs
+  const grouped: Record<string, Record<string, Run[]>> = {};
+  runs.forEach((run) => {
+    const model = run.meta.model;
+    const secondary = String(run.meta[secondaryAxis]);
+    const perModel = (grouped[model] ??= {});
+    (perModel[secondary] ??= []).push(run);
+  });
+
+  const byKey = ([a]: [string, unknown], [b]: [string, unknown]): number =>
+    a.localeCompare(b);
+
+  const toLeaf = (model: string, configValue: string, configRuns: Run[]) => {
+    // Average only the runs that actually have a score.
+    let total = 0;
+    let scored = 0;
+    for (const run of configRuns) {
+      const score = run.eval_results?.score;
+      if (score === null || score === undefined) continue;
+      total += score;
+      scored += 1;
+    }
+    const avgScore = scored > 0 ? total / scored : null;
+
+    return {
+      name: `${model} / ${configValue}`,
+      displayName: `${model} / ${configValue}`,
+      size: configRuns.length,
+      avgScore,
+      avgScorePct:
+        avgScore !== null ? `${(avgScore * 100).toFixed(0)}%` : "--",
+      model,
+      configValue,
+      color: scoreColor(avgScore),
+    };
+  };
+
+  const modelEntries = Object.entries(grouped).sort(byKey);
+  return modelEntries.map(([model, configs]) => {
+    const configEntries = Object.entries(configs).sort(byKey);
+    return {
+      name: model,
+      children: configEntries.map(([configValue, configRuns]) =>
+        toLeaf(model, configValue, configRuns)
+      ),
+    };
+  });
+}
+
+/**
+ * Custom tile renderer for the treemap.
+ *
+ * Draws a colored rectangle for every leaf and, when the tile is large enough,
+ * up to three centered text lines: the tile name (the full "model / value"
+ * name on wide tiles, otherwise just the config value), the average score, and
+ * the run count. Nodes above leaf depth render as an empty group.
+ *
+ * The text lines are spaced evenly and centered on the tile as a block, so
+ * two-line tiles do not overlap their labels and three-line tiles stay inside
+ * the rectangle.
+ */
+function CustomContent(props: TreemapNode): React.ReactElement {
+  const { x, y, width, height, depth, name } = props;
+
+  // Only render leaf nodes (depth === 2 in a two-level hierarchy via 'flat' type)
+  // depth 1 = model group, depth 2 = leaf
+  if (depth < 2) return <g />;
+
+  // The leaf's own data fields are merged onto the node props at runtime.
+  const leaf = props as unknown as LeafData;
+  const avgScorePct = leaf.avgScorePct ?? "--";
+  const count = leaf.size ?? 0;
+  const color = leaf.color ?? "hsl(213 14% 30%)";
+
+  const showText = width > 50 && height > 36;
+  const showCount = width > 50 && height > 50;
+
+  // Vertical layout: 11px text on a 14px line pitch, centered as a block.
+  // Two lines span 25px (< 36) and three lines span 39px (< 50), so the text
+  // always fits the tile heights that enable it.
+  const lineHeight = 14;
+  const lineCount = showCount ? 3 : 2;
+  const firstLineY = y + height / 2 - ((lineCount - 1) * lineHeight) / 2;
+  const centerX = x + width / 2;
+
+  return (
+    <g>
+      <rect
+        x={x}
+        y={y}
+        width={width}
+        height={height}
+        fill={color}
+        stroke="hsl(213 16% 12%)"
+        strokeWidth={2}
+      />
+      {showText && (
+        <>
+          <text
+            x={centerX}
+            y={firstLineY}
+            textAnchor="middle"
+            dominantBaseline="central"
+            fill="hsl(213 27% 95%)"
+            style={{
+              fontFamily: "'JetBrains Mono', monospace",
+              fontSize: "11px",
+              fontWeight: 600,
+              textTransform: "uppercase",
+            }}
+          >
+            {width > 100 ? name : leaf.configValue}
+          </text>
+          <text
+            x={centerX}
+            y={firstLineY + lineHeight}
+            textAnchor="middle"
+            dominantBaseline="central"
+            fill="hsl(213 27% 95%)"
+            style={{
+              fontFamily: "'JetBrains Mono', monospace",
+              fontSize: "11px",
+              fontWeight: 500,
+              textTransform: "uppercase",
+            }}
+          >
+            {avgScorePct}
+          </text>
+          {showCount && (
+            <text
+              x={centerX}
+              y={firstLineY + 2 * lineHeight}
+              textAnchor="middle"
+              dominantBaseline="central"
+              fill="hsla(213, 27%, 95%, 0.65)"
+              style={{
+                fontFamily: "'JetBrains Mono', monospace",
+                fontSize: "11px",
+                fontWeight: 400,
+                textTransform: "uppercase",
+              }}
+            >
+              n={count}
+            </text>
+          )}
+        </>
+      )}
+    </g>
+  );
+}
+
+/**
+ * Tooltip for treemap tiles: shows the tile's display name, its average score
+ * string and its run count. Renders nothing when the tooltip is inactive, has
+ * no payload, or the hovered node has no `displayName` (i.e. it is not a leaf
+ * built by buildTreeData).
+ */
+function CustomTooltip({
+  active,
+  payload,
+}: {
+  active?: boolean;
+  payload?: Array<{ payload: TreemapNode }>;
+}) {
+  if (!active || !payload?.length) return null;
+
+  const [{ payload: hovered }] = payload;
+  const node = hovered as unknown as LeafData;
+  if (!node.displayName) return null;
+
+  const detailStyle = { color: "hsl(213 14% 65%)" };
+
+  return (
+    <div
+      style={{
+        background: "hsl(217 16% 15.5%)",
+        border: "1px solid hsl(217 17% 28%)",
+        padding: "8px 12px",
+        fontFamily: "'JetBrains Mono', monospace",
+        fontSize: "11px",
+        textTransform: "uppercase",
+        letterSpacing: "0.5px",
+      }}
+    >
+      <div
+        style={{
+          fontWeight: 600,
+          marginBottom: 4,
+          color: "hsl(213 27% 88%)",
+        }}
+      >
+        {node.displayName}
+      </div>
+      <div style={detailStyle}>
+        Score: {node.avgScorePct}
+      </div>
+      <div style={detailStyle}>
+        Runs: {node.size}
+      </div>
+    </div>
+  );
+}
+
+/**
+ * Treemap of run configurations: tiles are grouped by model and split by a
+ * selectable secondary axis; tile area follows run count and tile color
+ * follows average score. Clicking a leaf navigates to "/" with the tile's
+ * model and secondary-axis value as query parameters. Renders a placeholder
+ * card when `runs` is empty.
+ */
+export default function ConfigTreemap({ runs }: ConfigTreemapProps) {
+ const [secondaryAxis, setSecondaryAxis] = useState<AxisName>("prompt_style");
+
+ // Click handler for treemap nodes. Only nodes carrying both `model` and
+ // `configValue` (the leaves built by buildTreeData) trigger navigation.
+ const handleClick = useCallback(
+ (node: TreemapNode) => {
+ const leaf = node as unknown as LeafData;
+ if (leaf.model && leaf.configValue) {
+ const params = new URLSearchParams();
+ params.set("model", leaf.model);
+ // The query key is the currently selected axis name.
+ params.set(secondaryAxis, leaf.configValue);
+ // Full-page navigation via the location object.
+ window.location.href = `/?${params.toString()}`;
+ }
+ },
+ [secondaryAxis],
+ );
+
+ if (runs.length === 0) {
+ return (
+ <div
+ className="card"
+ style={{
+ textAlign: "center",
+ padding: "40px",
+ color: "var(--text-muted)",
+ }}
+ >
+ No data for treemap.
+ </div>
+ );
+ }
+
+ // Rebuilt on every render (not memoized).
+ const treeData = buildTreeData(runs, secondaryAxis);
+
+ // The three legend swatches below use the same color literals and
+ // thresholds as scoreColor.
+ return (
+ <div
+ className="card"
+ style={{
+ background: "var(--surface-1)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ padding: "20px",
+ }}
+ >
+ <div
+ style={{
+ display: "flex",
+ justifyContent: "space-between",
+ alignItems: "center",
+ marginBottom: "16px",
+ }}
+ >
+ <h3 style={{ margin: 0 }}>Configuration Treemap</h3>
+ <div className="filter-group">
+ <label htmlFor="treemap-axis">Group by</label>
+ <select
+ id="treemap-axis"
+ value={secondaryAxis}
+ onChange={(e) => setSecondaryAxis(e.target.value as AxisName)}
+ >
+ {SECONDARY_AXES.map((axis) => (
+ <option key={axis} value={axis}>
+ {axis}
+ </option>
+ ))}
+ </select>
+ </div>
+ </div>
+
+ <div
+ style={{
+ display: "flex",
+ gap: "16px",
+ marginBottom: "12px",
+ fontSize: "11px",
+ fontFamily: "var(--font-mono)",
+ textTransform: "uppercase",
+ letterSpacing: "0.5px",
+ color: "var(--text-muted)",
+ }}
+ >
+ <span>
+ <span
+ style={{
+ display: "inline-block",
+ width: 10,
+ height: 10,
+ background: "hsl(92 28% 45%)",
+ marginRight: 4,
+ verticalAlign: "middle",
+ }}
+ />
+ {">"} 60%
+ </span>
+ <span>
+ <span
+ style={{
+ display: "inline-block",
+ width: 10,
+ height: 10,
+ background: "hsl(40 71% 50%)",
+ marginRight: 4,
+ verticalAlign: "middle",
+ }}
+ />
+ 30-60%
+ </span>
+ <span>
+ <span
+ style={{
+ display: "inline-block",
+ width: 10,
+ height: 10,
+ background: "hsl(355 52% 48%)",
+ marginRight: 4,
+ verticalAlign: "middle",
+ }}
+ />
+ {"<"} 30%
+ </span>
+ </div>
+
+ <ResponsiveContainer width="100%" height={400}>
+ <Treemap
+ data={treeData}
+ dataKey="size"
+ nameKey="name"
+ type="flat"
+ content={CustomContent}
+ onClick={handleClick}
+ isAnimationActive={false}
+ stroke="hsl(213 16% 12%)"
+ >
+ <Tooltip content={<CustomTooltip />} />
+ </Treemap>
+ </ResponsiveContainer>
+ </div>
+ );
+}
diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx
@@ -0,0 +1,236 @@
+import type { Run } from "../lib/data";
+
+/** Props for the CorrelationMatrix component. */
+interface CorrelationMatrixProps {
+ // Runs whose configuration and outcomes are compared.
+ runs: Run[];
+}
+
+// Matrix rows: each `key` is read from run.meta to group runs, `label` is the
+// row heading.
+// NOTE(review): "human_language" is labeled in BumpChart's AXIS_LABELS but is
+// not listed here — confirm the omission is intentional.
+const CONFIG_AXES = [
+ { key: "model", label: "Model" },
+ { key: "effort", label: "Effort" },
+ { key: "prompt_style", label: "Prompt Style" },
+ { key: "language", label: "Language" },
+ { key: "tool_read", label: "Read Tool" },
+ { key: "tool_write", label: "Write Tool" },
+ { key: "tool_edit", label: "Edit Tool" },
+ { key: "tool_glob", label: "Glob Tool" },
+ { key: "tool_grep", label: "Grep Tool" },
+ { key: "linter", label: "Linter" },
+ { key: "playwright", label: "Playwright" },
+ { key: "context_file", label: "Context File" },
+ { key: "sub_agents", label: "Sub-agents" },
+ { key: "web_search", label: "Web Search" },
+ { key: "max_budget", label: "Budget" },
+] as const;
+
+// Pulls one numeric outcome out of a run; null means the run has no value for it.
+type MetricExtractor = (run: Run) => number | null;
+
+// Matrix columns. Every extractor falls back to null via `??` when the value
+// is missing. The last three entries (cost, turns, time) are raw quantities
+// rather than scores and are formatted differently by the component.
+const OUTCOME_METRICS: Array<{ key: string; label: string; extract: MetricExtractor }> = [
+ { key: "overall", label: "Overall", extract: (r) => r.eval_results?.score ?? null },
+ { key: "gameplay", label: "Gameplay", extract: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null },
+ { key: "code", label: "Code", extract: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null },
+ { key: "structural", label: "Structural", extract: (r) => r.eval_results?.structural?.score ?? null },
+ { key: "quality", label: "Quality", extract: (r) => r.eval_results?.quality?.score ?? null },
+ { key: "transcript", label: "Transcript", extract: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null },
+ { key: "cost", label: "Cost", extract: (r) => r.claude_output?.total_cost_usd ?? null },
+ { key: "turns", label: "Turns", extract: (r) => r.claude_output?.num_turns ?? null },
+ { key: "time", label: "Wall Time", extract: (r) => r.meta.wall_time_seconds ?? null },
+];
+
+/**
+ * Measures how much an outcome varies across the values of one config axis.
+ *
+ * Runs are grouped by the stringified value of `run.meta[axisKey]` (a missing
+ * value is grouped under "unknown"); runs for which `extract` returns null are
+ * ignored. The result is the difference between the largest and the smallest
+ * group mean.
+ *
+ * @param runs - Runs to group.
+ * @param axisKey - Name of the metadata field to group by.
+ * @param extract - Reads the outcome value from a run, or null if absent.
+ * @returns The spread of group means, or null when fewer than two groups have
+ *   at least one value.
+ */
+function computeSpread(runs: Run[], axisKey: string, extract: MetricExtractor): number | null {
+  // A Map keeps data-derived keys away from Object.prototype: with a plain
+  // object, an axis value such as "constructor" would resolve to an inherited
+  // member instead of a fresh bucket.
+  const groups = new Map<string, number[]>();
+  for (const run of runs) {
+    const val = extract(run);
+    if (val === null) continue;
+    const groupKey = String((run.meta as Record<string, unknown>)[axisKey] ?? "unknown");
+    const bucket = groups.get(groupKey);
+    if (bucket) {
+      bucket.push(val);
+    } else {
+      groups.set(groupKey, [val]);
+    }
+  }
+
+  // A spread needs at least two groups to compare.
+  if (groups.size < 2) return null;
+
+  const means = Array.from(
+    groups.values(),
+    (vals) => vals.reduce((a, b) => a + b, 0) / vals.length
+  );
+
+  return Math.max(...means) - Math.min(...means);
+}
+
+/**
+ * Table of effect sizes: one row per configuration axis, one column per
+ * outcome metric, each cell showing the spread (max group mean minus min group
+ * mean) of that outcome across the axis's values.
+ *
+ * The outcome columns use different units (0-1 scores, dollars, turns,
+ * seconds), so spreads are only compared within a column: a cell's color
+ * intensity is its spread relative to the largest spread in the same column,
+ * and rows are ordered by their strongest relative spread (ties broken by the
+ * average relative spread). Cells without a computable spread show "--".
+ * Renders a placeholder card when `runs` is empty.
+ */
+export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) {
+  if (runs.length === 0) {
+    return (
+      <div
+        className="card"
+        style={{
+          textAlign: "center",
+          padding: "40px",
+          color: "var(--text-muted)",
+        }}
+      >
+        No data available for correlation analysis.
+      </div>
+    );
+  }
+
+  // Raw spreads: rows = config axes, columns = metrics.
+  const rawRows = CONFIG_AXES.map((axis) => ({
+    key: axis.key as string,
+    label: axis.label as string,
+    spreads: OUTCOME_METRICS.map((metric) =>
+      computeSpread(runs, axis.key, metric.extract)
+    ),
+  }));
+
+  // Largest spread per column; the scale each cell in that column is measured
+  // against. Stays 0 for a column with no computable spread.
+  const columnMax = OUTCOME_METRICS.map((_, col) =>
+    rawRows.reduce((max, row) => Math.max(max, row.spreads[col] ?? 0), 0)
+  );
+
+  // Spread as a 0..1 fraction of its column's maximum (0 when the column has
+  // no positive spread, which also avoids dividing by zero).
+  const relativeSpread = (spread: number, col: number): number =>
+    columnMax[col] > 0 ? Math.min(spread / columnMax[col], 1) : 0;
+
+  const matrix = rawRows.map((row) => {
+    const relatives = row.spreads.flatMap((spread, col) =>
+      spread === null ? [] : [relativeSpread(spread, col)]
+    );
+    const peak = relatives.length > 0 ? Math.max(...relatives) : 0;
+    const mean =
+      relatives.length > 0
+        ? relatives.reduce((a, b) => a + b, 0) / relatives.length
+        : 0;
+    return { ...row, peak, mean };
+  });
+
+  // Most impactful variable first. Every column's top row has peak 1, so the
+  // mean relative spread decides between rows that tie on peak.
+  matrix.sort((a, b) => b.peak - a.peak || b.mean - a.mean);
+
+  return (
+    <div className="card" style={{ padding: "20px" }}>
+      <h3 style={{ marginBottom: "4px" }}>Variable Impact Matrix</h3>
+      <p
+        style={{
+          color: "var(--text-muted)",
+          fontSize: "0.75rem",
+          marginBottom: "16px",
+        }}
+      >
+        Effect size (spread) of each configuration variable on each outcome.
+        Sorted by maximum impact. Stronger color = larger effect.
+      </p>
+
+      <div style={{ overflowX: "auto" }}>
+        <table
+          style={{
+            borderCollapse: "collapse",
+            width: "auto",
+            minWidth: "100%",
+          }}
+        >
+          <thead>
+            <tr>
+              <th
+                style={{
+                  padding: "6px 12px",
+                  fontSize: "11px",
+                  textAlign: "right",
+                  background: "var(--surface-2)",
+                  borderBottom: "1px solid var(--border)",
+                  borderRight: "1px solid var(--border)",
+                  position: "sticky",
+                  left: 0,
+                  zIndex: 1,
+                }}
+              >
+                Variable
+              </th>
+              {OUTCOME_METRICS.map((metric) => (
+                <th
+                  key={metric.key}
+                  style={{
+                    padding: "6px 8px",
+                    fontSize: "11px",
+                    textAlign: "center",
+                    background: "var(--surface-2)",
+                    borderBottom: "1px solid var(--border)",
+                    fontFamily: "var(--font-mono)",
+                    fontWeight: 500,
+                    color: "var(--text-muted)",
+                    textTransform: "uppercase",
+                    letterSpacing: "0.5px",
+                    whiteSpace: "nowrap",
+                  }}
+                >
+                  {metric.label}
+                </th>
+              ))}
+            </tr>
+          </thead>
+          <tbody>
+            {matrix.map((row) => (
+              <tr key={row.key} style={{ background: "transparent" }}>
+                <td
+                  style={{
+                    padding: "5px 12px",
+                    fontSize: "11px",
+                    fontFamily: "var(--font-mono)",
+                    fontWeight: 500,
+                    textAlign: "right",
+                    whiteSpace: "nowrap",
+                    borderBottom: "1px solid var(--border)",
+                    borderRight: "1px solid var(--border)",
+                    background: "var(--surface-1)",
+                    position: "sticky",
+                    left: 0,
+                    zIndex: 1,
+                  }}
+                >
+                  {row.label}
+                </td>
+                {row.spreads.map((spread, i) => {
+                  if (spread === null) {
+                    return (
+                      <td
+                        key={OUTCOME_METRICS[i].key}
+                        style={{
+                          padding: "5px 8px",
+                          textAlign: "center",
+                          fontSize: "11px",
+                          fontFamily: "var(--font-mono)",
+                          color: "var(--text-muted)",
+                          borderBottom: "1px solid var(--border)",
+                        }}
+                      >
+                        --
+                      </td>
+                    );
+                  }
+
+                  // Intensity is relative to this column only; 0.05 keeps even
+                  // the weakest cell faintly tinted.
+                  const opacity = relativeSpread(spread, i) * 0.7 + 0.05;
+                  const isScoreMetric = !["cost", "turns", "time"].includes(
+                    OUTCOME_METRICS[i].key
+                  );
+                  // Scores as percentage points, cost in dollars, time in
+                  // whole seconds, anything else (turns) with one decimal.
+                  const displayValue = isScoreMetric
+                    ? `${(spread * 100).toFixed(1)}%`
+                    : OUTCOME_METRICS[i].key === "cost"
+                      ? `$${spread.toFixed(2)}`
+                      : OUTCOME_METRICS[i].key === "time"
+                        ? `${Math.round(spread)}s`
+                        : spread.toFixed(1);
+
+                  return (
+                    <td
+                      key={OUTCOME_METRICS[i].key}
+                      style={{
+                        padding: "5px 8px",
+                        textAlign: "center",
+                        fontSize: "11px",
+                        fontFamily: "var(--font-mono)",
+                        fontWeight: 600,
+                        color: "var(--text)",
+                        borderBottom: "1px solid var(--border)",
+                        background: `rgba(136, 192, 208, ${opacity})`,
+                      }}
+                    >
+                      {displayValue}
+                    </td>
+                  );
+                })}
+              </tr>
+            ))}
+          </tbody>
+        </table>
+      </div>
+    </div>
+  );
+}
diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx
@@ -0,0 +1,489 @@
+import { useState, useMemo } from "react";
+import {
+ ScatterChart,
+ Scatter,
+ XAxis,
+ YAxis,
+ CartesianGrid,
+ Tooltip,
+ ResponsiveContainer,
+} from "recharts";
+import type { Run } from "../lib/data";
+
+interface EfficiencyFrontierProps { // Props accepted by the EfficiencyFrontier component.
+ runs: Run[]; // runs to plot; they are grouped by meta.cell_id before charting
+}
+
+const MODEL_COLORS: Record<string, string> = { // model-family substring -> dot and legend color
+ haiku: "hsl(193 44% 67%)",
+ sonnet: "hsl(40 71% 73%)",
+ opus: "hsl(311 24% 63%)",
+};
+// Color returned by getModelColor when no MODEL_COLORS key occurs in the model name.
+const DEFAULT_COLOR = "hsl(213 14% 65%)";
+
+interface ConfigPoint { // One aggregated config cell, plotted as a single scatter dot.
+ cell_id: string; // grouping key, taken from run.meta.cell_id
+ model: string; // model of the first run seen for this cell
+ avgCost: number; // mean claude_output.total_cost_usd over the runs that report one
+ avgScore: number; // mean eval_results.score over the runs that report one
+ runCount: number; // number of scored runs (length of the collected score list)
+ config: Record<string, string>; // axis name -> value snapshot, listed in the tooltip
+ isFrontier: boolean; // true when the point lies on the Pareto frontier
+ label: string; // frontier chip text from findKeyDifference; "" for other points
+}
+
+// Case-insensitive substring lookup of a model name in MODEL_COLORS; the
+// first matching family wins and unknown models fall back to DEFAULT_COLOR.
+function getModelColor(model: string): string {
+  const needle = model.toLowerCase();
+  const hit = Object.entries(MODEL_COLORS).find(([family]) => needle.includes(family));
+  return hit ? hit[1] : DEFAULT_COLOR;
+}
+
+/**
+ * Group runs by `meta.cell_id` and reduce each group to one ConfigPoint.
+ * The model and config snapshot come from the first run seen for a cell.
+ * A run without a score (or without a cost) still opens its group but adds
+ * nothing to that average; groups that end up with no score or no cost are
+ * dropped. `runCount` counts scored runs only. `isFrontier` and `label` are
+ * returned as placeholders (false / "").
+ */
+function aggregateByConfig(runs: Run[]): ConfigPoint[] {
+  type Bucket = {
+    scores: number[];
+    costs: number[];
+    model: string;
+    config: Record<string, string>;
+  };
+  const buckets: Record<string, Bucket> = {};
+  const mean = (xs: number[]) => xs.reduce((sum, x) => sum + x, 0) / xs.length;
+
+  runs.forEach(({ meta, eval_results, claude_output }) => {
+    let bucket = buckets[meta.cell_id];
+    if (!bucket) {
+      const { model, effort, prompt_style, language, linter } = meta;
+      const { playwright, context_file, sub_agents, web_search, max_budget } = meta;
+      bucket = {
+        scores: [],
+        costs: [],
+        model,
+        // Key order is preserved: the tooltip lists config entries in this order.
+        config: {
+          model, effort, prompt_style, language, linter,
+          playwright, context_file, sub_agents, web_search, max_budget,
+        },
+      };
+      buckets[meta.cell_id] = bucket;
+    }
+    const score = eval_results?.score;
+    if (score != null) bucket.scores.push(score);
+    const cost = claude_output?.total_cost_usd;
+    if (cost != null) bucket.costs.push(cost);
+  });
+
+  return Object.entries(buckets)
+    .filter(([, b]) => b.scores.length > 0 && b.costs.length > 0)
+    .map(([cell_id, b]) => ({
+      cell_id,
+      model: b.model,
+      avgCost: mean(b.costs),
+      avgScore: mean(b.scores),
+      runCount: b.scores.length,
+      config: b.config,
+      isFrontier: false,
+      label: "",
+    }));
+}
+
+/**
+ * Return the Pareto-optimal subset of `points`, ordered by ascending avgCost.
+ *
+ * A point is dominated when another point is at least as good on both axes
+ * (score no lower, cost no higher) and strictly better on at least one.
+ * Points tied on both axes do not dominate each other, so all of them stay.
+ * The input array itself is left untouched.
+ */
+function computeParetoFrontier(points: ConfigPoint[]): ConfigPoint[] {
+  const dominates = (q: ConfigPoint, p: ConfigPoint): boolean =>
+    q !== p &&
+    q.avgScore >= p.avgScore &&
+    q.avgCost <= p.avgCost &&
+    (q.avgScore > p.avgScore || q.avgCost < p.avgCost);
+
+  const survivors = points.filter(
+    (candidate) => !points.some((rival) => dominates(rival, candidate))
+  );
+  // filter() produced a fresh array, so sorting in place is safe.
+  survivors.sort((lhs, rhs) => lhs.avgCost - rhs.avgCost);
+  return survivors;
+}
+
+/**
+ * Build a short label for `point`: its model plus the config axis on which
+ * it is most unusual among `allPoints`.
+ *
+ * For each config key except "model", the share of points holding the same
+ * value as `point` is computed; the key with the smallest share below 100%
+ * wins (the earliest key wins ties). Missing or empty values are normalised
+ * to "" for both counting and lookup, so an axis that is unset on every
+ * point is no longer mistaken for a unique value.
+ *
+ * @returns "model / key=value", or just the model when no axis qualifies.
+ */
+function findKeyDifference(
+  point: ConfigPoint,
+  allPoints: ConfigPoint[]
+): string {
+  const normalise = (value: string | undefined): string => value || "";
+  const total = allPoints.length;
+  let bestKey = "";
+  let bestRarity = Infinity;
+
+  for (const key of Object.keys(point.config)) {
+    if (key === "model") continue;
+    const own = normalise(point.config[key]);
+    const sharing = allPoints.filter((p) => normalise(p.config[key]) === own);
+    const rarity = sharing.length / total;
+    if (rarity < bestRarity && rarity < 1) {
+      bestRarity = rarity;
+      bestKey = key;
+    }
+  }
+
+  const { model } = point.config;
+  return bestKey ? `${model} / ${bestKey}=${point.config[bestKey]}` : model;
+}
+
+interface TooltipPayloadEntry { // minimal shape CustomTooltip reads from one tooltip payload item
+ payload?: ConfigPoint; // the point attached to the entry (presumably the hovered dot - confirm against recharts)
+}
+
+/**
+ * Tooltip content for a scatter dot. Renders nothing unless the tooltip is
+ * active and its first payload entry carries a ConfigPoint; otherwise shows
+ * the cell id (in the model color), average score and cost, run count, a
+ * badge for frontier points, and every key/value pair of the config.
+ */
+function CustomTooltip({
+  active,
+  payload,
+}: {
+  active?: boolean;
+  payload?: TooltipPayloadEntry[];
+}) {
+  const entries = active ? payload : undefined;
+  const point = entries && entries.length > 0 ? entries[0]?.payload : undefined;
+  if (!point) return null;
+  const { cell_id, model, avgScore, avgCost, runCount, isFrontier, config } = point;
+  const muted = { color: "var(--text-muted)" };
+  const strong = { fontWeight: 600 };
+  const row = { marginBottom: "6px" };
+  const title = { fontWeight: 600, marginBottom: "8px", fontSize: "12px" };
+
+  return (
+    <div
+      style={{
+        background: "var(--surface-1)",
+        border: "1px solid var(--border)",
+        padding: "12px",
+        fontFamily: "'JetBrains Mono', monospace",
+        fontSize: "11px",
+        color: "var(--text)",
+        maxWidth: "300px",
+      }}
+    >
+      <div style={{ ...title, color: getModelColor(model) }}>{cell_id}</div>
+      <div style={row}>
+        <span style={muted}>score: </span>
+        <span style={strong}>{(avgScore * 100).toFixed(1)}%</span>
+      </div>
+      <div style={row}>
+        <span style={muted}>cost: </span>
+        <span style={strong}>${avgCost.toFixed(2)}</span>
+      </div>
+      <div style={{ marginBottom: "8px" }}>
+        <span style={muted}>runs: </span>
+        <span>{runCount}</span>
+      </div>
+      {isFrontier && (
+        <div
+          style={{
+            color: "var(--green)",
+            fontWeight: 600,
+            fontSize: "10px",
+            textTransform: "uppercase",
+            letterSpacing: "1px",
+            marginBottom: "8px",
+          }}
+        >
+          Pareto Frontier
+        </div>
+      )}
+      <div
+        style={{
+          borderTop: "1px solid var(--border)",
+          paddingTop: "8px",
+          display: "grid",
+          gridTemplateColumns: "auto 1fr",
+          gap: "2px 8px",
+        }}
+      >
+        {Object.entries(config).map(([key, val]) => (
+          <div key={key} style={{ display: "contents" }}>
+            <span style={muted}>{key}:</span>
+            <span>{val}</span>
+          </div>
+        ))}
+      </div>
+    </div>
+  );
+}
+
+/** Scatter of average cost vs average score per config cell, with the Pareto
+ *  frontier ringed and joined; hovering a frontier chip highlights its dot. */
+export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
+  const [hoveredId, setHoveredId] = useState<string | null>(null);
+
+  const points = useMemo(() => {
+    const raw = aggregateByConfig(runs);
+    const frontier = computeParetoFrontier(raw);
+    const frontierIds = new Set(frontier.map((p) => p.cell_id));
+
+    return raw.map((p) => ({
+      ...p,
+      isFrontier: frontierIds.has(p.cell_id),
+      label: frontierIds.has(p.cell_id) ? findKeyDifference(p, raw) : "",
+    }));
+  }, [runs]);
+
+  if (points.length === 0) {
+    return (
+      <div
+        className="card"
+        style={{
+          textAlign: "center",
+          padding: "40px",
+          color: "var(--text-muted)",
+        }}
+      >
+        Not enough data to compute efficiency frontier.
+      </div>
+    );
+  }
+
+  const frontierPoints = points
+    .filter((p) => p.isFrontier)
+    .sort((a, b) => a.avgCost - b.avgCost); // sorted once; the line and the chips reuse this order
+  const nonFrontierPoints = points.filter((p) => !p.isFrontier);
+
+  // Custom shape for non-frontier dots (small, dimmed)
+  const nonFrontierShape = (props: {
+    cx?: number;
+    cy?: number;
+    payload?: ConfigPoint;
+  }) => {
+    const { cx, cy, payload } = props;
+    if (cx == null || cy == null || !payload) return null;
+    const color = getModelColor(payload.model);
+    const opacity =
+      hoveredId === null ? 0.4 : hoveredId === payload.cell_id ? 1 : 0.2;
+    return (
+      <circle
+        cx={cx}
+        cy={cy}
+        r={5}
+        fill={color}
+        fillOpacity={opacity}
+        stroke="none"
+        style={{ cursor: "pointer", transition: "fill-opacity 0.15s" }}
+      />
+    );
+  };
+
+  // Custom shape for frontier dots (large, prominent, green ring)
+  const frontierShape = (props: {
+    cx?: number;
+    cy?: number;
+    payload?: ConfigPoint;
+  }) => {
+    const { cx, cy, payload } = props;
+    if (cx == null || cy == null || !payload) return null;
+    const color = getModelColor(payload.model);
+    const opacity =
+      hoveredId === null ? 1 : hoveredId === payload.cell_id ? 1 : 0.5;
+    return (
+      <circle
+        cx={cx}
+        cy={cy}
+        r={9}
+        fill={color}
+        fillOpacity={opacity}
+        stroke="hsl(92 28% 65%)"
+        strokeWidth={2}
+        style={{ cursor: "pointer", transition: "fill-opacity 0.15s" }}
+      />
+    );
+  };
+
+  return (
+    <div className="card">
+      <h3 style={{ marginBottom: "4px" }}>Efficiency Frontier</h3>
+      <p
+        style={{
+          color: "var(--text-muted)",
+          fontSize: "11px",
+          marginBottom: "16px",
+        }}
+      >
+        Cost vs score per config. Pareto frontier highlights configs not
+        dominated on both axes.
+      </p>
+
+      {/* Legend */}
+      <div
+        style={{
+          display: "flex",
+          gap: "16px",
+          marginBottom: "12px",
+          fontSize: "11px",
+          color: "var(--text-muted)",
+          flexWrap: "wrap",
+        }}
+      >
+        {Object.entries(MODEL_COLORS).map(([model, color]) => (
+          <div
+            key={model}
+            style={{ display: "flex", alignItems: "center", gap: "6px" }}
+          >
+            <div
+              style={{
+                width: "8px",
+                height: "8px",
+                background: color,
+              }}
+            />
+            <span>{model}</span>
+          </div>
+        ))}
+        <div style={{ display: "flex", alignItems: "center", gap: "6px" }}>
+          <div
+            style={{
+              width: "12px",
+              height: "12px",
+              border: "2px solid hsl(92 28% 65%)",
+              background: "transparent",
+            }}
+          />
+          <span>frontier</span>
+        </div>
+      </div>
+
+      <ResponsiveContainer width="100%" height={420}>
+        <ScatterChart margin={{ top: 20, right: 30, bottom: 20, left: 20 }}>
+          <CartesianGrid strokeDasharray="3 3" stroke="var(--border)" />
+          <XAxis
+            dataKey="avgCost"
+            type="number"
+            name="Avg Cost"
+            stroke="var(--text-muted)"
+            fontSize={11}
+            fontFamily="'JetBrains Mono', monospace"
+            tickFormatter={(v: number) => `$${v.toFixed(2)}`}
+            label={{
+              value: "Avg Cost ($)",
+              position: "insideBottom",
+              offset: -10,
+              fill: "var(--text-muted)",
+              fontSize: 11,
+              fontFamily: "'JetBrains Mono', monospace",
+            }}
+          />
+          <YAxis
+            dataKey="avgScore"
+            type="number"
+            name="Avg Score"
+            stroke="var(--text-muted)"
+            fontSize={11}
+            fontFamily="'JetBrains Mono', monospace"
+            domain={[0, 1]}
+            tickFormatter={(v: number) => `${(v * 100).toFixed(0)}%`}
+            label={{
+              value: "Avg Score (%)",
+              angle: -90,
+              position: "insideLeft",
+              offset: 0,
+              fill: "var(--text-muted)",
+              fontSize: 11,
+              fontFamily: "'JetBrains Mono', monospace",
+            }}
+          />
+          <Tooltip content={<CustomTooltip />} cursor={false} />
+
+          {/* Non-frontier points (dimmed) */}
+          <Scatter
+            name="configs"
+            data={nonFrontierPoints}
+            shape={nonFrontierShape}
+            isAnimationActive={false}
+            legendType="none"
+          />
+
+          {/* Frontier points (prominent) with connecting line */}
+          <Scatter
+            name="frontier"
+            data={frontierPoints}
+            shape={frontierShape}
+            isAnimationActive={false}
+            legendType="none"
+            line={{ stroke: "hsl(92 28% 65%)", strokeWidth: 1.5, strokeDasharray: "6 3" }}
+            lineType="joint"
+            lineJointType="linear"
+          />
+        </ScatterChart>
+      </ResponsiveContainer>
+
+      {/* Frontier labels below the chart; hovering one sets hoveredId */}
+      {frontierPoints.length > 0 && (
+        <div
+          style={{
+            marginTop: "12px",
+            display: "flex",
+            flexWrap: "wrap",
+            gap: "8px",
+          }}
+        >
+          {frontierPoints.map((point) => (
+            <div
+              key={point.cell_id}
+              onMouseEnter={() => setHoveredId(point.cell_id)}
+              onMouseLeave={() => setHoveredId(null)}
+              style={{
+                padding: "4px 8px",
+                background: "var(--surface-2)",
+                border: "1px solid var(--border)",
+                fontSize: "10px",
+                fontFamily: "'JetBrains Mono', monospace",
+                color: getModelColor(point.model),
+                cursor: "default",
+                transition: "border-color 0.15s",
+                borderColor:
+                  hoveredId === point.cell_id
+                    ? "hsl(92 28% 65%)"
+                    : "var(--border)",
+              }}
+            >
+              {point.label}
+              <span
+                style={{ color: "var(--text-muted)", marginLeft: "8px" }}
+              >
+                ${point.avgCost.toFixed(2)} /{" "}
+                {(point.avgScore * 100).toFixed(0)}%
+              </span>
+            </div>
+          ))}
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/dashboard/src/components/Filters.tsx b/dashboard/src/components/Filters.tsx
@@ -13,6 +13,11 @@ const AXIS_LABELS: Record<AxisName, string> = {
prompt_style: "Prompt",
language: "Language",
human_language: "Human Lang",
+ tool_read: "Tool: Read",
+ tool_write: "Tool: Write",
+ tool_edit: "Tool: Edit",
+ tool_glob: "Tool: Glob",
+ tool_grep: "Tool: Grep",
linter: "Linter",
playwright: "Playwright",
context_file: "Context",
diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx
@@ -0,0 +1,311 @@
+import { useState, useMemo } from "react";
+import type { Run, AxisName } from "../lib/data";
+import { AXIS_NAMES } from "../lib/data";
+
+interface HeatmapMatrixProps { // Props accepted by the HeatmapMatrix component.
+ runs: Run[]; // runs to aggregate; only runs with an eval_results.score contribute
+}
+
+const AXIS_LABELS: Record<AxisName, string> = { // display label per axis (selector options and table corner)
+ model: "Model",
+ effort: "Effort",
+ prompt_style: "Prompt Style",
+ language: "Language",
+ human_language: "Human Lang",
+ tool_read: "Tool: Read",
+ tool_write: "Tool: Write",
+ tool_edit: "Tool: Edit",
+ tool_glob: "Tool: Glob",
+ tool_grep: "Tool: Grep",
+ linter: "Linter",
+ playwright: "Playwright",
+ context_file: "Context File",
+ sub_agents: "Sub-agents",
+ web_search: "Web Search",
+ max_budget: "Max Budget",
+};
+
+interface CellData { // running aggregate for one (row value, column value) cell
+ totalScore: number; // sum of eval_results.score over the cell's scored runs
+ count: number; // number of scored runs in the cell
+}
+
+/**
+ * Map a percentage to a color on a red (0) -> yellow (50) -> green (100)
+ * ramp, interpolating hue, saturation and lightness linearly in each half.
+ * The input is clamped to [0, 100] so out-of-range values cannot yield
+ * negative hues or saturation/lightness outside the valid range.
+ *
+ * @returns a CSS `hsl(H S% L%)` string (space-separated syntax).
+ */
+function scoreToColor(pct: number): string {
+  const p = Math.min(100, Math.max(0, pct));
+  const lerp = (from: number, to: number, t: number) => from + t * (to - from);
+  if (p <= 50) {
+    const t = p / 50;
+    // Hue climbs 355 -> 400 and wraps modulo 360, reaching 40 the short way.
+    return `hsl(${lerp(355, 400, t) % 360} ${lerp(52, 71, t)}% ${lerp(64, 73, t)}%)`;
+  }
+  const t = (p - 50) / 50;
+  return `hsl(${lerp(40, 92, t)} ${lerp(71, 28, t)}% ${lerp(73, 65, t)}%)`;
+}
+
+// Same color as scoreToColor(pct), rewritten as hsla(...) with 0.18 alpha for cell fills.
+function cellBackground(pct: number): string {
+  const withAlphaFn = scoreToColor(pct).replace("hsl(", "hsla(");
+  return withAlphaFn.replace(")", " / 0.18)");
+}
+
+export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) { // Table of mean score for any two user-selected config axes.
+ const [rowAxis, setRowAxis] = useState<AxisName>("model");
+ const [colAxis, setColAxis] = useState<AxisName>("prompt_style");
+ // Bucket scored runs by (row value, column value); recomputed when runs or either axis changes.
+ const { rowValues, colValues, cells } = useMemo(() => {
+ const cellMap: Record<string, Record<string, CellData>> = {};
+ const rowSet = new Set<string>();
+ const colSet = new Set<string>();
+
+ for (const run of runs) {
+ const score = run.eval_results?.score;
+ if (score === null || score === undefined) continue; // unscored runs are ignored entirely
+
+ const rv = String(run.meta[rowAxis]); // stringified so any axis value can serve as an object key
+ const cv = String(run.meta[colAxis]);
+
+ rowSet.add(rv);
+ colSet.add(cv);
+
+ if (!cellMap[rv]) cellMap[rv] = {};
+ if (!cellMap[rv][cv]) cellMap[rv][cv] = { totalScore: 0, count: 0 };
+
+ cellMap[rv][cv].totalScore += score;
+ cellMap[rv][cv].count += 1;
+ }
+
+ return {
+ rowValues: Array.from(rowSet).sort(), // default string ordering
+ colValues: Array.from(colSet).sort(),
+ cells: cellMap,
+ };
+ }, [runs, rowAxis, colAxis]);
+
+ const selectorStyle: React.CSSProperties = {
+ background: "var(--surface-2)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ color: "var(--text)",
+ fontFamily: "var(--font-mono)",
+ fontSize: "var(--text-ui)",
+ padding: "6px 10px",
+ textTransform: "uppercase" as const,
+ letterSpacing: "0.5px",
+ };
+
+ const labelStyle: React.CSSProperties = {
+ fontSize: "var(--text-label)",
+ color: "var(--text-muted)",
+ textTransform: "uppercase" as const,
+ letterSpacing: "1px",
+ fontWeight: 500,
+ fontFamily: "var(--font-mono)",
+ };
+
+ return (
+ <div
+ style={{
+ background: "var(--surface-1)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ padding: "20px",
+ }}
+ >
+ {/* Axis selectors: both lists offer every AXIS_NAMES entry */}
+ <div
+ style={{
+ display: "flex",
+ gap: "24px",
+ marginBottom: "20px",
+ flexWrap: "wrap",
+ alignItems: "flex-end",
+ }}
+ >
+ <div style={{ display: "flex", flexDirection: "column", gap: "4px" }}>
+ <label style={labelStyle}>Row Axis</label>
+ <select
+ value={rowAxis}
+ onChange={(e) => setRowAxis(e.target.value as AxisName)}
+ style={selectorStyle}
+ >
+ {AXIS_NAMES.map((axis) => (
+ <option key={axis} value={axis}>
+ {AXIS_LABELS[axis]}
+ </option>
+ ))}
+ </select>
+ </div>
+ <div style={{ display: "flex", flexDirection: "column", gap: "4px" }}>
+ <label style={labelStyle}>Column Axis</label>
+ <select
+ value={colAxis}
+ onChange={(e) => setColAxis(e.target.value as AxisName)}
+ style={selectorStyle}
+ >
+ {AXIS_NAMES.map((axis) => (
+ <option key={axis} value={axis}>
+ {AXIS_LABELS[axis]}
+ </option>
+ ))}
+ </select>
+ </div>
+ </div>
+
+ {/* Heatmap table, or a placeholder when no scored run exists */}
+ {rowValues.length === 0 || colValues.length === 0 ? (
+ <div
+ style={{
+ textAlign: "center",
+ padding: "40px",
+ color: "var(--text-muted)",
+ fontFamily: "var(--font-mono)",
+ }}
+ >
+ No scored runs available for this axis combination.
+ </div>
+ ) : (
+ <div style={{ overflowX: "auto" }}>
+ <table
+ style={{
+ borderCollapse: "collapse",
+ width: "auto",
+ fontFamily: "var(--font-mono)",
+ }}
+ >
+ <thead>
+ <tr>
+ <th
+ style={{
+ padding: "8px 12px",
+ fontSize: "var(--text-label)",
+ textTransform: "uppercase",
+ letterSpacing: "1px",
+ fontWeight: 500,
+ color: "var(--text-muted)",
+ background: "var(--surface-2)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ textAlign: "left",
+ }}
+ >
+ {AXIS_LABELS[rowAxis]} \ {AXIS_LABELS[colAxis]}
+ </th>
+ {colValues.map((col) => (
+ <th
+ key={col}
+ style={{
+ padding: "8px 12px",
+ fontSize: "var(--text-label)",
+ textTransform: "uppercase",
+ letterSpacing: "1px",
+ fontWeight: 500,
+ color: "var(--text-muted)",
+ background: "var(--surface-2)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ textAlign: "center",
+ fontFamily: "var(--font-mono)",
+ }}
+ >
+ {col}
+ </th>
+ ))}
+ </tr>
+ </thead>
+ <tbody>
+ {rowValues.map((row) => (
+ <tr key={row}>
+ <td
+ style={{
+ padding: "8px 12px",
+ fontSize: "var(--text-label)",
+ textTransform: "uppercase",
+ letterSpacing: "1px",
+ fontWeight: 600,
+ fontFamily: "var(--font-mono)",
+ color: "var(--text)",
+ background: "var(--surface-2)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ whiteSpace: "nowrap",
+ }}
+ >
+ {row}
+ </td>
+ {colValues.map((col) => {
+ const cell = cells[row]?.[col]; // undefined when no scored run has this combination
+ if (!cell) {
+ return (
+ <td
+ key={col}
+ style={{
+ padding: "10px 16px",
+ textAlign: "center",
+ color: "var(--text-muted)",
+ fontFamily: "var(--font-mono)",
+ fontSize: "var(--text-ui)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ background: "var(--surface-0)",
+ }}
+ >
+ -
+ </td>
+ );
+ }
+
+ const avg = cell.totalScore / cell.count; // count is at least 1 for any cell that exists
+ const pct = avg * 100;
+
+ return (
+ <td
+ key={col}
+ style={{
+ padding: "10px 16px",
+ textAlign: "center",
+ fontFamily: "var(--font-mono)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ background: cellBackground(pct),
+ }}
+ >
+ <div
+ style={{
+ fontSize: "var(--text-ui)",
+ fontWeight: 700,
+ color: scoreToColor(pct),
+ lineHeight: 1.3,
+ }}
+ >
+ {pct.toFixed(0)}%
+ </div>
+ <div
+ style={{
+ fontSize: "var(--text-label)",
+ fontWeight: 400,
+ color: "var(--text-muted)",
+ lineHeight: 1.3,
+ }}
+ >
+ n={cell.count}
+ </div>
+ </td>
+ );
+ })}
+ </tr>
+ ))}
+ </tbody>
+ </table>
+ </div>
+ )}
+ </div>
+ );
+}
diff --git a/dashboard/src/components/RadarComparison.tsx b/dashboard/src/components/RadarComparison.tsx
@@ -0,0 +1,329 @@
+import { useState, useMemo } from "react";
+import {
+ RadarChart,
+ Radar,
+ PolarGrid,
+ PolarAngleAxis,
+ PolarRadiusAxis,
+ ResponsiveContainer,
+ Tooltip,
+} from "recharts";
+import type { Run } from "../lib/data";
+
+interface RadarComparisonProps { // Props accepted by the RadarComparison component.
+ runs: Run[]; // runs to compare; grouped by meta.cell_id into selectable configs
+}
+
+const DIMENSIONS = [ // eval_results sections plotted as radar axes, in display order
+ "structural",
+ "functional",
+ "quality",
+ "code_analysis",
+ "gameplay_bot",
+ "transcript_analysis",
+] as const;
+// Union of the string literals listed in DIMENSIONS.
+type Dimension = (typeof DIMENSIONS)[number];
+
+const DIMENSION_LABELS: Record<Dimension, string> = { // axis label shown for each dimension
+ structural: "Structural",
+ functional: "Functional",
+ quality: "Quality",
+ code_analysis: "Code Analysis",
+ gameplay_bot: "Gameplay Bot",
+ transcript_analysis: "Transcript",
+};
+// Series colors for Config A / Config B (radar stroke and fill, tick score text, legend).
+const COLOR_A = "hsl(193 44% 67%)";
+const COLOR_B = "hsl(40 71% 73%)";
+
+// Score of one quality dimension for a run: `eval_results[dim].score` when
+// that section is an object with a numeric `score`; null in every other case.
+function extractDimensionScore(run: Run, dim: Dimension): number | null {
+  if (!run.eval_results) return null;
+  const section = run.eval_results[dim as keyof typeof run.eval_results];
+  if (!section || typeof section !== "object" || !("score" in section)) return null;
+  const { score } = section as { score: number };
+  return typeof score === "number" ? score : null;
+}
+
+interface CellConfig { // one selectable configuration in the Config A / Config B dropdowns
+ cell_id: string; // grouping key from run.meta.cell_id; also used as the option value
+ label: string; // "model / language / prompt_style / effort" of the group's first run
+ runs: Run[]; // every run sharing this cell_id
+}
+
+// Bucket runs by `meta.cell_id` into selectable configs, sorted by label. Each
+// label is "model / language / prompt_style / effort" of the bucket's first run.
+function buildCellConfigs(runs: Run[]): CellConfig[] {
+  const byCell: Record<string, Run[]> = {};
+  for (const run of runs) {
+    const bucket = byCell[run.meta.cell_id] ?? (byCell[run.meta.cell_id] = []);
+    bucket.push(run);
+  }
+  const configs = Object.keys(byCell).map((cell_id): CellConfig => {
+    const cellRuns = byCell[cell_id];
+    const { model, language, prompt_style, effort } = cellRuns[0].meta;
+    const label = `${model} / ${language} / ${prompt_style} / ${effort}`;
+    return { cell_id, label, runs: cellRuns };
+  });
+  return configs.sort((x, y) => x.label.localeCompare(y.label));
+}
+
+// Mean score of `dim` across `runs`, skipping runs with no score; null if none.
+function averageScores(runs: Run[], dim: Dimension): number | null {
+  const found: number[] = [];
+  for (const run of runs) {
+    const value = extractDimensionScore(run, dim);
+    if (value !== null) found.push(value);
+  }
+  if (found.length === 0) return null;
+  return found.reduce((sum, value) => sum + value, 0) / found.length;
+}
+
+interface RadarDatum { // one radar axis: both configs' values for a single dimension
+ dimension: string; // display label from DIMENSION_LABELS; also the angle-axis dataKey
+ scoreA: number; // Config A average, 0 when no score is available
+ scoreB: number; // Config B average, 0 when no score is available
+ labelA: string; // Config A as a whole-number percentage, or "n/a"
+ labelB: string; // Config B as a whole-number percentage, or "n/a"
+}
+
+/**
+ * Angle-axis tick for the radar chart: the dimension name with both configs'
+ * formatted scores ("A / B") underneath. Renders nothing when `data` has no
+ * row whose `dimension` equals the tick value.
+ */
+function CustomTick({
+  payload,
+  x,
+  y,
+  data,
+}: {
+  payload: { value: string };
+  x: number;
+  y: number;
+  data: RadarDatum[];
+}) {
+  const row = data.find((entry) => entry.dimension === payload.value);
+  if (!row) return null;
+  const fontFamily = "'JetBrains Mono', monospace";
+  const nameStyle = { fill: "var(--text)", fontSize: "11px", fontFamily, fontWeight: 500 };
+  const scoreStyle = { fontSize: "10px", fontFamily };
+
+  return (
+    <g transform={`translate(${x},${y})`}>
+      <text
+        textAnchor="middle"
+        dy={-8}
+        style={nameStyle}
+      >
+        {payload.value}
+      </text>
+      <text
+        textAnchor="middle"
+        dy={6}
+        style={scoreStyle}
+      >
+        <tspan fill={COLOR_A}>{row.labelA}</tspan>
+        <tspan fill="var(--text-muted)"> / </tspan>
+        <tspan fill={COLOR_B}>{row.labelB}</tspan>
+      </text>
+    </g>
+  );
+}
+
+export default function RadarComparison({ runs }: RadarComparisonProps) { // Radar chart comparing two selected configs across DIMENSIONS.
+ const cellConfigs = useMemo(() => buildCellConfigs(runs), [runs]);
+ // Initial selections only: A is the first config, B the second (or the first when only one exists).
+ const [selectedA, setSelectedA] = useState<string>(
+ cellConfigs[0]?.cell_id ?? ""
+ );
+ const [selectedB, setSelectedB] = useState<string>(
+ cellConfigs[1]?.cell_id ?? cellConfigs[0]?.cell_id ?? ""
+ );
+
+ const configA = cellConfigs.find((c) => c.cell_id === selectedA);
+ const configB = cellConfigs.find((c) => c.cell_id === selectedB);
+ // One row per dimension; a missing average is plotted at 0 but labelled "n/a".
+ const data: RadarDatum[] = useMemo(() => {
+ return DIMENSIONS.map((dim) => {
+ const scoreA = configA ? averageScores(configA.runs, dim) : null;
+ const scoreB = configB ? averageScores(configB.runs, dim) : null;
+ return {
+ dimension: DIMENSION_LABELS[dim],
+ scoreA: scoreA ?? 0,
+ scoreB: scoreB ?? 0,
+ labelA: scoreA !== null ? (scoreA * 100).toFixed(0) + "%" : "n/a",
+ labelB: scoreB !== null ? (scoreB * 100).toFixed(0) + "%" : "n/a",
+ };
+ });
+ }, [configA, configB]);
+
+ if (cellConfigs.length === 0) { // nothing to select: show a placeholder card instead of the chart
+ return (
+ <div
+ className="card"
+ style={{
+ textAlign: "center",
+ padding: "40px",
+ color: "var(--text-muted)",
+ borderRadius: 0,
+ }}
+ >
+ No configurations available for comparison.
+ </div>
+ );
+ }
+
+ return (
+ <div
+ style={{
+ background: "var(--surface-1)",
+ border: "1px solid var(--border)",
+ borderRadius: 0,
+ padding: "20px",
+ }}
+ >
+ <h3 style={{ marginBottom: "16px" }}>Quality Radar Comparison</h3>
+
+ <div
+ style={{
+ display: "flex",
+ gap: "24px",
+ marginBottom: "20px",
+ flexWrap: "wrap",
+ }}
+ >
+ <div className="filter-group">
+ <label style={{ color: COLOR_A, fontWeight: 600 }}>Config A</label>
+ <select
+ value={selectedA}
+ onChange={(e) => setSelectedA(e.target.value)}
+ >
+ {cellConfigs.map((c) => (
+ <option key={c.cell_id} value={c.cell_id}>
+ {c.label} (n={c.runs.length})
+ </option>
+ ))}
+ </select>
+ </div>
+
+ <div className="filter-group">
+ <label style={{ color: COLOR_B, fontWeight: 600 }}>Config B</label>
+ <select
+ value={selectedB}
+ onChange={(e) => setSelectedB(e.target.value)}
+ >
+ {cellConfigs.map((c) => (
+ <option key={c.cell_id} value={c.cell_id}>
+ {c.label} (n={c.runs.length})
+ </option>
+ ))}
+ </select>
+ </div>
+ </div>
+
+ <ResponsiveContainer width="100%" height={420}>
+ <RadarChart cx="50%" cy="50%" outerRadius="70%" data={data}>
+ <PolarGrid
+ stroke="var(--border)"
+ strokeDasharray="3 3"
+ />
+ <PolarAngleAxis
+ dataKey="dimension"
+ tick={(props: Record<string, unknown>) => (
+ <CustomTick
+ payload={props.payload as { value: string }}
+ x={props.x as number}
+ y={props.y as number}
+ data={data}
+ />
+ )}
+ />
+ <PolarRadiusAxis
+ angle={90}
+ domain={[0, 1]}
+ tickCount={6}
+ tick={{
+ fill: "var(--text-muted)",
+ fontSize: 10,
+ fontFamily: "'JetBrains Mono', monospace",
+ }}
+ tickFormatter={(v: number) => (v * 100).toFixed(0) + "%"}
+ stroke="var(--border)"
+ />
+ <Radar
+ name="Config A"
+ dataKey="scoreA"
+ stroke={COLOR_A}
+ fill={COLOR_A}
+ fillOpacity={0.3}
+ strokeWidth={2}
+ />
+ <Radar
+ name="Config B"
+ dataKey="scoreB"
+ stroke={COLOR_B}
+ fill={COLOR_B}
+ fillOpacity={0.3}
+ strokeWidth={2}
+ />
+ <Tooltip
+ contentStyle={{
+ background: "var(--surface-1)",
+ border: "1px solid var(--border)",
+ borderRadius: "2px",
+ fontFamily: "'JetBrains Mono', monospace",
+ fontSize: "11px",
+ }}
+ formatter={(value: unknown, name: unknown) => [
+ ((Number(value) || 0) * 100).toFixed(1) + "%",
+ String(name),
+ ]}
+ />
+ </RadarChart>
+ </ResponsiveContainer>
+
+ <div
+ style={{
+ display: "flex",
+ justifyContent: "center",
+ gap: "24px",
+ marginTop: "12px",
+ fontSize: "11px",
+ fontFamily: "var(--font-mono)",
+ }}
+ >
+ <span>
+ <span
+ style={{
+ display: "inline-block",
+ width: "12px",
+ height: "12px",
+ background: COLOR_A,
+ opacity: 0.7,
+ marginRight: "6px",
+ verticalAlign: "middle",
+ }}
+ />
+ <span style={{ color: COLOR_A }}>Config A</span>
+ </span>
+ <span>
+ <span
+ style={{
+ display: "inline-block",
+ width: "12px",
+ height: "12px",
+ background: COLOR_B,
+ opacity: 0.7,
+ marginRight: "6px",
+ verticalAlign: "middle",
+ }}
+ />
+ <span style={{ color: COLOR_B }}>Config B</span>
+ </span>
+ </div>
+ </div>
+ );
+}
diff --git a/dashboard/src/layouts/Base.astro b/dashboard/src/layouts/Base.astro
@@ -35,6 +35,7 @@ try {
<nav style="display: flex; gap: 16px; font-size: 0.875rem;">
<a href="/">Grid</a>
<a href="/insights">Insights</a>
+ <a href="/explore">Explore</a>
<a href="/compare">Compare</a>
</nav>
</div>
diff --git a/dashboard/src/pages/explore.astro b/dashboard/src/pages/explore.astro
@@ -0,0 +1,35 @@
+---
+import Base from "../layouts/Base.astro";
+import { loadAllRuns } from "../lib/data";
+import HeatmapMatrix from "../components/HeatmapMatrix";
+import RadarComparison from "../components/RadarComparison";
+import BumpChart from "../components/BumpChart";
+import ConfigTreemap from "../components/ConfigTreemap";
+import EfficiencyFrontier from "../components/EfficiencyFrontier";
+import CorrelationMatrix from "../components/CorrelationMatrix";
+// Explore page: six visualizations, each mounted with client:load and given the same runs array.
+const runs = loadAllRuns(); // loaded once in the frontmatter and passed to every component below
+---
+
+<Base title="Explore">
+ <h1 style="margin-bottom: 8px;">Explore</h1>
+ <p style="color: var(--text-muted); margin-bottom: 32px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
+ Interactive visualizations of the experiment space
+ </p>
+
+ <div style="display: flex; flex-direction: column; gap: 32px;">
+ <CorrelationMatrix client:load runs={runs} />
+
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+ <EfficiencyFrontier client:load runs={runs} />
+ <BumpChart client:load runs={runs} />
+ </div>
+
+ <HeatmapMatrix client:load runs={runs} />
+
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+ <RadarComparison client:load runs={runs} />
+ <ConfigTreemap client:load runs={runs} />
+ </div>
+ </div>
+</Base>