loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 5669f7044babf34cfed2b866f7d284476fd42bbf
parent a25191cd2a25892e07b8dd4a14baa6f7c6035e42
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 09:26:37 +0200

Add Explore page with 6 interactive visualizations

New /explore page with:
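1. CorrelationMatrix - which variables matter for which outcomes
   (rows=config axes, columns=outcome metrics: score dimensions plus
   cost, turns and wall time; cell=effect size, measured as the spread
   between the highest and lowest group mean)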
2. EfficiencyFrontier - Pareto frontier on cost vs score scatter
3. BumpChart - model rank changes across conditions, crossing highlights
4. HeatmapMatrix - configurable 2-axis heatmap (any axis x any axis)
5. RadarComparison - spider chart comparing two configs across all
   quality dimensions
6. ConfigTreemap - size=runs, color=score, grouped by model

Navigation updated: Grid > Insights > Explore > Compare

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/BumpChart.tsx | 559 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A dashboard/src/components/ConfigTreemap.tsx | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A dashboard/src/components/CorrelationMatrix.tsx | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A dashboard/src/components/EfficiencyFrontier.tsx | 489 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/components/Filters.tsx | 5 +++++
A dashboard/src/components/HeatmapMatrix.tsx | 311 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A dashboard/src/components/RadarComparison.tsx | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/layouts/Base.astro | 1 +
A dashboard/src/pages/explore.astro | 35 +++++++++++++++++++++++++++++++++++
9 files changed, 2309 insertions(+), 0 deletions(-)

diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx @@ -0,0 +1,559 @@ +import { useState, useMemo } from "react"; +import { + LineChart, + Line, + XAxis, + YAxis, + CartesianGrid, + Tooltip, + ResponsiveContainer, + ReferenceDot, +} from "recharts"; +import type { Run } from "../lib/data"; +import { AXIS_NAMES, type AxisName } from "../lib/data"; + +interface BumpChartProps { + runs: Run[]; +} + +const MODEL_COLORS: Record<string, string> = { + haiku: "hsl(193 44% 67%)", + sonnet: "hsl(40 71% 73%)", + opus: "hsl(311 24% 63%)", +}; + +const FALLBACK_COLOR = "hsl(213 14% 65%)"; + +const AXIS_LABELS: Record<AxisName, string> = { + model: "Model", + effort: "Effort", + prompt_style: "Prompt Style", + language: "Language", + human_language: "Human Language", + tool_read: "Read Tool", + tool_write: "Write Tool", + tool_edit: "Edit Tool", + tool_glob: "Glob Tool", + tool_grep: "Grep Tool", + linter: "Linter", + playwright: "Playwright", + context_file: "Context File", + sub_agents: "Sub-agents", + web_search: "Web Search", + max_budget: "Budget", +}; + +// All axes except "model" since we rank by model +const CONDITION_AXES = AXIS_NAMES.filter((a) => a !== "model"); + +interface RankedPoint { + conditionValue: string; + rank: number; + avgScore: number; + model: string; + n: number; +} + +interface CrossingPoint { + conditionValue: string; + x: number; + rank: number; + models: [string, string]; +} + +function computeRankings( + runs: Run[], + axis: AxisName +): { ranked: Record<string, RankedPoint[]>; crossings: CrossingPoint[] } { + // Get unique condition values for the selected axis + const conditionValues = Array.from( + new Set(runs.map((r) => String(r.meta[axis]))) + ).sort(); + + // Get unique models + const models = Array.from(new Set(runs.map((r) => r.meta.model))).sort(); + + // For each condition value, compute average score per model, then rank + const ranked: Record<string, RankedPoint[]> = {}; + for (const model of 
models) { + ranked[model] = []; + } + + const prevRanks: Record<string, number> = {}; + + const crossings: CrossingPoint[] = []; + + for (let ci = 0; ci < conditionValues.length; ci++) { + const cv = conditionValues[ci]; + const runsForCondition = runs.filter( + (r) => String(r.meta[axis]) === cv + ); + + // Compute average score per model for this condition + const modelScores: Array<{ + model: string; + avgScore: number; + n: number; + }> = []; + for (const model of models) { + const modelRuns = runsForCondition.filter( + (r) => r.meta.model === model + ); + const scores = modelRuns + .map((r) => r.eval_results?.score) + .filter((s): s is number => s !== null && s !== undefined); + + if (scores.length > 0) { + const avg = scores.reduce((a, b) => a + b, 0) / scores.length; + modelScores.push({ model, avgScore: avg, n: scores.length }); + } + } + + // Sort by avgScore descending (higher score = rank 1) + modelScores.sort((a, b) => b.avgScore - a.avgScore); + + // Assign ranks + const currentRanks: Record<string, number> = {}; + for (let i = 0; i < modelScores.length; i++) { + const ms = modelScores[i]; + const rank = i + 1; + currentRanks[ms.model] = rank; + ranked[ms.model].push({ + conditionValue: cv, + rank, + avgScore: ms.avgScore, + model: ms.model, + n: ms.n, + }); + } + + // Detect crossings: if any two models swapped relative rank order + if (ci > 0) { + for (let i = 0; i < models.length; i++) { + for (let j = i + 1; j < models.length; j++) { + const m1 = models[i]; + const m2 = models[j]; + const prev1 = prevRanks[m1]; + const prev2 = prevRanks[m2]; + const curr1 = currentRanks[m1]; + const curr2 = currentRanks[m2]; + + if ( + prev1 !== undefined && + prev2 !== undefined && + curr1 !== undefined && + curr2 !== undefined + ) { + // Check if they crossed: relative order changed + if ( + (prev1 < prev2 && curr1 > curr2) || + (prev1 > prev2 && curr1 < curr2) + ) { + // Approximate crossing rank as average of the two at the crossing point + const crossRank = 
(curr1 + curr2) / 2; + crossings.push({ + conditionValue: cv, + x: ci, + rank: crossRank, + models: [m1, m2], + }); + } + } + } + } + } + + Object.assign(prevRanks, currentRanks); + } + + return { ranked, crossings }; +} + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +type DotProps = { cx?: number; cy?: number; payload?: any; stroke?: string }; + +function makeRankDot( + model: string, + color: string, + lookup: Record<string, Record<string, RankedPoint>> +) { + return function RankDot({ cx, cy, payload }: DotProps) { + if (cx === undefined || cy === undefined || !payload) return null; + const point = lookup[model]?.[payload.conditionValue]; + if (!point) return null; + return ( + <g> + <circle + cx={cx} + cy={cy} + r={5} + fill={color} + stroke="var(--surface-1)" + strokeWidth={2} + /> + <text + x={cx + 10} + y={cy - 8} + fill="var(--text)" + fontSize={10} + fontFamily="'JetBrains Mono', monospace" + textAnchor="start" + > + {(point.avgScore * 100).toFixed(0)}% + </text> + </g> + ); + }; +} + +function CustomTooltipContent({ + active, + payload, + lookup, +}: { + active?: boolean; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + payload?: Array<{ dataKey?: string; payload?: any; stroke: string }>; + label?: string; + lookup: Record<string, Record<string, RankedPoint>>; +}) { + if (!active || !payload || payload.length === 0) return null; + + const conditionValue = payload[0]?.payload?.conditionValue; + if (!conditionValue) return null; + + // Resolve actual RankedPoint data from lookup + const resolved = payload + .filter((entry) => entry.dataKey && lookup[entry.dataKey]) + .map((entry) => ({ + point: lookup[entry.dataKey!]?.[conditionValue], + stroke: entry.stroke, + })) + .filter((r) => r.point); + + const sorted = [...resolved].sort( + (a, b) => a.point!.rank - b.point!.rank + ); + + return ( + <div + style={{ + background: "var(--surface-1)", + border: "1px solid var(--border)", + padding: "8px 12px", + fontFamily: 
"'JetBrains Mono', monospace", + fontSize: "11px", + }} + > + <div + style={{ + color: "var(--text)", + fontWeight: 600, + marginBottom: "6px", + }} + > + {conditionValue} + </div> + {sorted.map((entry) => ( + <div + key={entry.point!.model} + style={{ + display: "flex", + alignItems: "center", + gap: "8px", + marginBottom: "2px", + }} + > + <span + style={{ + display: "inline-block", + width: 8, + height: 8, + background: entry.stroke, + flexShrink: 0, + }} + /> + <span style={{ color: "var(--text-muted)", width: "16px" }}> + #{entry.point!.rank} + </span> + <span style={{ color: "var(--text)" }}> + {entry.point!.model} + </span> + <span style={{ color: "var(--text-muted)", marginLeft: "auto" }}> + {(entry.point!.avgScore * 100).toFixed(1)}% (n= + {entry.point!.n}) + </span> + </div> + ))} + </div> + ); +} + +export default function BumpChart({ runs }: BumpChartProps) { + const [selectedAxis, setSelectedAxis] = useState<AxisName>("prompt_style"); + + const { ranked, crossings, conditionValues, models } = useMemo(() => { + const { ranked, crossings } = computeRankings(runs, selectedAxis); + const conditionValues = Array.from( + new Set(runs.map((r) => String(r.meta[selectedAxis]))) + ).sort(); + const models = Object.keys(ranked).filter( + (m) => ranked[m].length > 0 + ); + return { ranked, crossings, conditionValues, models }; + }, [runs, selectedAxis]); + + // Build a lookup: model -> conditionValue -> RankedPoint + const pointLookup = useMemo(() => { + const lookup: Record<string, Record<string, RankedPoint>> = {}; + for (const model of models) { + lookup[model] = {}; + for (const point of ranked[model]) { + lookup[model][point.conditionValue] = point; + } + } + return lookup; + }, [models, ranked]); + + // Build recharts data: one entry per condition value + const chartData = useMemo(() => { + return conditionValues.map((cv) => { + const entry: Record<string, unknown> = { conditionValue: cv }; + for (const model of models) { + const point = 
pointLookup[model]?.[cv]; + if (point) { + entry[model] = point.rank; + } + } + return entry; + }); + }, [conditionValues, models, pointLookup]); + + const maxRank = models.length; + + const scoredRuns = runs.filter( + (r) => + r.eval_results?.score !== null && r.eval_results?.score !== undefined + ); + + if (scoredRuns.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + }} + > + No scored runs available for ranking. + </div> + ); + } + + return ( + <div className="card"> + <div + style={{ + display: "flex", + alignItems: "center", + justifyContent: "space-between", + marginBottom: "16px", + flexWrap: "wrap", + gap: "12px", + }} + > + <div> + <h3 style={{ margin: 0 }}>Model Rankings by Condition</h3> + <p + style={{ + color: "var(--text-muted)", + fontSize: "0.75rem", + margin: "4px 0 0", + }} + > + Rank 1 = best average score. Crossings indicate rank swaps. + </p> + </div> + <div style={{ display: "flex", alignItems: "center", gap: "8px" }}> + <label + style={{ + fontSize: "0.75rem", + color: "var(--text-muted)", + }} + > + Condition: + </label> + <select + value={selectedAxis} + onChange={(e) => setSelectedAxis(e.target.value as AxisName)} + style={{ + background: "var(--surface-2)", + color: "var(--text)", + border: "1px solid var(--border)", + padding: "4px 8px", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "0.75rem", + borderRadius: "0", + cursor: "pointer", + }} + > + {CONDITION_AXES.map((axis) => ( + <option key={axis} value={axis}> + {AXIS_LABELS[axis]} + </option> + ))} + </select> + </div> + </div> + + {conditionValues.length < 2 ? ( + <div + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + fontSize: "0.8rem", + }} + > + Need at least 2 values for "{AXIS_LABELS[selectedAxis]}" to show + rankings. 
Currently only: {conditionValues.join(", ") || "none"} + </div> + ) : ( + <> + <ResponsiveContainer width="100%" height={300}> + <LineChart + data={chartData} + margin={{ top: 20, right: 60, bottom: 10, left: 10 }} + > + <CartesianGrid + strokeDasharray="3 3" + stroke="var(--border)" + vertical={false} + /> + <XAxis + dataKey="conditionValue" + stroke="var(--text-muted)" + fontSize={11} + fontFamily="'JetBrains Mono', monospace" + tickLine={false} + axisLine={{ stroke: "var(--border)" }} + /> + <YAxis + domain={[0.5, maxRank + 0.5]} + ticks={Array.from({ length: maxRank }, (_, i) => i + 1)} + reversed + stroke="var(--text-muted)" + fontSize={11} + fontFamily="'JetBrains Mono', monospace" + tickLine={false} + axisLine={{ stroke: "var(--border)" }} + label={{ + value: "Rank", + angle: -90, + position: "insideLeft", + fill: "var(--text-muted)", + fontSize: 11, + fontFamily: "'JetBrains Mono', monospace", + }} + tickFormatter={(v: number) => `#${v}`} + /> + <Tooltip + content={<CustomTooltipContent lookup={pointLookup} />} + cursor={{ stroke: "var(--border)", strokeDasharray: "3 3" }} + /> + {models.map((model) => ( + <Line + key={model} + type="linear" + dataKey={model} + stroke={MODEL_COLORS[model] || FALLBACK_COLOR} + strokeWidth={2.5} + dot={makeRankDot( + model, + MODEL_COLORS[model] || FALLBACK_COLOR, + pointLookup + )} + activeDot={false} + name={model} + connectNulls + /> + ))} + {crossings.map((crossing, i) => ( + <ReferenceDot + key={`crossing-${i}`} + x={crossing.conditionValue} + y={crossing.rank} + r={10} + fill="none" + stroke="var(--yellow)" + strokeWidth={1.5} + strokeDasharray="3 2" + /> + ))} + </LineChart> + </ResponsiveContainer> + + {/* Legend */} + <div + style={{ + display: "flex", + alignItems: "center", + justifyContent: "center", + gap: "20px", + marginTop: "12px", + flexWrap: "wrap", + }} + > + {models.map((model) => ( + <div + key={model} + style={{ + display: "flex", + alignItems: "center", + gap: "6px", + fontSize: "0.75rem", + fontFamily: 
"'JetBrains Mono', monospace", + }} + > + <span + style={{ + display: "inline-block", + width: 12, + height: 3, + background: MODEL_COLORS[model] || FALLBACK_COLOR, + }} + /> + <span style={{ color: "var(--text)" }}>{model}</span> + </div> + ))} + {crossings.length > 0 && ( + <div + style={{ + display: "flex", + alignItems: "center", + gap: "6px", + fontSize: "0.75rem", + fontFamily: "'JetBrains Mono', monospace", + }} + > + <span + style={{ + display: "inline-block", + width: 12, + height: 12, + borderRadius: "50%", + border: "1.5px dashed var(--yellow)", + }} + /> + <span style={{ color: "var(--text-muted)" }}> + rank swap + </span> + </div> + )} + </div> + </> + )} + </div> + ); +} diff --git a/dashboard/src/components/ConfigTreemap.tsx b/dashboard/src/components/ConfigTreemap.tsx @@ -0,0 +1,344 @@ +import React, { useState, useCallback } from "react"; +import { Treemap, ResponsiveContainer, Tooltip } from "recharts"; +import type { TreemapNode } from "recharts/types/chart/Treemap"; +import type { Run, AxisName } from "../lib/data"; + +interface ConfigTreemapProps { + runs: Run[]; +} + +const SECONDARY_AXES: AxisName[] = [ + "prompt_style", + "effort", + "language", + "human_language", + "linter", + "playwright", + "context_file", + "sub_agents", + "web_search", + "max_budget", +]; + +function scoreColor(avgScore: number | null): string { + if (avgScore === null) return "hsl(213 14% 30%)"; + const pct = avgScore * 100; + if (pct > 60) return "hsl(92 28% 45%)"; + if (pct >= 30) return "hsl(40 71% 50%)"; + return "hsl(355 52% 48%)"; +} + +interface LeafData { + name: string; + displayName: string; + size: number; + avgScore: number | null; + avgScorePct: string; + model: string; + configValue: string; + color: string; + [key: string]: unknown; +} + +interface GroupData { + name: string; + children: LeafData[]; + [key: string]: unknown; +} + +function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] { + const byModel: Record<string, Record<string, 
Run[]>> = {}; + + for (const run of runs) { + const model = run.meta.model; + const secondary = String(run.meta[secondaryAxis]); + if (!byModel[model]) byModel[model] = {}; + if (!byModel[model][secondary]) byModel[model][secondary] = []; + byModel[model][secondary].push(run); + } + + return Object.entries(byModel) + .sort(([a], [b]) => a.localeCompare(b)) + .map(([model, configs]) => ({ + name: model, + children: Object.entries(configs) + .sort(([a], [b]) => a.localeCompare(b)) + .map(([configValue, configRuns]) => { + const scores = configRuns + .map((r) => r.eval_results?.score) + .filter((s): s is number => s !== null && s !== undefined); + const avgScore = + scores.length > 0 + ? scores.reduce((a, b) => a + b, 0) / scores.length + : null; + + return { + name: `${model} / ${configValue}`, + displayName: `${model} / ${configValue}`, + size: configRuns.length, + avgScore, + avgScorePct: + avgScore !== null ? `${(avgScore * 100).toFixed(0)}%` : "--", + model, + configValue, + color: scoreColor(avgScore), + }; + }), + })); +} + +function CustomContent(props: TreemapNode): React.ReactElement { + const { x, y, width, height, depth, name } = props; + + // Only render leaf nodes (depth === 2 in a two-level hierarchy via 'flat' type) + // depth 1 = model group, depth 2 = leaf + if (depth < 2) return <g />; + + const avgScorePct = (props as unknown as LeafData).avgScorePct ?? "--"; + const count = (props as unknown as LeafData).size ?? 0; + const color = (props as unknown as LeafData).color ?? "hsl(213 14% 30%)"; + + const showText = width > 50 && height > 36; + const showCount = width > 50 && height > 50; + + return ( + <g> + <rect + x={x} + y={y} + width={width} + height={height} + fill={color} + stroke="hsl(213 16% 12%)" + strokeWidth={2} + /> + {showText && ( + <> + <text + x={x + width / 2} + y={y + height / 2 - (showCount ? 
8 : 0)} + textAnchor="middle" + dominantBaseline="central" + fill="hsl(213 27% 95%)" + style={{ + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + fontWeight: 600, + textTransform: "uppercase", + }} + > + {width > 100 ? name : (props as unknown as LeafData).configValue} + </text> + <text + x={x + width / 2} + y={y + height / 2 + 8} + textAnchor="middle" + dominantBaseline="central" + fill="hsl(213 27% 95%)" + style={{ + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + fontWeight: 500, + textTransform: "uppercase", + }} + > + {avgScorePct} + </text> + {showCount && ( + <text + x={x + width / 2} + y={y + height / 2 + 22} + textAnchor="middle" + dominantBaseline="central" + fill="hsla(213, 27%, 95%, 0.65)" + style={{ + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + fontWeight: 400, + textTransform: "uppercase", + }} + > + n={count} + </text> + )} + </> + )} + </g> + ); +} + +function CustomTooltip({ + active, + payload, +}: { + active?: boolean; + payload?: Array<{ payload: TreemapNode }>; +}) { + if (!active || !payload || payload.length === 0) return null; + + const node = payload[0].payload as unknown as LeafData; + if (!node.displayName) return null; + + return ( + <div + style={{ + background: "hsl(217 16% 15.5%)", + border: "1px solid hsl(217 17% 28%)", + padding: "8px 12px", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + textTransform: "uppercase", + letterSpacing: "0.5px", + }} + > + <div style={{ fontWeight: 600, marginBottom: 4, color: "hsl(213 27% 88%)" }}> + {node.displayName} + </div> + <div style={{ color: "hsl(213 14% 65%)" }}> + Score: {node.avgScorePct} + </div> + <div style={{ color: "hsl(213 14% 65%)" }}> + Runs: {node.size} + </div> + </div> + ); +} + +export default function ConfigTreemap({ runs }: ConfigTreemapProps) { + const [secondaryAxis, setSecondaryAxis] = useState<AxisName>("prompt_style"); + + const handleClick = useCallback( + (node: TreemapNode) => { + const leaf = 
node as unknown as LeafData; + if (leaf.model && leaf.configValue) { + const params = new URLSearchParams(); + params.set("model", leaf.model); + params.set(secondaryAxis, leaf.configValue); + window.location.href = `/?${params.toString()}`; + } + }, + [secondaryAxis], + ); + + if (runs.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + }} + > + No data for treemap. + </div> + ); + } + + const treeData = buildTreeData(runs, secondaryAxis); + + return ( + <div + className="card" + style={{ + background: "var(--surface-1)", + border: "1px solid var(--border)", + borderRadius: 0, + padding: "20px", + }} + > + <div + style={{ + display: "flex", + justifyContent: "space-between", + alignItems: "center", + marginBottom: "16px", + }} + > + <h3 style={{ margin: 0 }}>Configuration Treemap</h3> + <div className="filter-group"> + <label htmlFor="treemap-axis">Group by</label> + <select + id="treemap-axis" + value={secondaryAxis} + onChange={(e) => setSecondaryAxis(e.target.value as AxisName)} + > + {SECONDARY_AXES.map((axis) => ( + <option key={axis} value={axis}> + {axis} + </option> + ))} + </select> + </div> + </div> + + <div + style={{ + display: "flex", + gap: "16px", + marginBottom: "12px", + fontSize: "11px", + fontFamily: "var(--font-mono)", + textTransform: "uppercase", + letterSpacing: "0.5px", + color: "var(--text-muted)", + }} + > + <span> + <span + style={{ + display: "inline-block", + width: 10, + height: 10, + background: "hsl(92 28% 45%)", + marginRight: 4, + verticalAlign: "middle", + }} + /> + {">"} 60% + </span> + <span> + <span + style={{ + display: "inline-block", + width: 10, + height: 10, + background: "hsl(40 71% 50%)", + marginRight: 4, + verticalAlign: "middle", + }} + /> + 30-60% + </span> + <span> + <span + style={{ + display: "inline-block", + width: 10, + height: 10, + background: "hsl(355 52% 48%)", + marginRight: 4, + verticalAlign: "middle", + }} + /> + 
{"<"} 30% + </span> + </div> + + <ResponsiveContainer width="100%" height={400}> + <Treemap + data={treeData} + dataKey="size" + nameKey="name" + type="flat" + content={CustomContent} + onClick={handleClick} + isAnimationActive={false} + stroke="hsl(213 16% 12%)" + > + <Tooltip content={<CustomTooltip />} /> + </Treemap> + </ResponsiveContainer> + </div> + ); +} diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx @@ -0,0 +1,236 @@ +import type { Run } from "../lib/data"; + +interface CorrelationMatrixProps { + runs: Run[]; +} + +const CONFIG_AXES = [ + { key: "model", label: "Model" }, + { key: "effort", label: "Effort" }, + { key: "prompt_style", label: "Prompt Style" }, + { key: "language", label: "Language" }, + { key: "tool_read", label: "Read Tool" }, + { key: "tool_write", label: "Write Tool" }, + { key: "tool_edit", label: "Edit Tool" }, + { key: "tool_glob", label: "Glob Tool" }, + { key: "tool_grep", label: "Grep Tool" }, + { key: "linter", label: "Linter" }, + { key: "playwright", label: "Playwright" }, + { key: "context_file", label: "Context File" }, + { key: "sub_agents", label: "Sub-agents" }, + { key: "web_search", label: "Web Search" }, + { key: "max_budget", label: "Budget" }, +] as const; + +type MetricExtractor = (run: Run) => number | null; + +const OUTCOME_METRICS: Array<{ key: string; label: string; extract: MetricExtractor }> = [ + { key: "overall", label: "Overall", extract: (r) => r.eval_results?.score ?? null }, + { key: "gameplay", label: "Gameplay", extract: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null }, + { key: "code", label: "Code", extract: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null }, + { key: "structural", label: "Structural", extract: (r) => r.eval_results?.structural?.score ?? null }, + { key: "quality", label: "Quality", extract: (r) => r.eval_results?.quality?.score ?? 
null }, + { key: "transcript", label: "Transcript", extract: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null }, + { key: "cost", label: "Cost", extract: (r) => r.claude_output?.total_cost_usd ?? null }, + { key: "turns", label: "Turns", extract: (r) => r.claude_output?.num_turns ?? null }, + { key: "time", label: "Wall Time", extract: (r) => r.meta.wall_time_seconds ?? null }, +]; + +function computeSpread(runs: Run[], axisKey: string, extract: MetricExtractor): number | null { + const groups: Record<string, number[]> = {}; + for (const run of runs) { + const val = extract(run); + if (val === null) continue; + const groupKey = String((run.meta as Record<string, unknown>)[axisKey] ?? "unknown"); + (groups[groupKey] ??= []).push(val); + } + + const keys = Object.keys(groups); + if (keys.length < 2) return null; + + const means = keys.map((k) => { + const vals = groups[k]; + return vals.reduce((a, b) => a + b, 0) / vals.length; + }); + + return Math.max(...means) - Math.min(...means); +} + +export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) { + if (runs.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + }} + > + No data available for correlation analysis. + </div> + ); + } + + // Compute the full matrix: rows = config axes, columns = metrics + const matrix: Array<{ + key: string; + label: string; + spreads: Array<number | null>; + maxSpread: number; + }> = []; + + for (const axis of CONFIG_AXES) { + const spreads = OUTCOME_METRICS.map((metric) => + computeSpread(runs, axis.key, metric.extract) + ); + const validSpreads = spreads.filter((s): s is number => s !== null); + const maxSpread = validSpreads.length > 0 ? 
Math.max(...validSpreads) : 0; + matrix.push({ key: axis.key, label: axis.label, spreads, maxSpread }); + } + + // Sort rows by maximum spread (most impactful variable first) + matrix.sort((a, b) => b.maxSpread - a.maxSpread); + + // Find global max spread for color scaling + const globalMax = Math.max(...matrix.map((r) => r.maxSpread), 0.001); + + return ( + <div className="card" style={{ padding: "20px" }}> + <h3 style={{ marginBottom: "4px" }}>Variable Impact Matrix</h3> + <p + style={{ + color: "var(--text-muted)", + fontSize: "0.75rem", + marginBottom: "16px", + }} + > + Effect size (spread) of each configuration variable on each outcome. + Sorted by maximum impact. Stronger color = larger effect. + </p> + + <div style={{ overflowX: "auto" }}> + <table + style={{ + borderCollapse: "collapse", + width: "auto", + minWidth: "100%", + }} + > + <thead> + <tr> + <th + style={{ + padding: "6px 12px", + fontSize: "11px", + textAlign: "right", + background: "var(--surface-2)", + borderBottom: "1px solid var(--border)", + borderRight: "1px solid var(--border)", + position: "sticky", + left: 0, + zIndex: 1, + }} + > + Variable + </th> + {OUTCOME_METRICS.map((metric) => ( + <th + key={metric.key} + style={{ + padding: "6px 8px", + fontSize: "11px", + textAlign: "center", + background: "var(--surface-2)", + borderBottom: "1px solid var(--border)", + fontFamily: "var(--font-mono)", + fontWeight: 500, + color: "var(--text-muted)", + textTransform: "uppercase", + letterSpacing: "0.5px", + whiteSpace: "nowrap", + }} + > + {metric.label} + </th> + ))} + </tr> + </thead> + <tbody> + {matrix.map((row) => ( + <tr key={row.key} style={{ background: "transparent" }}> + <td + style={{ + padding: "5px 12px", + fontSize: "11px", + fontFamily: "var(--font-mono)", + fontWeight: 500, + textAlign: "right", + whiteSpace: "nowrap", + borderBottom: "1px solid var(--border)", + borderRight: "1px solid var(--border)", + background: "var(--surface-1)", + position: "sticky", + left: 0, + zIndex: 
1, + }} + > + {row.label} + </td> + {row.spreads.map((spread, i) => { + if (spread === null) { + return ( + <td + key={OUTCOME_METRICS[i].key} + style={{ + padding: "5px 8px", + textAlign: "center", + fontSize: "11px", + fontFamily: "var(--font-mono)", + color: "var(--text-muted)", + borderBottom: "1px solid var(--border)", + }} + > + -- + </td> + ); + } + + const opacity = Math.min(spread / globalMax, 1) * 0.7 + 0.05; + const isScoreMetric = !["cost", "turns", "time"].includes( + OUTCOME_METRICS[i].key + ); + const displayValue = isScoreMetric + ? `${(spread * 100).toFixed(1)}%` + : OUTCOME_METRICS[i].key === "cost" + ? `$${spread.toFixed(2)}` + : OUTCOME_METRICS[i].key === "time" + ? `${Math.round(spread)}s` + : spread.toFixed(1); + + return ( + <td + key={OUTCOME_METRICS[i].key} + style={{ + padding: "5px 8px", + textAlign: "center", + fontSize: "11px", + fontFamily: "var(--font-mono)", + fontWeight: 600, + color: "var(--text)", + borderBottom: "1px solid var(--border)", + background: `rgba(136, 192, 208, ${opacity})`, + }} + > + {displayValue} + </td> + ); + })} + </tr> + ))} + </tbody> + </table> + </div> + </div> + ); +} diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx @@ -0,0 +1,489 @@ +import { useState, useMemo } from "react"; +import { + ScatterChart, + Scatter, + XAxis, + YAxis, + CartesianGrid, + Tooltip, + ResponsiveContainer, +} from "recharts"; +import type { Run } from "../lib/data"; + +interface EfficiencyFrontierProps { + runs: Run[]; +} + +const MODEL_COLORS: Record<string, string> = { + haiku: "hsl(193 44% 67%)", + sonnet: "hsl(40 71% 73%)", + opus: "hsl(311 24% 63%)", +}; + +const DEFAULT_COLOR = "hsl(213 14% 65%)"; + +interface ConfigPoint { + cell_id: string; + model: string; + avgCost: number; + avgScore: number; + runCount: number; + config: Record<string, string>; + isFrontier: boolean; + label: string; +} + +function getModelColor(model: string): string { + const key = 
model.toLowerCase(); + for (const [m, color] of Object.entries(MODEL_COLORS)) { + if (key.includes(m)) return color; + } + return DEFAULT_COLOR; +} + +function aggregateByConfig(runs: Run[]): ConfigPoint[] { + const groups: Record< + string, + { + scores: number[]; + costs: number[]; + model: string; + config: Record<string, string>; + } + > = {}; + + for (const run of runs) { + const id = run.meta.cell_id; + if (!groups[id]) { + groups[id] = { + scores: [], + costs: [], + model: run.meta.model, + config: { + model: run.meta.model, + effort: run.meta.effort, + prompt_style: run.meta.prompt_style, + language: run.meta.language, + linter: run.meta.linter, + playwright: run.meta.playwright, + context_file: run.meta.context_file, + sub_agents: run.meta.sub_agents, + web_search: run.meta.web_search, + max_budget: run.meta.max_budget, + }, + }; + } + + if (run.eval_results?.score != null) { + groups[id].scores.push(run.eval_results.score); + } + if (run.claude_output?.total_cost_usd != null) { + groups[id].costs.push(run.claude_output.total_cost_usd); + } + } + + return Object.entries(groups) + .filter(([, g]) => g.scores.length > 0 && g.costs.length > 0) + .map(([cell_id, g]) => ({ + cell_id, + model: g.model, + avgCost: g.costs.reduce((a, b) => a + b, 0) / g.costs.length, + avgScore: g.scores.reduce((a, b) => a + b, 0) / g.scores.length, + runCount: g.scores.length, + config: g.config, + isFrontier: false, + label: "", + })); +} + +function computeParetoFrontier(points: ConfigPoint[]): ConfigPoint[] { + const frontier: ConfigPoint[] = []; + + for (const p of points) { + let dominated = false; + for (const q of points) { + if (q === p) continue; + if (q.avgScore >= p.avgScore && q.avgCost <= p.avgCost) { + if (q.avgScore > p.avgScore || q.avgCost < p.avgCost) { + dominated = true; + break; + } + } + } + if (!dominated) { + frontier.push(p); + } + } + + frontier.sort((a, b) => a.avgCost - b.avgCost); + return frontier; +} + +function findKeyDifference( + point: 
ConfigPoint, + allPoints: ConfigPoint[] +): string { + const configKeys = Object.keys(point.config); + const valueCounts: Record<string, Record<string, number>> = {}; + + for (const key of configKeys) { + valueCounts[key] = {}; + for (const p of allPoints) { + const val = p.config[key] || ""; + valueCounts[key][val] = (valueCounts[key][val] || 0) + 1; + } + } + + let bestKey = ""; + let bestRarity = Infinity; + + for (const key of configKeys) { + if (key === "model") continue; + const val = point.config[key]; + const count = valueCounts[key][val] || 0; + const total = allPoints.length; + const rarity = count / total; + if (rarity < bestRarity && rarity < 1) { + bestRarity = rarity; + bestKey = key; + } + } + + if (bestKey) { + return `${point.config.model} / ${bestKey}=${point.config[bestKey]}`; + } + return point.config.model; +} + +interface TooltipPayloadEntry { + payload?: ConfigPoint; +} + +function CustomTooltip({ + active, + payload, +}: { + active?: boolean; + payload?: TooltipPayloadEntry[]; +}) { + if (!active || !payload || payload.length === 0) return null; + const point = payload[0]?.payload; + if (!point) return null; + + return ( + <div + style={{ + background: "var(--surface-1)", + border: "1px solid var(--border)", + padding: "12px", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + color: "var(--text)", + maxWidth: "300px", + }} + > + <div + style={{ + fontWeight: 600, + marginBottom: "8px", + fontSize: "12px", + color: getModelColor(point.model), + }} + > + {point.cell_id} + </div> + <div style={{ marginBottom: "6px" }}> + <span style={{ color: "var(--text-muted)" }}>score: </span> + <span style={{ fontWeight: 600 }}> + {(point.avgScore * 100).toFixed(1)}% + </span> + </div> + <div style={{ marginBottom: "6px" }}> + <span style={{ color: "var(--text-muted)" }}>cost: </span> + <span style={{ fontWeight: 600 }}>${point.avgCost.toFixed(2)}</span> + </div> + <div style={{ marginBottom: "8px" }}> + <span style={{ color: 
"var(--text-muted)" }}>runs: </span> + <span>{point.runCount}</span> + </div> + {point.isFrontier && ( + <div + style={{ + color: "var(--green)", + fontWeight: 600, + fontSize: "10px", + textTransform: "uppercase", + letterSpacing: "1px", + marginBottom: "8px", + }} + > + Pareto Frontier + </div> + )} + <div + style={{ + borderTop: "1px solid var(--border)", + paddingTop: "8px", + display: "grid", + gridTemplateColumns: "auto 1fr", + gap: "2px 8px", + }} + > + {Object.entries(point.config).map(([key, val]) => ( + <div key={key} style={{ display: "contents" }}> + <span style={{ color: "var(--text-muted)" }}>{key}:</span> + <span>{val}</span> + </div> + ))} + </div> + </div> + ); +} + +export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { + const [hoveredId, setHoveredId] = useState<string | null>(null); + + const points = useMemo(() => { + const raw = aggregateByConfig(runs); + const frontier = computeParetoFrontier(raw); + const frontierIds = new Set(frontier.map((p) => p.cell_id)); + + return raw.map((p) => ({ + ...p, + isFrontier: frontierIds.has(p.cell_id), + label: frontierIds.has(p.cell_id) ? findKeyDifference(p, raw) : "", + })); + }, [runs]); + + if (points.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + }} + > + Not enough data to compute efficiency frontier. + </div> + ); + } + + const frontierPoints = points + .filter((p) => p.isFrontier) + .sort((a, b) => a.avgCost - b.avgCost); + const nonFrontierPoints = points.filter((p) => !p.isFrontier); + + // Custom shape for non-frontier dots (small, dimmed) + const nonFrontierShape = (props: { + cx?: number; + cy?: number; + payload?: ConfigPoint; + }) => { + const { cx, cy, payload } = props; + if (cx == null || cy == null || !payload) return null; + const color = getModelColor(payload.model); + const opacity = + hoveredId === null ? 0.4 : hoveredId === payload.cell_id ? 
1 : 0.2; + return ( + <circle + cx={cx} + cy={cy} + r={5} + fill={color} + fillOpacity={opacity} + stroke="none" + style={{ cursor: "pointer", transition: "fill-opacity 0.15s" }} + /> + ); + }; + + // Custom shape for frontier dots (large, prominent, green ring) + const frontierShape = (props: { + cx?: number; + cy?: number; + payload?: ConfigPoint; + }) => { + const { cx, cy, payload } = props; + if (cx == null || cy == null || !payload) return null; + const color = getModelColor(payload.model); + const opacity = + hoveredId === null ? 1 : hoveredId === payload.cell_id ? 1 : 0.5; + return ( + <circle + cx={cx} + cy={cy} + r={9} + fill={color} + fillOpacity={opacity} + stroke="hsl(92 28% 65%)" + strokeWidth={2} + style={{ cursor: "pointer", transition: "fill-opacity 0.15s" }} + /> + ); + }; + + return ( + <div className="card"> + <h3 style={{ marginBottom: "4px" }}>Efficiency Frontier</h3> + <p + style={{ + color: "var(--text-muted)", + fontSize: "11px", + marginBottom: "16px", + }} + > + Cost vs score per config. Pareto frontier highlights configs not + dominated on both axes. 
+ </p> + + {/* Legend */} + <div + style={{ + display: "flex", + gap: "16px", + marginBottom: "12px", + fontSize: "11px", + color: "var(--text-muted)", + flexWrap: "wrap", + }} + > + {Object.entries(MODEL_COLORS).map(([model, color]) => ( + <div + key={model} + style={{ display: "flex", alignItems: "center", gap: "6px" }} + > + <div + style={{ + width: "8px", + height: "8px", + background: color, + }} + /> + <span>{model}</span> + </div> + ))} + <div style={{ display: "flex", alignItems: "center", gap: "6px" }}> + <div + style={{ + width: "12px", + height: "12px", + border: "2px solid hsl(92 28% 65%)", + background: "transparent", + }} + /> + <span>frontier</span> + </div> + </div> + + <ResponsiveContainer width="100%" height={420}> + <ScatterChart margin={{ top: 20, right: 30, bottom: 20, left: 20 }}> + <CartesianGrid strokeDasharray="3 3" stroke="var(--border)" /> + <XAxis + dataKey="avgCost" + type="number" + name="Avg Cost" + stroke="var(--text-muted)" + fontSize={11} + fontFamily="'JetBrains Mono', monospace" + tickFormatter={(v: number) => `$${v.toFixed(2)}`} + label={{ + value: "Avg Cost ($)", + position: "insideBottom", + offset: -10, + fill: "var(--text-muted)", + fontSize: 11, + fontFamily: "'JetBrains Mono', monospace", + }} + /> + <YAxis + dataKey="avgScore" + type="number" + name="Avg Score" + stroke="var(--text-muted)" + fontSize={11} + fontFamily="'JetBrains Mono', monospace" + domain={[0, 1]} + tickFormatter={(v: number) => `${(v * 100).toFixed(0)}%`} + label={{ + value: "Avg Score (%)", + angle: -90, + position: "insideLeft", + offset: 0, + fill: "var(--text-muted)", + fontSize: 11, + fontFamily: "'JetBrains Mono', monospace", + }} + /> + <Tooltip content={<CustomTooltip />} cursor={false} /> + + {/* Non-frontier points (dimmed) */} + <Scatter + name="configs" + data={nonFrontierPoints} + shape={nonFrontierShape} + isAnimationActive={false} + legendType="none" + /> + + {/* Frontier points (prominent) with connecting line */} + <Scatter + 
name="frontier" + data={frontierPoints} + shape={frontierShape} + isAnimationActive={false} + legendType="none" + line={{ stroke: "hsl(92 28% 65%)", strokeWidth: 1.5, strokeDasharray: "6 3" }} + lineType="joint" + lineJointType="linear" + /> + </ScatterChart> + </ResponsiveContainer> + + {/* Frontier labels below the chart */} + {frontierPoints.length > 0 && ( + <div + style={{ + marginTop: "12px", + display: "flex", + flexWrap: "wrap", + gap: "8px", + }} + > + {frontierPoints + .sort((a, b) => a.avgCost - b.avgCost) + .map((point) => ( + <div + key={point.cell_id} + onMouseEnter={() => setHoveredId(point.cell_id)} + onMouseLeave={() => setHoveredId(null)} + style={{ + padding: "4px 8px", + background: "var(--surface-2)", + border: "1px solid var(--border)", + fontSize: "10px", + fontFamily: "'JetBrains Mono', monospace", + color: getModelColor(point.model), + cursor: "default", + transition: "border-color 0.15s", + borderColor: + hoveredId === point.cell_id + ? "hsl(92 28% 65%)" + : "var(--border)", + }} + > + {point.label} + <span + style={{ color: "var(--text-muted)", marginLeft: "8px" }} + > + ${point.avgCost.toFixed(2)} /{" "} + {(point.avgScore * 100).toFixed(0)}% + </span> + </div> + ))} + </div> + )} + </div> + ); +} diff --git a/dashboard/src/components/Filters.tsx b/dashboard/src/components/Filters.tsx @@ -13,6 +13,11 @@ const AXIS_LABELS: Record<AxisName, string> = { prompt_style: "Prompt", language: "Language", human_language: "Human Lang", + tool_read: "Tool: Read", + tool_write: "Tool: Write", + tool_edit: "Tool: Edit", + tool_glob: "Tool: Glob", + tool_grep: "Tool: Grep", linter: "Linter", playwright: "Playwright", context_file: "Context", diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx @@ -0,0 +1,311 @@ +import { useState, useMemo } from "react"; +import type { Run, AxisName } from "../lib/data"; +import { AXIS_NAMES } from "../lib/data"; + +interface HeatmapMatrixProps { + runs: Run[]; +} + 
/**
 * Maps a percentage to an `hsl()` colour string on a red -> yellow -> green
 * ramp: 0 is `hsl(355 52% 64%)`, 50 is `hsl(40 71% 73%)`, 100 is
 * `hsl(92 28% 65%)`, with linear interpolation of each channel in between.
 *
 * Input is clamped to [0, 100] and NaN is treated as 0, so the result is
 * always a well-formed colour rather than an extrapolated or `NaN` value.
 *
 * @param pct - score as a percentage
 * @returns a space-separated `hsl(h s% l%)` string
 */
function scoreToColor(pct: number): string {
  const clamped = Number.isNaN(pct) ? 0 : Math.min(100, Math.max(0, pct));

  if (clamped <= 50) {
    // red to yellow
    const t = clamped / 50;
    // Hue goes 355 -> 40 the short way round the wheel (through 360),
    // hence the +360 and the modulo below.
    const h = 355 + t * (40 - 355 + 360);
    const s = 52 + t * (71 - 52);
    const l = 64 + t * (73 - 64);
    return `hsl(${h % 360} ${s}% ${l}%)`;
  }

  // yellow to green
  const t = (clamped - 50) / 50;
  const h = 40 + t * (92 - 40);
  const s = 71 + t * (28 - 71);
  const l = 73 + t * (65 - 73);
  return `hsl(${h} ${s}% ${l}%)`;
}

/**
 * Returns the `scoreToColor` colour for `pct` at 0.18 alpha, for use as a
 * cell background: `hsl(h s% l%)` becomes `hsla(h s% l% / 0.18)`.
 *
 * @param pct - score as a percentage
 * @returns an `hsla(...)` string with a fixed 0.18 alpha
 */
function cellBackground(pct: number): string {
  const color = scoreToColor(pct);
  // Use the color at low opacity for the cell background
  return color.replace("hsl(", "hsla(").replace(")", " / 0.18)");
}
(!cellMap[rv]) cellMap[rv] = {}; + if (!cellMap[rv][cv]) cellMap[rv][cv] = { totalScore: 0, count: 0 }; + + cellMap[rv][cv].totalScore += score; + cellMap[rv][cv].count += 1; + } + + return { + rowValues: Array.from(rowSet).sort(), + colValues: Array.from(colSet).sort(), + cells: cellMap, + }; + }, [runs, rowAxis, colAxis]); + + const selectorStyle: React.CSSProperties = { + background: "var(--surface-2)", + border: "1px solid var(--border)", + borderRadius: 0, + color: "var(--text)", + fontFamily: "var(--font-mono)", + fontSize: "var(--text-ui)", + padding: "6px 10px", + textTransform: "uppercase" as const, + letterSpacing: "0.5px", + }; + + const labelStyle: React.CSSProperties = { + fontSize: "var(--text-label)", + color: "var(--text-muted)", + textTransform: "uppercase" as const, + letterSpacing: "1px", + fontWeight: 500, + fontFamily: "var(--font-mono)", + }; + + return ( + <div + style={{ + background: "var(--surface-1)", + border: "1px solid var(--border)", + borderRadius: 0, + padding: "20px", + }} + > + {/* Axis selectors */} + <div + style={{ + display: "flex", + gap: "24px", + marginBottom: "20px", + flexWrap: "wrap", + alignItems: "flex-end", + }} + > + <div style={{ display: "flex", flexDirection: "column", gap: "4px" }}> + <label style={labelStyle}>Row Axis</label> + <select + value={rowAxis} + onChange={(e) => setRowAxis(e.target.value as AxisName)} + style={selectorStyle} + > + {AXIS_NAMES.map((axis) => ( + <option key={axis} value={axis}> + {AXIS_LABELS[axis]} + </option> + ))} + </select> + </div> + <div style={{ display: "flex", flexDirection: "column", gap: "4px" }}> + <label style={labelStyle}>Column Axis</label> + <select + value={colAxis} + onChange={(e) => setColAxis(e.target.value as AxisName)} + style={selectorStyle} + > + {AXIS_NAMES.map((axis) => ( + <option key={axis} value={axis}> + {AXIS_LABELS[axis]} + </option> + ))} + </select> + </div> + </div> + + {/* Heatmap table */} + {rowValues.length === 0 || colValues.length === 0 ? 
( + <div + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + fontFamily: "var(--font-mono)", + }} + > + No scored runs available for this axis combination. + </div> + ) : ( + <div style={{ overflowX: "auto" }}> + <table + style={{ + borderCollapse: "collapse", + width: "auto", + fontFamily: "var(--font-mono)", + }} + > + <thead> + <tr> + <th + style={{ + padding: "8px 12px", + fontSize: "var(--text-label)", + textTransform: "uppercase", + letterSpacing: "1px", + fontWeight: 500, + color: "var(--text-muted)", + background: "var(--surface-2)", + border: "1px solid var(--border)", + borderRadius: 0, + textAlign: "left", + }} + > + {AXIS_LABELS[rowAxis]} \ {AXIS_LABELS[colAxis]} + </th> + {colValues.map((col) => ( + <th + key={col} + style={{ + padding: "8px 12px", + fontSize: "var(--text-label)", + textTransform: "uppercase", + letterSpacing: "1px", + fontWeight: 500, + color: "var(--text-muted)", + background: "var(--surface-2)", + border: "1px solid var(--border)", + borderRadius: 0, + textAlign: "center", + fontFamily: "var(--font-mono)", + }} + > + {col} + </th> + ))} + </tr> + </thead> + <tbody> + {rowValues.map((row) => ( + <tr key={row}> + <td + style={{ + padding: "8px 12px", + fontSize: "var(--text-label)", + textTransform: "uppercase", + letterSpacing: "1px", + fontWeight: 600, + fontFamily: "var(--font-mono)", + color: "var(--text)", + background: "var(--surface-2)", + border: "1px solid var(--border)", + borderRadius: 0, + whiteSpace: "nowrap", + }} + > + {row} + </td> + {colValues.map((col) => { + const cell = cells[row]?.[col]; + if (!cell) { + return ( + <td + key={col} + style={{ + padding: "10px 16px", + textAlign: "center", + color: "var(--text-muted)", + fontFamily: "var(--font-mono)", + fontSize: "var(--text-ui)", + border: "1px solid var(--border)", + borderRadius: 0, + background: "var(--surface-0)", + }} + > + - + </td> + ); + } + + const avg = cell.totalScore / cell.count; + const pct = avg * 100; + + return ( + 
/**
 * Reads the `score` of one evaluation dimension from a run.
 *
 * Returns null when the run has no `eval_results`, when the dimension entry
 * is missing or not an object with a `score` property, or when that score is
 * not a finite number. Rejecting NaN/Infinity keeps a single bad value from
 * turning a whole averaged dimension into NaN.
 *
 * @param run - the run to read from
 * @param dim - the dimension key inside `eval_results`
 * @returns the finite score, or null when unavailable
 */
function extractDimensionScore(run: Run, dim: Dimension): number | null {
  if (!run.eval_results) return null;
  const section = run.eval_results[dim as keyof typeof run.eval_results];
  if (section && typeof section === "object" && "score" in section) {
    const score: unknown = (section as { score: unknown }).score;
    return typeof score === "number" && Number.isFinite(score) ? score : null;
  }
  return null;
}

/** One selectable configuration: a cell id, its display label, and its runs. */
interface CellConfig {
  cell_id: string;
  label: string;
  runs: Run[];
}

/**
 * Groups runs by `meta.cell_id` and builds one entry per cell.
 *
 * The label is `"model / language / prompt_style / effort"` taken from the
 * first run seen for the cell. Entries are sorted by label.
 *
 * @param runs - runs to group
 * @returns the cell entries, ordered by label via `localeCompare`
 */
function buildCellConfigs(runs: Run[]): CellConfig[] {
  const grouped: Record<string, Run[]> = {};
  for (const run of runs) {
    const id = run.meta.cell_id;
    let bucket = grouped[id];
    if (!bucket) {
      bucket = [];
      grouped[id] = bucket;
    }
    bucket.push(run);
  }

  return Object.entries(grouped)
    .map(([cell_id, cellRuns]) => {
      // Every bucket is created together with its first run, so index 0 exists.
      const m = cellRuns[0].meta;
      const label = `${m.model} / ${m.language} / ${m.prompt_style} / ${m.effort}`;
      return { cell_id, label, runs: cellRuns };
    })
    .sort((a, b) => a.label.localeCompare(b.label));
}

/**
 * Averages one dimension's score over the runs that have a usable value.
 *
 * @param runs - runs to average over
 * @param dim - the dimension to average
 * @returns the mean score, or null when no run has a usable score
 */
function averageScores(
  runs: Run[],
  dim: Dimension
): number | null {
  const scores = runs
    .map((r) => extractDimensionScore(r, dim))
    .filter((s): s is number => s !== null);
  if (scores.length === 0) return null;
  return scores.reduce((a, b) => a + b, 0) / scores.length;
}
useState<string>( + cellConfigs[0]?.cell_id ?? "" + ); + const [selectedB, setSelectedB] = useState<string>( + cellConfigs[1]?.cell_id ?? cellConfigs[0]?.cell_id ?? "" + ); + + const configA = cellConfigs.find((c) => c.cell_id === selectedA); + const configB = cellConfigs.find((c) => c.cell_id === selectedB); + + const data: RadarDatum[] = useMemo(() => { + return DIMENSIONS.map((dim) => { + const scoreA = configA ? averageScores(configA.runs, dim) : null; + const scoreB = configB ? averageScores(configB.runs, dim) : null; + return { + dimension: DIMENSION_LABELS[dim], + scoreA: scoreA ?? 0, + scoreB: scoreB ?? 0, + labelA: scoreA !== null ? (scoreA * 100).toFixed(0) + "%" : "n/a", + labelB: scoreB !== null ? (scoreB * 100).toFixed(0) + "%" : "n/a", + }; + }); + }, [configA, configB]); + + if (cellConfigs.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: "40px", + color: "var(--text-muted)", + borderRadius: 0, + }} + > + No configurations available for comparison. 
+ </div> + ); + } + + return ( + <div + style={{ + background: "var(--surface-1)", + border: "1px solid var(--border)", + borderRadius: 0, + padding: "20px", + }} + > + <h3 style={{ marginBottom: "16px" }}>Quality Radar Comparison</h3> + + <div + style={{ + display: "flex", + gap: "24px", + marginBottom: "20px", + flexWrap: "wrap", + }} + > + <div className="filter-group"> + <label style={{ color: COLOR_A, fontWeight: 600 }}>Config A</label> + <select + value={selectedA} + onChange={(e) => setSelectedA(e.target.value)} + > + {cellConfigs.map((c) => ( + <option key={c.cell_id} value={c.cell_id}> + {c.label} (n={c.runs.length}) + </option> + ))} + </select> + </div> + + <div className="filter-group"> + <label style={{ color: COLOR_B, fontWeight: 600 }}>Config B</label> + <select + value={selectedB} + onChange={(e) => setSelectedB(e.target.value)} + > + {cellConfigs.map((c) => ( + <option key={c.cell_id} value={c.cell_id}> + {c.label} (n={c.runs.length}) + </option> + ))} + </select> + </div> + </div> + + <ResponsiveContainer width="100%" height={420}> + <RadarChart cx="50%" cy="50%" outerRadius="70%" data={data}> + <PolarGrid + stroke="var(--border)" + strokeDasharray="3 3" + /> + <PolarAngleAxis + dataKey="dimension" + tick={(props: Record<string, unknown>) => ( + <CustomTick + payload={props.payload as { value: string }} + x={props.x as number} + y={props.y as number} + data={data} + /> + )} + /> + <PolarRadiusAxis + angle={90} + domain={[0, 1]} + tickCount={6} + tick={{ + fill: "var(--text-muted)", + fontSize: 10, + fontFamily: "'JetBrains Mono', monospace", + }} + tickFormatter={(v: number) => (v * 100).toFixed(0) + "%"} + stroke="var(--border)" + /> + <Radar + name="Config A" + dataKey="scoreA" + stroke={COLOR_A} + fill={COLOR_A} + fillOpacity={0.3} + strokeWidth={2} + /> + <Radar + name="Config B" + dataKey="scoreB" + stroke={COLOR_B} + fill={COLOR_B} + fillOpacity={0.3} + strokeWidth={2} + /> + <Tooltip + contentStyle={{ + background: "var(--surface-1)", + 
border: "1px solid var(--border)", + borderRadius: "2px", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + }} + formatter={(value: unknown, name: unknown) => [ + ((Number(value) || 0) * 100).toFixed(1) + "%", + String(name), + ]} + /> + </RadarChart> + </ResponsiveContainer> + + <div + style={{ + display: "flex", + justifyContent: "center", + gap: "24px", + marginTop: "12px", + fontSize: "11px", + fontFamily: "var(--font-mono)", + }} + > + <span> + <span + style={{ + display: "inline-block", + width: "12px", + height: "12px", + background: COLOR_A, + opacity: 0.7, + marginRight: "6px", + verticalAlign: "middle", + }} + /> + <span style={{ color: COLOR_A }}>Config A</span> + </span> + <span> + <span + style={{ + display: "inline-block", + width: "12px", + height: "12px", + background: COLOR_B, + opacity: 0.7, + marginRight: "6px", + verticalAlign: "middle", + }} + /> + <span style={{ color: COLOR_B }}>Config B</span> + </span> + </div> + </div> + ); +} diff --git a/dashboard/src/layouts/Base.astro b/dashboard/src/layouts/Base.astro @@ -35,6 +35,7 @@ try { <nav style="display: flex; gap: 16px; font-size: 0.875rem;"> <a href="/">Grid</a> <a href="/insights">Insights</a> + <a href="/explore">Explore</a> <a href="/compare">Compare</a> </nav> </div> diff --git a/dashboard/src/pages/explore.astro b/dashboard/src/pages/explore.astro @@ -0,0 +1,35 @@ +--- +import Base from "../layouts/Base.astro"; +import { loadAllRuns } from "../lib/data"; +import HeatmapMatrix from "../components/HeatmapMatrix"; +import RadarComparison from "../components/RadarComparison"; +import BumpChart from "../components/BumpChart"; +import ConfigTreemap from "../components/ConfigTreemap"; +import EfficiencyFrontier from "../components/EfficiencyFrontier"; +import CorrelationMatrix from "../components/CorrelationMatrix"; + +const runs = loadAllRuns(); +--- + +<Base title="Explore"> + <h1 style="margin-bottom: 8px;">Explore</h1> + <p style="color: var(--text-muted); margin-bottom: 
32px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;"> + Interactive visualizations of the experiment space + </p> + + <div style="display: flex; flex-direction: column; gap: 32px;"> + <CorrelationMatrix client:load runs={runs} /> + + <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;"> + <EfficiencyFrontier client:load runs={runs} /> + <BumpChart client:load runs={runs} /> + </div> + + <HeatmapMatrix client:load runs={runs} /> + + <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;"> + <RadarComparison client:load runs={runs} /> + <ConfigTreemap client:load runs={runs} /> + </div> + </div> +</Base>

Impressum · Datenschutz