loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit a611a5d3411b89d26ba3192da0591c010d0a784a
parent 12077385fc8b2b124de9766fa7796e70b4487ef3
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 17:37:53 +0200

Replace task chart with Top/Bottom 10 configs on grid page

Best & Worst configurations with key differentiator badges.
Removed redundant Pass Rate by Task chart (only one task).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx | 309 +++++++++++++------------------------------------------------------------------
A dashboard/src/components/TopBottomConfigs.tsx | 321 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/index.astro | 6 +++++-
3 files changed, 376 insertions(+), 260 deletions(-)

diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx @@ -8,9 +8,7 @@ import { CartesianGrid, Tooltip, ResponsiveContainer, - Legend, Cell, - ZAxis, } from "recharts"; import type { Run } from "../lib/types"; import { getModelColor, modelSortOrder } from "../lib/colors"; @@ -35,28 +33,6 @@ interface BoxPlotData { color: string; } -interface TaskBoxPlotData { - label: string; - // Score distribution - sMin: number; - sQ1: number; - sMedian: number; - sQ3: number; - sMax: number; - sBase: number; - sIqr: number; - sScores: number[]; - // Pass rate distribution - pMin: number; - pQ1: number; - pMedian: number; - pQ3: number; - pMax: number; - pBase: number; - pIqr: number; - pScores: number[]; - cellCount: number; -} const SMUI = { surface0: "hsl(213 16% 12%)", @@ -199,33 +175,6 @@ function aggregateByModel(runs: Run[]): BoxPlotData[] { }); } -function aggregateByTask(runs: Run[]): TaskBoxPlotData[] { - const cells = aggregateCells(runs); - const byTask: Record<string, CellAggregate[]> = {}; - - for (const cell of cells) { - if (!byTask[cell.task]) byTask[cell.task] = []; - byTask[cell.task].push(cell); - } - - return Object.entries(byTask).map(([task, taskCells]) => { - const scores = taskCells.map((c) => Math.round(c.avgScore * 100)); - const passRates = taskCells.map((c) => Math.round(c.passRate * 100)); - const sStats = computeBoxStats(scores); - const pStats = computeBoxStats(passRates); - - return { - label: `${task} (n=${taskCells.length})`, - sMin: sStats.min, sQ1: sStats.q1, sMedian: sStats.median, sQ3: sStats.q3, sMax: sStats.max, - sBase: sStats.q1, sIqr: sStats.q3 - sStats.q1, - sScores: scores, - pMin: pStats.min, pQ1: pStats.q1, pMedian: pStats.median, pQ3: pStats.q3, pMax: pStats.max, - pBase: pStats.q1, pIqr: pStats.q3 - pStats.q1, - pScores: passRates, - cellCount: taskCells.length, - }; - }); -} // Custom shape: draws a box from q1 to q3 with whiskers from min to max and a median line function BoxPlotShape(props: any) 
{ @@ -272,69 +221,8 @@ function BoxPlotShape(props: any) { ); } -// Custom shape for task box plots (score or pass rate) -function TaskBoxPlotShape(prefix: "s" | "p", color: string) { - return function Shape(props: any) { - const { x, y, width, height, payload } = props as { - x: number; y: number; width: number; height: number; - payload: TaskBoxPlotData; - }; - if (!payload || height === undefined) return null; - - const min = payload[`${prefix}Min`] as number; - const q1 = payload[`${prefix}Q1`] as number; - const q3 = payload[`${prefix}Q3`] as number; - const median = payload[`${prefix}Median`] as number; - const max = payload[`${prefix}Max`] as number; - const boxTop = y; - const boxBottom = y + height; - const centerX = x + width / 2; - const dataToY = (val: number): number => { - if (q3 === q1) return boxTop; - return boxTop + ((q3 - val) / (q3 - q1)) * (boxBottom - boxTop); - }; - - const minY = dataToY(min); - const maxY = dataToY(max); - const medianY = dataToY(median); - const whiskerHalfW = width * 0.3; - - return ( - <g> - <line x1={centerX} y1={minY} x2={centerX} y2={maxY} stroke={SMUI.muted} strokeWidth={1} /> - <line x1={centerX - whiskerHalfW} y1={minY} x2={centerX + whiskerHalfW} y2={minY} stroke={SMUI.muted} strokeWidth={1} /> - <line x1={centerX - whiskerHalfW} y1={maxY} x2={centerX + whiskerHalfW} y2={maxY} stroke={SMUI.muted} strokeWidth={1} /> - <rect x={x} y={boxTop} width={width} height={Math.max(height, 1)} fill={color} fillOpacity={0.3} stroke={color} strokeWidth={1} /> - <line x1={x} y1={medianY} x2={x + width} y2={medianY} stroke={color} strokeWidth={2} /> - </g> - ); - }; -} - -// Build scatter data for individual cell dots on model chart -function modelScatterData(data: BoxPlotData[]): Array<{ label: string; score: number; color: string }> { - const points: Array<{ label: string; score: number; color: string }> = []; - for (const d of data) { - for (const s of d.scores) { - points.push({ label: d.label, score: s, color: d.color }); - 
} - } - return points; -} - -// Build scatter data for task chart -function taskScatterData(data: TaskBoxPlotData[], prefix: "s" | "p", color: string): Array<{ label: string; value: number; color: string }> { - const key = `${prefix}Scores` as "sScores" | "pScores"; - const points: Array<{ label: string; value: number; color: string }> = []; - for (const d of data) { - for (const s of d[key]) { - points.push({ label: d.label, value: s, color }); - } - } - return points; -} // Custom tooltip for model box plot function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean; payload?: Array<{ payload: BoxPlotData }>; label?: string }) { @@ -352,28 +240,6 @@ function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean; ); } -// Custom tooltip for task box plot -function TaskBoxTooltipContent({ active, payload, label }: { active?: boolean; payload?: Array<{ payload: TaskBoxPlotData }>; label?: string }) { - if (!active || !payload || payload.length === 0) return null; - const d = payload[0].payload; - return ( - <div style={TOOLTIP_STYLE}> - <div style={{ marginBottom: 4, fontWeight: 600 }}>{label}</div> - <div style={{ marginBottom: 4 }}> - <div style={{ color: SMUI.frost2, fontWeight: 600 }}>Score</div> - <div>Max: {d.sMax}% / Q3: {Math.round(d.sQ3)}%</div> - <div>Median: {Math.round(d.sMedian)}%</div> - <div>Q1: {Math.round(d.sQ1)}% / Min: {d.sMin}%</div> - </div> - <div> - <div style={{ color: SMUI.green, fontWeight: 600 }}>Pass Rate</div> - <div>Max: {d.pMax}% / Q3: {Math.round(d.pQ3)}%</div> - <div>Median: {Math.round(d.pMedian)}%</div> - <div>Q1: {Math.round(d.pQ1)}% / Min: {d.pMin}%</div> - </div> - </div> - ); -} export default function Charts({ runs }: ChartsProps) { // Extract unique models sorted consistently @@ -397,134 +263,59 @@ export default function Charts({ runs }: ChartsProps) { const filteredRuns = runs.filter((r) => selectedModels.has(r.meta.actual_model || r.meta.model)); const modelData = 
aggregateByModel(filteredRuns); - const taskData = aggregateByTask(filteredRuns); - const modelDots = modelScatterData(modelData); - const taskScoreDots = taskScatterData(taskData, "s", SMUI.frost2); - const taskPassDots = taskScatterData(taskData, "p", SMUI.green); return ( - <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "16px" }}> - <div className="card"> - <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}> - <h3 style={{ margin: 0 }}>Score Distribution by Model</h3> - <ModelSelector - allModels={allModels} - selectedModels={selectedModels} - onChange={setSelectedModels} - /> - </div> - <ResponsiveContainer width="100%" height={270}> - <ComposedChart data={modelData} barCategoryGap="20%"> - <CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} /> - <XAxis - dataKey="label" - stroke={SMUI.muted} - tickLine={false} - axisLine={{ stroke: SMUI.border }} - interval={0} - tick={({ x, y, payload }: any) => { - const [name, count] = (payload.value as string).split("|"); - return ( - <g> - <text x={x} y={y + 12} textAnchor="middle" fill={SMUI.muted} fontSize={10} fontFamily="'JetBrains Mono', monospace">{name}</text> - <text x={x} y={y + 24} textAnchor="middle" fill={SMUI.muted} fontSize={8} fontFamily="'JetBrains Mono', monospace" opacity={0.6}>{count}</text> - </g> - ); - }} - height={40} - /> - <YAxis - stroke={SMUI.muted} - fontSize={11} - fontFamily="'JetBrains Mono', monospace" - domain={[0, 100]} - tickLine={false} - axisLine={false} - yAxisId="score" - /> - <Tooltip content={<ModelBoxTooltipContent />} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} /> - {/* Invisible base bar to push the visible box up to q1 */} - <Bar dataKey="base" stackId="box" fill="transparent" barSize={40} yAxisId="score" /> - {/* Visible IQR box with custom shape for whiskers and median */} - <Bar dataKey="iqr" stackId="box" barSize={40} yAxisId="score" 
shape={<BoxPlotShape />}> - {modelData.map((entry) => ( - <Cell key={entry.label} fill={entry.color} /> - ))} - </Bar> - {/* Hidden scatter to keep recharts scale consistent */} - <Scatter data={[]} dataKey="score" yAxisId="score" fill="transparent" /> - </ComposedChart> - </ResponsiveContainer> - </div> - - <div className="card"> - <h3 style={{ marginBottom: "16px" }}>Score & Pass Rate Distribution by Task</h3> - <ResponsiveContainer width="100%" height={250}> - <ComposedChart data={taskData} barCategoryGap="20%"> - <CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} /> - <XAxis - dataKey="label" - stroke={SMUI.muted} - fontSize={11} - fontFamily="'JetBrains Mono', monospace" - tickLine={false} - axisLine={{ stroke: SMUI.border }} - /> - <YAxis - stroke={SMUI.muted} - fontSize={11} - fontFamily="'JetBrains Mono', monospace" - domain={[0, 100]} - tickLine={false} - axisLine={false} - yAxisId="score" - /> - <Tooltip content={<TaskBoxTooltipContent />} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} /> - <Legend - wrapperStyle={{ - fontFamily: "'JetBrains Mono', monospace", - fontSize: "11px", - textTransform: "uppercase", - letterSpacing: "0.5px", - }} - /> - {/* Score box plot */} - <Bar dataKey="sBase" stackId="scoreBox" fill="transparent" barSize={30} yAxisId="score" name=" " legendType="none" /> - <Bar dataKey="sIqr" stackId="scoreBox" barSize={30} yAxisId="score" name="Score %" shape={TaskBoxPlotShape("s", SMUI.frost2)} fill={SMUI.frost2} /> - {/* Pass rate box plot */} - <Bar dataKey="pBase" stackId="passBox" fill="transparent" barSize={30} yAxisId="score" name=" " legendType="none" /> - <Bar dataKey="pIqr" stackId="passBox" barSize={30} yAxisId="score" name="Pass Rate %" shape={TaskBoxPlotShape("p", SMUI.green)} fill={SMUI.green} /> - {/* Individual score dots */} - <Scatter - data={taskScoreDots} - dataKey="value" - yAxisId="score" - fill={SMUI.frost2} - fillOpacity={0.5} - legendType="none" - > - <ZAxis range={[15, 15]} /> - 
{taskScoreDots.map((_, i) => ( - <Cell key={i} fill={SMUI.frost2} fillOpacity={0.5} /> - ))} - </Scatter> - {/* Individual pass rate dots */} - <Scatter - data={taskPassDots} - dataKey="value" - yAxisId="score" - fill={SMUI.green} - fillOpacity={0.5} - legendType="none" - > - <ZAxis range={[15, 15]} /> - {taskPassDots.map((_, i) => ( - <Cell key={i} fill={SMUI.green} fillOpacity={0.5} /> - ))} - </Scatter> - </ComposedChart> - </ResponsiveContainer> + <div className="card"> + <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}> + <h3 style={{ margin: 0 }}>Score Distribution by Model</h3> + <ModelSelector + allModels={allModels} + selectedModels={selectedModels} + onChange={setSelectedModels} + /> </div> + <ResponsiveContainer width="100%" height={270}> + <ComposedChart data={modelData} barCategoryGap="20%"> + <CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} /> + <XAxis + dataKey="label" + stroke={SMUI.muted} + tickLine={false} + axisLine={{ stroke: SMUI.border }} + interval={0} + tick={({ x, y, payload }: any) => { + const [name, count] = (payload.value as string).split("|"); + return ( + <g> + <text x={x} y={y + 12} textAnchor="middle" fill={SMUI.muted} fontSize={10} fontFamily="'JetBrains Mono', monospace">{name}</text> + <text x={x} y={y + 24} textAnchor="middle" fill={SMUI.muted} fontSize={8} fontFamily="'JetBrains Mono', monospace" opacity={0.6}>{count}</text> + </g> + ); + }} + height={40} + /> + <YAxis + stroke={SMUI.muted} + fontSize={11} + fontFamily="'JetBrains Mono', monospace" + domain={[0, 100]} + tickLine={false} + axisLine={false} + yAxisId="score" + /> + <Tooltip content={<ModelBoxTooltipContent />} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} /> + {/* Invisible base bar to push the visible box up to q1 */} + <Bar dataKey="base" stackId="box" fill="transparent" barSize={40} yAxisId="score" /> + {/* Visible IQR box with custom shape 
for whiskers and median */} + <Bar dataKey="iqr" stackId="box" barSize={40} yAxisId="score" shape={<BoxPlotShape />}> + {modelData.map((entry) => ( + <Cell key={entry.label} fill={entry.color} /> + ))} + </Bar> + {/* Hidden scatter to keep recharts scale consistent */} + <Scatter data={[]} dataKey="score" yAxisId="score" fill="transparent" /> + </ComposedChart> + </ResponsiveContainer> </div> ); } diff --git a/dashboard/src/components/TopBottomConfigs.tsx b/dashboard/src/components/TopBottomConfigs.tsx @@ -0,0 +1,321 @@ +import { useMemo } from "react"; +import type { Run, AxisName } from "../lib/types"; +import { AXIS_NAMES } from "../lib/types"; +import { groupIntoCells } from "../lib/analysis"; +import { getModelColor } from "../lib/colors"; + +interface TopBottomConfigsProps { + runs: Run[]; +} + +const SMUI = { + surface0: "hsl(213 16% 12%)", + surface1: "hsl(217 16% 15.5%)", + surface2: "hsl(216 15% 19%)", + border: "hsl(217 17% 28%)", + muted: "hsl(213 14% 65%)", + green: "hsl(92 28% 65%)", + red: "hsl(355 52% 64%)", +}; + +// Axes to skip when computing differentiators (metadata, not config) +const SKIP_DIFF_AXES = new Set([ + "actual_model", + "provider", +]); + +// Compute which axes vary across cells, and find the most common (default) value for each +function computeDefaults( + cells: Array<{ meta: Run["meta"] }> +): { varyingAxes: AxisName[]; defaults: Record<string, string> } { + const axisCounts: Record<string, Record<string, number>> = {}; + + for (const axis of AXIS_NAMES) { + if (SKIP_DIFF_AXES.has(axis)) continue; + axisCounts[axis] = {}; + for (const cell of cells) { + const val = String((cell.meta as unknown as Record<string, unknown>)[axis] ?? 
""); + axisCounts[axis][val] = (axisCounts[axis][val] || 0) + 1; + } + } + + const defaults: Record<string, string> = {}; + const varyingAxes: AxisName[] = []; + + for (const axis of AXIS_NAMES) { + if (SKIP_DIFF_AXES.has(axis)) continue; + const counts = axisCounts[axis]; + if (!counts) continue; + const values = Object.keys(counts); + if (values.length <= 1) continue; // same value everywhere, skip + varyingAxes.push(axis); + // Default = most common value + let maxCount = 0; + let defaultVal = values[0]; + for (const [val, count] of Object.entries(counts)) { + if (count > maxCount) { + maxCount = count; + defaultVal = val; + } + } + defaults[axis] = defaultVal; + } + + return { varyingAxes, defaults }; +} + +function getDifferentiators( + meta: Run["meta"], + varyingAxes: AxisName[], + defaults: Record<string, string> +): string[] { + const badges: string[] = []; + for (const axis of varyingAxes) { + const val = String((meta as unknown as Record<string, unknown>)[axis] ?? ""); + if (val !== defaults[axis]) { + // Format: show just the value for known axes, or axis=value for clarity + const label = formatBadge(axis, val); + if (label) badges.push(label); + } + } + return badges; +} + +function formatBadge(axis: string, value: string): string { + // For boolean-like axes (tool_*, linter, playwright, web_search, etc.), show axis name + state + if (value === "on" || value === "off") { + const shortName = axis.replace("tool_", "").replace("_", " "); + return value === "on" ? shortName : `no ${shortName}`; + } + // For model, effort, prompt_style, language, etc., just show the value + return value; +} + +interface CellEntry { + cellId: string; + model: string; + avgScore: number; + meta: Run["meta"]; + runCount: number; + badges: string[]; +} + +function BarRow({ + entry, + maxScore, + accentColor, +}: { + entry: CellEntry; + maxScore: number; + accentColor: string; +}) { + const pct = Math.round(entry.avgScore * 100); + const barWidth = maxScore > 0 ? 
(entry.avgScore / maxScore) * 100 : 0; + const modelColor = getModelColor(entry.model); + + return ( + <div + style={{ + display: "flex", + alignItems: "center", + gap: 8, + marginBottom: 3, + fontFamily: "'JetBrains Mono', monospace", + fontSize: 11, + }} + > + {/* Bar */} + <div + style={{ + position: "relative", + width: "40%", + minWidth: 80, + height: 18, + background: SMUI.surface2, + flexShrink: 0, + }} + > + <div + style={{ + position: "absolute", + top: 0, + left: 0, + height: "100%", + width: `${barWidth}%`, + background: modelColor, + opacity: 0.7, + }} + /> + <div + style={{ + position: "absolute", + top: 0, + left: 0, + height: "100%", + width: `${barWidth}%`, + borderLeft: `2px solid ${accentColor}`, + boxSizing: "border-box", + }} + /> + <span + style={{ + position: "absolute", + left: 4, + top: 1, + fontSize: 10, + color: "#fff", + fontWeight: 600, + textShadow: "0 1px 2px rgba(0,0,0,0.6)", + }} + > + {pct}% + </span> + </div> + + {/* Model name */} + <span + style={{ + color: modelColor, + fontWeight: 600, + fontSize: 10, + flexShrink: 0, + minWidth: 40, + }} + > + {entry.model} + </span> + + {/* Config badges */} + <div + style={{ + display: "flex", + flexWrap: "wrap", + gap: 3, + overflow: "hidden", + }} + > + {entry.badges.map((badge, i) => ( + <span + key={i} + style={{ + background: SMUI.surface2, + border: `1px solid ${SMUI.border}`, + padding: "1px 5px", + fontSize: 9, + color: SMUI.muted, + fontFamily: "'JetBrains Mono', monospace", + whiteSpace: "nowrap", + lineHeight: "14px", + }} + > + {badge} + </span> + ))} + </div> + </div> + ); +} + +export default function TopBottomConfigs({ runs }: TopBottomConfigsProps) { + const { top10, bottom10 } = useMemo(() => { + const cells = groupIntoCells(runs); + if (cells.length === 0) return { top10: [], bottom10: [] }; + + const { varyingAxes, defaults } = computeDefaults(cells); + + const entries: CellEntry[] = cells + .filter((c) => c.score.avg > 0 || c.n > 0) + .map((c) => ({ + cellId: c.cell_id, + 
model: c.meta.actual_model || c.meta.model, + avgScore: c.score.avg, + meta: c.meta, + runCount: c.n, + badges: getDifferentiators(c.meta, varyingAxes, defaults), + })) + .sort((a, b) => b.avgScore - a.avgScore); + + const top10 = entries.slice(0, 10); + const bottom10 = entries.slice(-10).reverse(); // worst first (lowest at bottom) + + return { top10, bottom10 }; + }, [runs]); + + if (top10.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: 40, + color: SMUI.muted, + fontFamily: "'JetBrains Mono', monospace", + }} + > + No data yet. + </div> + ); + } + + const maxScore = Math.max( + ...top10.map((e) => e.avgScore), + ...bottom10.map((e) => e.avgScore) + ); + + return ( + <div className="card"> + <h3 style={{ margin: "0 0 16px 0" }}>Best & Worst Configurations</h3> + <div style={{ display: "flex", gap: 24, flexWrap: "wrap" }}> + {/* Top 10 */} + <div style={{ flex: 1, minWidth: 200 }}> + <div + style={{ + fontSize: 10, + fontFamily: "'JetBrains Mono', monospace", + textTransform: "uppercase", + letterSpacing: "0.5px", + color: SMUI.green, + marginBottom: 8, + fontWeight: 600, + }} + > + Top 10 + </div> + {top10.map((entry) => ( + <BarRow + key={entry.cellId} + entry={entry} + maxScore={maxScore} + accentColor={SMUI.green} + /> + ))} + </div> + + {/* Bottom 10 */} + <div style={{ flex: 1, minWidth: 200 }}> + <div + style={{ + fontSize: 10, + fontFamily: "'JetBrains Mono', monospace", + textTransform: "uppercase", + letterSpacing: "0.5px", + color: SMUI.red, + marginBottom: 8, + fontWeight: 600, + }} + > + Bottom 10 + </div> + {bottom10.map((entry) => ( + <BarRow + key={entry.cellId} + entry={entry} + maxScore={maxScore} + accentColor={SMUI.red} + /> + ))} + </div> + </div> + </div> + ); +} diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro @@ -4,6 +4,7 @@ import { loadAllRuns, getAxisValues, getTaskNames } from "../lib/data"; import type { Run } from "../lib/types"; import Grid from 
"../components/Grid"; import Charts from "../components/Charts"; +import TopBottomConfigs from "../components/TopBottomConfigs"; const runs = loadAllRuns(); const axisValues = getAxisValues(runs); @@ -102,7 +103,10 @@ const totalCells = new Set(runs.map(r => r.meta.cell_id)).size; </div> ))} - <Charts client:load runs={runs} /> + <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;"> + <Charts client:load runs={runs} /> + <TopBottomConfigs client:load runs={runs} /> + </div> <div style="margin-top: 32px;"> <h2 style="margin-bottom: 16px;">All Cells</h2>

Impressum · Datenschutz