loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 82f6a5b0b78b3ac64bfad4963937ddcd7687a317
parent 5e358b275032b8351588c74c53ef7c5853c1b8b4
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 08:03:28 +0200

Add variability analysis to insights page

Three new visualizations on /insights:

1. Score Distribution by Model (box plots)
   - Horizontal box plots showing min/Q1/median/Q3/max per model
   - Individual cell dots overlaid
   - Shows which model is most consistent

2. Reliability Ranking by Variable
   - For each variable value, shows average within-cell range
   - Sorted most reliable first (smallest range)
   - Green/yellow/red bars for consistency level

3. Variance Contribution (ANOVA decomposition)
   - Stacked bar showing: % from config choices vs % from randomness
   - Answers "does the loop config actually matter?"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/Variability.tsx | 718 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/insights.astro | 7 ++++++-
2 files changed, 724 insertions(+), 1 deletion(-)

/*
 * dashboard/src/components/Variability.tsx
 *
 * Variability analysis for the /insights page. Renders three cards:
 *   1. Score Distribution by Model  - horizontal box plots with per-cell dots
 *   2. Reliability Ranking          - average within-cell score range per variable value
 *   3. Variance Contribution        - between-cell vs within-cell share of score variance
 */
import { useMemo } from "react";
import type { CSSProperties } from "react";
import type { Run, AxisName } from "../lib/types";
import { AXIS_NAMES } from "../lib/types";
import { groupIntoCells } from "../lib/analysis";
import type { Cell } from "../lib/analysis";

interface VariabilityProps {
  runs: Run[];
}

/** Human-readable labels for the axis keys shown in the reliability table. */
const AXIS_LABELS: Record<string, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  tool_read: "Read Tool",
  tool_write: "Write Tool",
  tool_edit: "Edit Tool",
  tool_glob: "Glob Tool",
  tool_grep: "Grep Tool",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  sub_agents: "Sub-agents",
  web_search: "Web Search",
  max_budget: "Budget",
};

/* ---------- shared styles ---------- */

const sectionHeaderStyle: CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.08em",
  color: "var(--text-muted)",
  marginBottom: "4px",
};

const sectionTitleStyle: CSSProperties = {
  fontSize: "16px",
  fontWeight: 600,
  marginBottom: "4px",
};

const sectionDescStyle: CSSProperties = {
  fontSize: "12px",
  color: "var(--text-muted)",
  marginBottom: "16px",
  lineHeight: "1.4",
};

const cardStyle: CSSProperties = {
  border: "1px solid hsl(var(--border))",
  padding: "20px",
  marginBottom: "16px",
  background: "var(--surface-1)",
};

const axisLabelStyle: CSSProperties = {
  fontSize: "10px",
  fontFamily: "var(--font-mono)",
  color: "var(--text-muted)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
};

const labelPrimaryStyle: CSSProperties = {
  fontSize: "13px",
  fontFamily: "var(--font-mono)",
  fontWeight: 600,
  color: "var(--text)",
};

const labelSecondaryStyle: CSSProperties = {
  fontSize: "10px",
  fontFamily: "var(--font-mono)",
  color: "var(--text-muted)",
  marginTop: "2px",
};

const thStyle: CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
  color: "var(--text-muted)",
  padding: "8px 12px",
  textAlign: "left",
  borderBottom: "1px solid hsl(var(--border))",
  fontWeight: 500,
};

const tdStyle: CSSProperties = {
  fontSize: "12px",
  padding: "6px 12px",
  color: "var(--text)",
};

const legendStyle: CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
  color: "var(--text-muted)",
};

/* ---------- helpers ---------- */

/** Arithmetic mean; returns 0 for an empty list instead of NaN. */
function mean(values: number[]): number {
  if (values.length === 0) return 0;
  return values.reduce((a, b) => a + b, 0) / values.length;
}

/**
 * Linearly interpolated quantile.
 *
 * @param sorted Values in ascending order (the caller is responsible for sorting).
 * @param q Quantile in [0, 1], e.g. 0.5 for the median.
 * @returns The interpolated value; 0 for an empty list.
 */
function quantile(sorted: number[], q: number): number {
  if (sorted.length === 0) return 0;
  if (sorted.length === 1) return sorted[0];
  const pos = q * (sorted.length - 1);
  const lo = Math.floor(pos);
  const hi = Math.ceil(pos);
  if (lo === hi) return sorted[lo];
  return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]);
}

/** Population variance (divides by n); 0 when fewer than two values. */
function variance(values: number[]): number {
  if (values.length < 2) return 0;
  const mu = mean(values);
  return values.reduce((s, v) => s + (v - mu) ** 2, 0) / values.length;
}

/* ---------- Section 1: Box Plots ---------- */

interface BoxPlotStats {
  model: string;
  min: number;
  q1: number;
  median: number;
  q3: number;
  max: number;
  points: number[];
  cellCount: number;
}

/**
 * Five-number summary of cell average scores, one entry per model.
 *
 * Cells whose average score is not strictly positive are left out, and a model
 * with no remaining cells produces no entry. Entries are ordered by model name
 * using the default string sort.
 */
function computeBoxPlots(cells: Cell[]): BoxPlotStats[] {
  // Group in a single pass instead of re-filtering every cell once per model.
  const scoresByModel = new Map<string, number[]>();
  for (const cell of cells) {
    const score = cell.score.avg;
    if (!(score > 0)) continue;
    const bucket = scoresByModel.get(cell.meta.model);
    if (bucket) {
      bucket.push(score);
    } else {
      scoresByModel.set(cell.meta.model, [score]);
    }
  }

  const results: BoxPlotStats[] = [];
  for (const model of Array.from(scoresByModel.keys()).sort()) {
    const scores = (scoresByModel.get(model) ?? []).sort((a, b) => a - b);
    if (scores.length === 0) continue;

    results.push({
      model,
      min: scores[0],
      q1: quantile(scores, 0.25),
      median: quantile(scores, 0.5),
      q3: quantile(scores, 0.75),
      max: scores[scores.length - 1],
      points: scores,
      cellCount: scores.length,
    });
  }

  return results;
}

/** Horizontal box plot per model, all drawn against one shared score scale. */
function BoxPlotSection({ cells }: { cells: Cell[] }) {
  const stats = useMemo(() => computeBoxPlots(cells), [cells]);

  if (stats.length === 0) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        No scored cells available.
      </div>
    );
  }

  // Global scale across all models
  const globalMin = Math.min(...stats.map((s) => s.min));
  const globalMax = Math.max(...stats.map((s) => s.max));
  // Fall back to a tiny span so a single distinct score does not divide by zero.
  const range = globalMax - globalMin || 0.01;

  const toPercent = (v: number) => ((v - globalMin) / range) * 100;

  return (
    <div>
      {/* Axis labels */}
      <div
        style={{
          display: "flex",
          justifyContent: "space-between",
          marginBottom: "4px",
          paddingLeft: "140px",
          paddingRight: "12px",
        }}
      >
        <span style={axisLabelStyle}>{(globalMin * 100).toFixed(0)}%</span>
        <span style={axisLabelStyle}>{(globalMax * 100).toFixed(0)}%</span>
      </div>

      {stats.map((s) => (
        <div
          key={s.model}
          style={{
            display: "flex",
            alignItems: "center",
            marginBottom: "16px",
            gap: "12px",
          }}
        >
          {/* Label */}
          <div style={{ width: "120px", textAlign: "right", flexShrink: 0 }}>
            <div style={labelPrimaryStyle}>{s.model}</div>
            <div style={labelSecondaryStyle}>
              median {(s.median * 100).toFixed(1)}% / {s.cellCount} cells
            </div>
          </div>

          {/* Box plot */}
          <div
            style={{
              flex: 1,
              position: "relative",
              height: "32px",
              marginRight: "12px",
            }}
          >
            {/* Background track */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: 0,
                right: 0,
                height: "1px",
                background: "hsl(var(--border))",
                transform: "translateY(-50%)",
              }}
            />

            {/* Whisker line: min to max */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.min)}%`,
                width: `${toPercent(s.max) - toPercent(s.min)}%`,
                height: "2px",
                background: "var(--accent)",
                opacity: 0.5,
                transform: "translateY(-50%)",
              }}
            />

            {/* Min whisker cap */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.min)}%`,
                width: "1px",
                height: "12px",
                background: "var(--accent)",
                opacity: 0.5,
                transform: "translate(-50%, -50%)",
              }}
            />

            {/* Max whisker cap */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.max)}%`,
                width: "1px",
                height: "12px",
                background: "var(--accent)",
                opacity: 0.5,
                transform: "translate(-50%, -50%)",
              }}
            />

            {/* IQR box: Q1 to Q3 (kept at least 0.5% wide so it stays visible) */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.q1)}%`,
                width: `${Math.max(toPercent(s.q3) - toPercent(s.q1), 0.5)}%`,
                height: "18px",
                background: "var(--accent)",
                opacity: 0.2,
                border: "1px solid var(--accent)",
                transform: "translateY(-50%)",
              }}
            />

            {/* Median line */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.median)}%`,
                width: "2px",
                height: "22px",
                background: "var(--accent)",
                transform: "translate(-50%, -50%)",
              }}
            />

            {/* Individual cell dots */}
            {s.points.map((p, i) => (
              <div
                key={i}
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(p)}%`,
                  width: "5px",
                  height: "5px",
                  borderRadius: "50%",
                  background: "var(--accent)",
                  opacity: 0.6,
                  transform: "translate(-50%, -50%)",
                  zIndex: 1,
                }}
              />
            ))}
          </div>
        </div>
      ))}
    </div>
  );
}

/* ---------- Section 2: Reliability Ranking ---------- */

interface ReliabilityRow {
  axis: string;
  value: string;
  avgScore: number;
  avgRange: number;
  n: number;
}

/**
 * For every axis in AXIS_NAMES and every value that axis takes, averages the
 * cell score and the cell score range over the cells with a positive average
 * score. Values backed by fewer than two such cells are omitted.
 *
 * @returns Rows sorted by ascending average range (most reliable first).
 */
function computeReliability(cells: Cell[]): ReliabilityRow[] {
  const rows: ReliabilityRow[] = [];

  for (const axis of AXIS_NAMES) {
    // A Map rather than a plain object: axis values are arbitrary strings, and
    // a value such as "constructor" would otherwise resolve to an inherited
    // Object.prototype member instead of a fresh group.
    const groups = new Map<string, { scores: number[]; ranges: number[] }>();

    for (const cell of cells) {
      if (!(cell.score.avg > 0)) continue;
      const val = String(
        (cell.meta as Record<string, unknown>)[axis] ?? "unknown"
      );
      let group = groups.get(val);
      if (!group) {
        group = { scores: [], ranges: [] };
        groups.set(val, group);
      }
      group.scores.push(cell.score.avg);
      group.ranges.push(cell.score.range);
    }

    for (const [val, { scores, ranges }] of groups) {
      if (scores.length < 2) continue;
      rows.push({
        axis,
        value: val,
        avgScore: mean(scores),
        avgRange: mean(ranges),
        n: scores.length,
      });
    }
  }

  return rows.sort((a, b) => a.avgRange - b.avgRange);
}

/** Traffic-light colour for an average within-cell range (<= 5% green, <= 12% yellow, else red). */
function reliabilityColor(avgRange: number): string {
  if (avgRange <= 0.05) return "var(--green)";
  if (avgRange <= 0.12) return "var(--yellow)";
  return "var(--red)";
}

/** Table of variable values ranked by how little their cells vary between re-runs. */
function ReliabilitySection({ cells }: { cells: Cell[] }) {
  const rows = useMemo(() => computeReliability(cells), [cells]);

  if (rows.length === 0) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        Not enough multi-run cells to compute reliability.
      </div>
    );
  }

  // Bars are scaled relative to the widest range; the 0.01 floor avoids 0/0.
  const maxRange = Math.max(...rows.map((r) => r.avgRange), 0.01);

  return (
    <div style={{ overflowX: "auto" }}>
      <table style={{ borderCollapse: "collapse", width: "100%" }}>
        <thead>
          <tr>
            {["VARIABLE", "VALUE", "AVG SCORE", "AVG RANGE", "RELIABILITY"].map(
              (h) => (
                <th key={h} style={thStyle}>
                  {h}
                </th>
              )
            )}
          </tr>
        </thead>
        <tbody>
          {rows.map((row, i) => {
            const barWidth = (row.avgRange / maxRange) * 100;
            const color = reliabilityColor(row.avgRange);
            return (
              <tr
                key={`${row.axis}-${row.value}`}
                style={{
                  borderBottom: "1px solid hsl(var(--border))",
                  background:
                    i % 2 === 0 ? "transparent" : "hsl(var(--border) / 0.1)",
                }}
              >
                <td style={tdStyle}>{AXIS_LABELS[row.axis] ?? row.axis}</td>
                <td style={{ ...tdStyle, fontFamily: "var(--font-mono)" }}>
                  {row.value}
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                  }}
                >
                  {(row.avgScore * 100).toFixed(1)}%
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                    color,
                  }}
                >
                  {(row.avgRange * 100).toFixed(1)}%
                </td>
                <td style={{ ...tdStyle, width: "200px" }}>
                  <div
                    style={{
                      position: "relative",
                      height: "12px",
                      background: "hsl(var(--border) / 0.2)",
                    }}
                  >
                    <div
                      style={{
                        position: "absolute",
                        top: 0,
                        left: 0,
                        height: "100%",
                        width: `${Math.max(barWidth, 1)}%`,
                        background: color,
                        opacity: 0.7,
                      }}
                    />
                  </div>
                </td>
              </tr>
            );
          })}
        </tbody>
      </table>
    </div>
  );
}

/* ---------- Section 3: Variance Contribution ---------- */

interface VarianceDecomp {
  totalVariance: number;
  withinVariance: number;
  betweenVariance: number;
  betweenPct: number;
  withinPct: number;
}

/**
 * Splits the variance of individual run scores into a within-cell part
 * (re-runs of one config disagreeing) and a between-cell part (configs
 * differing from each other).
 *
 * The within-cell part is the pooled variance: each cell's squared deviations
 * from its own mean, summed over all cells and divided by the total number of
 * scored runs. Weighting by cell size is what makes
 * `total = within + between` hold; an unweighted mean of per-cell variances
 * does not, and can even exceed the total. Cells with a single scored run
 * contribute zero within-cell spread.
 *
 * @returns The decomposition, with `betweenPct + withinPct === 1`, or `null`
 *   when there are fewer than two positive scores or no variance at all.
 */
function computeVarianceDecomp(
  runs: Run[],
  cells: Cell[]
): VarianceDecomp | null {
  // All individual run scores
  const allScores = runs
    .map((r) => r.eval_results?.score ?? null)
    .filter((s): s is number => s !== null && s > 0);

  if (allScores.length < 2) return null;

  const totalVar = variance(allScores);
  if (totalVar === 0) return null;

  // Sum of squared deviations of each run from its own cell mean.
  let withinSumSquares = 0;
  for (const cell of cells) {
    const scores = cell.runs
      .map((r) => r.eval_results?.score ?? null)
      .filter((s): s is number => s !== null && s > 0);
    if (scores.length >= 2) {
      withinSumSquares += variance(scores) * scores.length;
    }
  }

  // The clamp only matters if `cells` does not exactly partition `runs` (or for
  // floating-point noise); it keeps both shares inside [0, 1].
  const withinVar = Math.min(withinSumSquares / allScores.length, totalVar);
  const betweenVar = totalVar - withinVar;

  return {
    totalVariance: totalVar,
    withinVariance: withinVar,
    betweenVariance: betweenVar,
    betweenPct: betweenVar / totalVar,
    withinPct: withinVar / totalVar,
  };
}

/** Style for one segment of the stacked variance bar; `share` is a fraction in [0, 1]. */
function barSegmentStyle(
  share: number,
  background: string,
  opacity: number
): CSSProperties {
  return {
    width: `${share * 100}%`,
    background,
    opacity,
    display: "flex",
    alignItems: "center",
    justifyContent: "center",
    fontSize: "11px",
    fontFamily: "var(--font-mono)",
    color: "var(--text)",
    fontWeight: 600,
    minWidth: share > 0.08 ? undefined : "0px",
    overflow: "hidden",
    whiteSpace: "nowrap",
  };
}

/** One-sentence reading of the decomposition shown under the bar. */
function describeDecomp(decomp: VarianceDecomp): string {
  if (decomp.betweenPct >= 0.5) {
    return `Configuration choices explain ${(decomp.betweenPct * 100).toFixed(0)}% of score variance. The config matters more than run-to-run randomness.`;
  }
  if (decomp.betweenPct >= 0.3) {
    return `Configuration and randomness contribute roughly equally. Scores are moderately sensitive to config choices.`;
  }
  return `Run-to-run randomness dominates (${(decomp.withinPct * 100).toFixed(0)}%). Config choices have limited impact on scores -- results are noisy.`;
}

/** Stacked bar, legend and summary sentence for the variance decomposition. */
function VarianceSection({
  runs,
  cells,
}: {
  runs: Run[];
  cells: Cell[];
}) {
  const decomp = useMemo(
    () => computeVarianceDecomp(runs, cells),
    [runs, cells]
  );

  if (!decomp) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        Not enough data to decompose variance.
      </div>
    );
  }

  return (
    <div>
      <div
        style={{
          display: "flex",
          height: "32px",
          marginBottom: "12px",
          border: "1px solid hsl(var(--border))",
        }}
      >
        {/* Between-cell (config choices); the inline label is hidden on narrow segments */}
        <div style={barSegmentStyle(decomp.betweenPct, "var(--accent)", 0.7)}>
          {decomp.betweenPct > 0.08 &&
            `${(decomp.betweenPct * 100).toFixed(0)}%`}
        </div>
        {/* Within-cell (randomness) */}
        <div style={barSegmentStyle(decomp.withinPct, "var(--yellow)", 0.5)}>
          {decomp.withinPct > 0.08 &&
            `${(decomp.withinPct * 100).toFixed(0)}%`}
        </div>
      </div>

      <div
        style={{
          display: "flex",
          gap: "24px",
          flexWrap: "wrap",
        }}
      >
        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
          <div
            style={{
              width: "12px",
              height: "12px",
              background: "var(--accent)",
              opacity: 0.7,
            }}
          />
          <span style={legendStyle}>
            CONFIG CHOICES: {(decomp.betweenPct * 100).toFixed(0)}%
          </span>
        </div>
        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
          <div
            style={{
              width: "12px",
              height: "12px",
              background: "var(--yellow)",
              opacity: 0.5,
            }}
          />
          <span style={legendStyle}>
            RANDOMNESS: {(decomp.withinPct * 100).toFixed(0)}%
          </span>
        </div>
      </div>

      <p
        style={{
          marginTop: "12px",
          fontSize: "12px",
          color: "var(--text-muted)",
          lineHeight: "1.5",
        }}
      >
        {describeDecomp(decomp)}
      </p>
    </div>
  );
}

/* ---------- Main Component ---------- */

/**
 * Variability cards for the insights page.
 *
 * @param runs All benchmark runs; they are grouped into cells via `groupIntoCells`.
 */
export default function Variability({ runs }: VariabilityProps) {
  const cells = useMemo(() => groupIntoCells(runs), [runs]);

  if (runs.length === 0) {
    return (
      <div
        style={{
          ...cardStyle,
          textAlign: "center",
          padding: "40px",
          color: "var(--text-muted)",
        }}
      >
        No runs available for variability analysis.
      </div>
    );
  }

  return (
    <div>
      {/* Section 1: Box Plots */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>CONSISTENCY</div>
        <div style={sectionTitleStyle}>Score Distribution by Model</div>
        <p style={sectionDescStyle}>
          Each dot is a cell (unique config). The box spans Q1-Q3; the line
          marks the median. Tighter boxes mean more consistent results across
          configs.
        </p>
        <BoxPlotSection cells={cells} />
      </div>

      {/* Section 2: Reliability Ranking */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>RELIABILITY</div>
        <div style={sectionTitleStyle}>Reliability Ranking by Variable</div>
        <p style={sectionDescStyle}>
          How much do repeat runs of the same config vary? Sorted by average
          range (smallest = most reliable). Green means scores are consistent
          across re-runs; red means volatile.
        </p>
        <ReliabilitySection cells={cells} />
      </div>

      {/* Section 3: Variance Decomposition */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>VARIANCE</div>
        <div style={sectionTitleStyle}>Variance Contribution</div>
        <p style={sectionDescStyle}>
          ANOVA-style decomposition: how much of the total score variance comes
          from config choices (between cells) vs run-to-run randomness (within
          cells)?
        </p>
        <VarianceSection runs={runs} cells={cells} />
      </div>
    </div>
  );
}

/*
 * Companion change to the page that mounts this component, kept for reference
 * as a unified diff:
 *
 * diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
 * @@ -4,6 +4,7 @@ import { loadAllRuns } from "../lib/data";
 *  import Insights from "../components/Insights";
 *  import ScatterPlot from "../components/ScatterPlot";
 *  import Surprises from "../components/Surprises";
 * +import Variability from "../components/Variability";
 *
 *  const runs = loadAllRuns();
 *  ---
 * @@ -11,11 +12,15 @@ const runs = loadAllRuns();
 *  <Base title="Insights">
 *    <h1 style="margin-bottom: 8px;">Insights</h1>
 *    <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
 * -    Which variables move the needle? Where do weaker configs win?
 * +    Which variables move the needle? Where do weaker configs win? How consistent are the results?
 *    </p>
 *
 *    <Surprises client:load runs={runs} />
 *
 * +  <div style="margin-top: 32px;">
 * +    <Variability client:load runs={runs} />
 * +  </div>
 * +
 *    <div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
 *      <ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
 *      <ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />
 */

Impressum · Datenschutz