loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit e9c7251cd07c133098a32de1b00898bc7ea79d3f
parent 4c5457fbc3c2f5ff52de70289b518e2f956800f4
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Wed,  8 Apr 2026 07:58:36 +0200

Add 95% CI bands, statistical power card, tornado CI whiskers

- Box plot: CI band overlay with mean dot, tooltip shows CI range
- Statistical Power card: avg CI width, detectable effect, color status
- Tornado: CI whiskers on effect bars, non-significant dimmed with "n.s."
- confidenceInterval() function with t-distribution for small samples

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M dashboard/src/components/Insights.tsx | 2 +-
A dashboard/src/components/StatisticalPowerCard.tsx | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/components/TornadoChart.tsx | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M dashboard/src/lib/analysis.ts | 38 ++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/index.astro | 3 +++
6 files changed, 402 insertions(+), 7 deletions(-)

diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx @@ -12,6 +12,7 @@ import { } from "recharts"; import type { Run } from "../lib/types"; import { getModelColor, modelSortOrder } from "../lib/colors"; +import { confidenceInterval } from "../lib/analysis"; import ModelSelector from "./ModelSelector"; interface ChartsProps { @@ -32,6 +33,10 @@ interface BoxPlotData { base: number; // invisible bar height = q1 iqr: number; // visible box height = q3 - q1 color: string; + // 95% confidence interval of the mean + ciMean: number; + ciLower: number; + ciUpper: number; } @@ -165,6 +170,7 @@ function aggregateByModel(runs: Run[]): BoxPlotData[] { const totalRuns = modelCells.reduce((sum, c) => sum + c.runCount, 0); const stats = computeBoxStats(scores); const baseModel = model; + const ci = confidenceInterval(scores); return { label: `${model}|(n=${totalRuns})`, ...stats, @@ -174,6 +180,9 @@ function aggregateByModel(runs: Run[]): BoxPlotData[] { runCount: totalRuns, scores, color: getModelColor(baseModel), + ciMean: ci.mean, + ciLower: ci.lower, + ciUpper: ci.upper, }; }); } @@ -187,7 +196,7 @@ function BoxPlotShape(props: any) { }; if (!payload || height === undefined) return null; - const { min, median, max, color, cellCount } = payload; + const { min, median, max, color, cellCount, ciLower, ciUpper, ciMean } = payload; const lowN = cellCount < 3; const boxOpacity = lowN ? 0.4 : 1; // The bar is rendered from q1 (base) with height iqr (q3-q1). @@ -222,6 +231,35 @@ function BoxPlotShape(props: any) { <rect x={x} y={boxTop} width={width} height={Math.max(height, 1)} fill={color} fillOpacity={0.3} stroke={color} strokeWidth={1} strokeDasharray={lowN ? 
"4 2" : undefined} /> {/* Median line */} <line x1={x} y1={medianY} x2={x + width} y2={medianY} stroke={color} strokeWidth={2} /> + {/* 95% CI band on the mean */} + {ciLower !== ciUpper && (() => { + const ciTopY = dataToY(Math.min(ciUpper, max)); + const ciBotY = dataToY(Math.max(ciLower, min)); + const ciMeanY = dataToY(ciMean); + const ciHalfW = width * 0.45; + return ( + <> + {/* Shaded CI band */} + <rect + x={centerX - ciHalfW} + y={ciTopY} + width={ciHalfW * 2} + height={Math.max(ciBotY - ciTopY, 1)} + fill={color} + fillOpacity={0.2} + stroke="none" + /> + {/* CI vertical line */} + <line x1={centerX} y1={ciTopY} x2={centerX} y2={ciBotY} stroke={color} strokeWidth={1.5} strokeDasharray="2 2" /> + {/* CI top cap */} + <line x1={centerX - 4} y1={ciTopY} x2={centerX + 4} y2={ciTopY} stroke={color} strokeWidth={1.5} /> + {/* CI bottom cap */} + <line x1={centerX - 4} y1={ciBotY} x2={centerX + 4} y2={ciBotY} stroke={color} strokeWidth={1.5} /> + {/* Mean dot */} + <circle cx={centerX} cy={ciMeanY} r={2.5} fill={color} stroke="none" /> + </> + ); + })()} </g> ); } @@ -242,6 +280,12 @@ function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean; <div>Median: {Math.round(d.median)}%</div> <div>Q1: {Math.round(d.q1)}%</div> <div>Min: {d.min}%</div> + {d.ciLower !== d.ciUpper && ( + <div style={{ marginTop: 4, borderTop: `1px solid ${SMUI.border}`, paddingTop: 4 }}> + <div style={{ color: SMUI.frost2 }}>Mean: {Math.round(d.ciMean)}%</div> + <div style={{ color: SMUI.frost2 }}>95% CI: [{Math.round(d.ciLower)}% - {Math.round(d.ciUpper)}%]</div> + </div> + )} </div> ); } @@ -270,6 +314,14 @@ export default function Charts({ runs }: ChartsProps) { const filteredRuns = runs.filter((r) => selectedModels.has(r.meta.actual_model || r.meta.model)); const modelData = aggregateByModel(filteredRuns); + const detectableDifference = useMemo(() => { + const ciWidths = modelData + .filter((d) => d.ciLower !== d.ciUpper) + .map((d) => d.ciUpper - d.ciLower); + if 
(ciWidths.length === 0) return null; + return Math.round(Math.max(...ciWidths)); + }, [modelData]); + return ( <div className="card"> <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}> @@ -327,6 +379,17 @@ export default function Charts({ runs }: ChartsProps) { <Scatter data={[]} dataKey="score" yAxisId="score" fill="transparent" /> </ComposedChart> </ResponsiveContainer> + {detectableDifference != null && ( + <div style={{ + fontSize: "10px", + fontFamily: "'JetBrains Mono', monospace", + color: SMUI.muted, + marginTop: "4px", + textAlign: "center", + }}> + Detectable difference: differences of ±{detectableDifference}% are statistically significant with current data + </div> + )} </div> ); } diff --git a/dashboard/src/components/Insights.tsx b/dashboard/src/components/Insights.tsx @@ -109,7 +109,7 @@ export default function Insights({ runs }: InsightsProps) { </div> {/* Tornado chart */} - <TornadoChart effects={effects} metric={metric} totalRuns={filteredRuns.length} totalCells={filteredCells.length} /> + <TornadoChart effects={effects} metric={metric} totalRuns={filteredRuns.length} totalCells={filteredCells.length} runs={filteredRuns} /> {/* Interaction explorer */} <div className="card"> diff --git a/dashboard/src/components/StatisticalPowerCard.tsx b/dashboard/src/components/StatisticalPowerCard.tsx @@ -0,0 +1,137 @@ +import { useMemo } from "react"; +import type { Run } from "../lib/types"; +import { groupIntoCells, confidenceInterval } from "../lib/analysis"; + +interface StatisticalPowerCardProps { + runs: Run[]; +} + +const SMUI = { + surface1: "hsl(217 16% 15.5%)", + surface2: "hsl(216 15% 19%)", + border: "hsl(217 17% 28%)", + muted: "hsl(213 14% 65%)", + green: "hsl(92 28% 65%)", + yellow: "hsl(40 71% 73%)", + red: "hsl(355 52% 64%)", +}; + +export default function StatisticalPowerCard({ runs }: StatisticalPowerCardProps) { + const stats = useMemo(() => { + 
const cells = groupIntoCells(runs); + const totalRuns = runs.length; + const totalCells = cells.length; + if (totalCells === 0) return null; + + const avgRunsPerCell = totalRuns / totalCells; + + // Compute CI width for each cell with 3+ runs + const ciWidths: number[] = []; + for (const cell of cells) { + const scores = cell.runs + .map((r) => r.eval_results?.score) + .filter((s): s is number => s != null) + .map((s) => s * 100); + if (scores.length >= 2) { + const ci = confidenceInterval(scores); + const width = ci.upper - ci.lower; + if (isFinite(width)) ciWidths.push(width); + } + } + + const avgCiWidth = ciWidths.length > 0 + ? ciWidths.reduce((a, b) => a + b, 0) / ciWidths.length + : null; + + // Minimum detectable effect = largest CI half-width across cells + const minDetectable = ciWidths.length > 0 + ? Math.max(...ciWidths) / 2 + : null; + + return { totalRuns, totalCells, avgRunsPerCell, avgCiWidth, minDetectable }; + }, [runs]); + + if (!stats) return null; + + const { totalRuns, totalCells, avgRunsPerCell, avgCiWidth, minDetectable } = stats; + + // Color indicator based on average CI width + let statusColor = SMUI.green; + let statusLabel = "Strong"; + if (avgCiWidth == null) { + statusColor = SMUI.muted; + statusLabel = "Insufficient data"; + } else if (avgCiWidth > 10) { + statusColor = SMUI.red; + statusLabel = "Low power"; + } else if (avgCiWidth > 5) { + statusColor = SMUI.yellow; + statusLabel = "Moderate"; + } + + return ( + <div + className="card" + style={{ + padding: "16px", + marginBottom: "16px", + }} + > + <div style={{ display: "flex", alignItems: "center", gap: "12px", marginBottom: "12px" }}> + <h3 style={{ margin: 0 }}>Statistical Power</h3> + <span + style={{ + fontSize: "10px", + fontFamily: "'JetBrains Mono', monospace", + fontWeight: 600, + color: statusColor, + border: `1px solid ${statusColor}`, + padding: "2px 8px", + letterSpacing: "0.5px", + textTransform: "uppercase", + }} + > + {statusLabel} + </span> + </div> + <div + 
style={{ + display: "flex", + gap: "24px", + flexWrap: "wrap", + fontSize: "13px", + fontFamily: "'JetBrains Mono', monospace", + }} + > + <div> + <span style={{ color: SMUI.muted }}>runs </span> + <span style={{ fontWeight: 600 }}>{totalRuns}</span> + </div> + <div> + <span style={{ color: SMUI.muted }}>cells </span> + <span style={{ fontWeight: 600 }}>{totalCells}</span> + </div> + <div> + <span style={{ color: SMUI.muted }}>avg runs/cell </span> + <span style={{ fontWeight: 600 }}>{avgRunsPerCell.toFixed(1)}</span> + </div> + {avgCiWidth != null && ( + <div> + <span style={{ color: SMUI.muted }}>avg 95% CI </span> + <span style={{ fontWeight: 600, color: statusColor }}> + ±{(avgCiWidth / 2).toFixed(1)}% + </span> + </div> + )} + {minDetectable != null && ( + <div> + <span style={{ color: SMUI.muted }}>min detectable effect </span> + <span style={{ fontWeight: 600, color: statusColor }}> + ±{minDetectable.toFixed(1)}% + </span> + </div> + )} + </div> + </div> + ); +} diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx @@ -1,10 +1,13 @@ +import type { Run } from "../lib/types"; import type { AxisEffect } from "../lib/analysis"; +import { groupIntoCells, confidenceInterval } from "../lib/analysis"; interface TornadoChartProps { effects: AxisEffect[]; metric: string; totalRuns?: number; totalCells?: number; + runs?: Run[]; } const AXIS_LABELS: Record<string, string> = { @@ -33,7 +36,90 @@ const AXIS_LABELS: Record<string, string> = { provider: "Provider", }; -export default function TornadoChart({ effects, metric, totalRuns, totalCells }: TornadoChartProps) { +// Metric extractors matching analysis.ts +const METRIC_EXTRACTORS: Record<string, (r: Run) => number | null> = { + score: (r) => r.eval_results?.score ?? null, + cost: (r) => r.claude_output?.total_cost_usd ?? null, + turns: (r) => r.claude_output?.num_turns ?? null, + wall_time: (r) => r.meta.wall_time_seconds ?? 
null, + gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null, + code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null, + structural: (r) => r.eval_results?.structural?.score ?? null, + quality: (r) => r.eval_results?.quality?.score ?? null, + transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null, + sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null, +}; + +const SKIP_KEYS = new Set([ + "task", "cell_id", "run_id", "run_number", "runs_per_cell", + "max_budget_usd", "timeout_seconds", "base_tools", "started_at", + "completed_at", "wall_time_seconds", "exit_code", "short_id", + "short_cell_id", "claude_version", "sub_agents", "actual_model", +]); + +interface EffectCI { + ciLower: number; + ciUpper: number; + crossesZero: boolean; +} + +function computeEffectCIs( + runs: Run[], + metric: string +): Record<string, Record<string, EffectCI>> { + const extract = METRIC_EXTRACTORS[metric]; + if (!extract || runs.length === 0) return {}; + + const cells = groupIntoCells(runs); + + // Get per-cell averages + const cellData: Array<{ meta: Run["meta"]; avg: number }> = []; + for (const cell of cells) { + const vals: number[] = []; + for (const run of cell.runs) { + const v = extract(run); + if (v !== null) vals.push(v); + } + if (vals.length === 0) continue; + cellData.push({ meta: cell.meta, avg: vals.reduce((a, b) => a + b, 0) / vals.length }); + } + + if (cellData.length === 0) return {}; + + const grandMean = cellData.reduce((s, c) => s + c.avg, 0) / cellData.length; + + const axisKeys = Object.keys(cellData[0].meta).filter((k) => !SKIP_KEYS.has(k)); + const result: Record<string, Record<string, EffectCI>> = {}; + + for (const axis of axisKeys) { + const groups: Record<string, number[]> = {}; + for (const { meta, avg } of cellData) { + const key = String((meta as Record<string, unknown>)[axis] ?? 
"unknown"); + (groups[key] ??= []).push(avg); + } + + result[axis] = {}; + for (const [val, avgs] of Object.entries(groups)) { + if (avgs.length < 2) { + result[axis][val] = { ciLower: 0, ciUpper: 0, crossesZero: true }; + continue; + } + // Compute CI of the effect (mean - grandMean) + const ci = confidenceInterval(avgs); + const effectLower = ci.lower - grandMean; + const effectUpper = ci.upper - grandMean; + result[axis][val] = { + ciLower: effectLower, + ciUpper: effectUpper, + crossesZero: effectLower <= 0 && effectUpper >= 0, + }; + } + } + + return result; +} + +export default function TornadoChart({ effects, metric, totalRuns, totalCells, runs }: TornadoChartProps) { if (effects.length === 0) { return ( <div @@ -50,11 +136,18 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }: ); } - // Scale must account for variance bands extending beyond effect bars + // Compute CIs if runs are provided + const effectCIs = runs ? computeEffectCIs(runs, metric) : {}; + + // Scale must account for variance bands and CI whiskers extending beyond effect bars + const ciExtents = Object.values(effectCIs).flatMap((axisCIs) => + Object.values(axisCIs).map((ci) => Math.max(Math.abs(ci.ciLower), Math.abs(ci.ciUpper))) + ); const maxExtent = Math.max( ...effects.flatMap((e) => e.values.map((v) => Math.abs(v.effect) + v.variance) - ) + ), + ...ciExtents ); const scale = maxExtent > 0 ? 200 / maxExtent : 1; @@ -109,11 +202,22 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }: }} > {effect.values.map((entry) => { + const ci = effectCIs[effect.axis]?.[entry.value]; + const crossesZero = ci?.crossesZero ?? true; const effectWidth = Math.abs(entry.effect) * scale; const varianceBandWidth = (Math.abs(entry.effect) + entry.variance) * scale; const isPositive = entry.effect >= 0; const isLowN = entry.n < 3; + // CI whisker positions (in px from left edge of bar area) + const ciLowerPx = ci ? 
Math.abs(ci.ciLower) * scale : 0; + const ciUpperPx = ci ? Math.abs(ci.ciUpper) * scale : 0; + // For the whisker, we show the full CI extent + const ciMaxPx = ci ? Math.max(ciLowerPx, ciUpperPx) : 0; + const ciMinPx = ci ? Math.min(ciLowerPx, ciUpperPx) : 0; + // Dim bars where CI crosses zero (effect not significant) + const notSignificant = ci && crossesZero && !isLowN; + const barContainerWidth = Math.max(varianceBandWidth, effectWidth, ciMaxPx, 2); return ( <div key={entry.value} @@ -121,7 +225,7 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }: display: "flex", alignItems: "center", gap: "8px", - opacity: isLowN ? 0.4 : 1, + opacity: isLowN ? 0.4 : notSignificant ? 0.5 : 1, }} > <div @@ -140,7 +244,7 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }: style={{ position: "relative", height: "16px", - width: `${Math.max(varianceBandWidth, effectWidth, 2)}px`, + width: `${barContainerWidth}px`, }} > {/* Variance band (behind, wider, semi-transparent) */} @@ -176,6 +280,44 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }: ...(isLowN ? { borderStyle: "dashed", borderWidth: "1px", borderColor: isPositive ? 
"var(--green)" : "var(--red)" } : {}), }} /> + {/* CI whisker */} + {ci && !isLowN && ciMaxPx > 0 && ( + <> + {/* Whisker line */} + <div + style={{ + position: "absolute", + top: "7px", + left: `${ciMinPx}px`, + width: `${Math.max(ciMaxPx - ciMinPx, 1)}px`, + height: "2px", + background: "var(--text-muted)", + }} + /> + {/* Left cap */} + <div + style={{ + position: "absolute", + top: "4px", + left: `${ciMinPx}px`, + width: "1px", + height: "8px", + background: "var(--text-muted)", + }} + /> + {/* Right cap */} + <div + style={{ + position: "absolute", + top: "4px", + left: `${ciMaxPx}px`, + width: "1px", + height: "8px", + background: "var(--text-muted)", + }} + /> + </> + )} </div> <div style={{ @@ -197,6 +339,18 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }: ±{(entry.variance * 100).toFixed(1)}% </span> )} + {ci && !isLowN && ( + <span + style={{ + color: crossesZero ? "var(--yellow)" : "var(--text-muted)", + marginLeft: "4px", + fontSize: "0.6rem", + }} + > + CI [{(ci.ciLower * 100).toFixed(1)}, {(ci.ciUpper * 100).toFixed(1)}] + {crossesZero ? " n.s." : ""} + </span> + )} </div> <div style={{ diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts @@ -1,6 +1,44 @@ import type { Run, AxisName } from "./types"; import { AXIS_NAMES } from "./types"; +export interface ConfidenceIntervalResult { + mean: number; + ci: number; + lower: number; + upper: number; + n: number; +} + +export function confidenceInterval( + values: number[], + confidence = 0.95 +): ConfidenceIntervalResult { + const n = values.length; + if (n < 2) { + const mean = n === 1 ? 
values[0] : 0; + return { mean, ci: 0, lower: mean, upper: mean, n }; + } + const mean = values.reduce((a, b) => a + b, 0) / n; + const stdDev = Math.sqrt( + values.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1) + ); + // t-value approximation for 95% CI (exact for small n, 1.96 for large n) + const tValues: Record<number, number> = { + 2: 12.71, + 3: 4.3, + 4: 3.18, + 5: 2.78, + 6: 2.57, + 7: 2.45, + 8: 2.36, + 9: 2.31, + 10: 2.26, + }; + const t = tValues[n] ?? (n > 30 ? 1.96 : 2.0); + const ci = t * stdDev / Math.sqrt(n); + return { mean, ci, lower: mean - ci, upper: mean + ci, n }; +} + export interface Cell { cell_id: string; runs: Run[]; diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro @@ -5,6 +5,7 @@ import type { Run } from "../lib/types"; import Grid from "../components/Grid"; import Charts from "../components/Charts"; import TopBottomConfigs from "../components/TopBottomConfigs"; +import StatisticalPowerCard from "../components/StatisticalPowerCard"; const runs = loadAllRuns(); const axisValues = getAxisValues(runs); @@ -103,6 +104,8 @@ const totalCells = new Set(runs.map(r => r.meta.cell_id)).size; </div> ))} + <StatisticalPowerCard client:load runs={runs} /> + <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;"> <Charts client:load runs={runs} /> <TopBottomConfigs client:load runs={runs} />

Impressum · Datenschutz