loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 2f6ed75dc4e056efff1ea561d015f298ff63592b
parent 76fb10ff9eca33d3209b81e8d567d35ca2689dd9
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 18:00:18 +0200

Add n= confidence indicators to Grid page

- Box plot: dashed/dimmed for models with <3 cells, tooltip shows n=
- Top/Bottom 10: n= per bar, row dimmed when <3 runs, dashed border for n=1 cells
- Grid table: n= per cell row, dimmed when <3 runs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx | 25 ++++++++++++++++++-------
M dashboard/src/components/Grid.tsx | 1 +
M dashboard/src/components/TopBottomConfigs.tsx | 30 +++++++++++++++++++++++++-----
3 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx @@ -26,6 +26,7 @@ interface BoxPlotData { q3: number; max: number; cellCount: number; + runCount: number; scores: number[]; // Derived fields for recharts stacked bar trick base: number; // invisible bar height = q1 @@ -161,14 +162,16 @@ function aggregateByModel(runs: Run[]): BoxPlotData[] { return sortedEntries.map(([model, modelCells]) => { const scores = modelCells.map((c) => Math.round(c.avgScore * 100)); + const totalRuns = modelCells.reduce((sum, c) => sum + c.runCount, 0); const stats = computeBoxStats(scores); const baseModel = model; return { - label: `${model}|(n=${modelCells.length})`, + label: `${model}|(n=${totalRuns})`, ...stats, base: stats.q1, iqr: stats.q3 - stats.q1, cellCount: modelCells.length, + runCount: totalRuns, scores, color: getModelColor(baseModel), }; @@ -184,7 +187,9 @@ function BoxPlotShape(props: any) { }; if (!payload || height === undefined) return null; - const { min, median, max, color } = payload; + const { min, median, max, color, cellCount } = payload; + const lowN = cellCount < 3; + const boxOpacity = lowN ? 0.4 : 1; // The bar is rendered from q1 (base) with height iqr (q3-q1). // y is the top of the bar (q3 in chart coords), y+height is the bottom (q1). 
const boxTop = y; @@ -206,15 +211,15 @@ function BoxPlotShape(props: any) { const whiskerHalfW = width * 0.3; return ( - <g> + <g opacity={boxOpacity}> {/* Whisker line: min to max */} <line x1={centerX} y1={minY} x2={centerX} y2={maxY} stroke={SMUI.muted} strokeWidth={1} /> {/* Min whisker cap */} <line x1={centerX - whiskerHalfW} y1={minY} x2={centerX + whiskerHalfW} y2={minY} stroke={SMUI.muted} strokeWidth={1} /> {/* Max whisker cap */} <line x1={centerX - whiskerHalfW} y1={maxY} x2={centerX + whiskerHalfW} y2={maxY} stroke={SMUI.muted} strokeWidth={1} /> - {/* Box (IQR) */} - <rect x={x} y={boxTop} width={width} height={Math.max(height, 1)} fill={color} fillOpacity={0.3} stroke={color} strokeWidth={1} /> + {/* Box (IQR) -- dashed stroke when low sample size */} + <rect x={x} y={boxTop} width={width} height={Math.max(height, 1)} fill={color} fillOpacity={0.3} stroke={color} strokeWidth={1} strokeDasharray={lowN ? "4 2" : undefined} /> {/* Median line */} <line x1={x} y1={medianY} x2={x + width} y2={medianY} stroke={color} strokeWidth={2} /> </g> @@ -230,7 +235,8 @@ function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean; const d = payload[0].payload; return ( <div style={TOOLTIP_STYLE}> - <div style={{ marginBottom: 4, fontWeight: 600 }}>{label}</div> + <div style={{ marginBottom: 4, fontWeight: 600 }}>{label?.split("|")[0]}</div> + <div style={{ marginBottom: 4, color: SMUI.muted, fontSize: 10 }}>n={d.runCount} runs across {d.cellCount} cells</div> <div>Max: {d.max}%</div> <div>Q3: {Math.round(d.q3)}%</div> <div>Median: {Math.round(d.median)}%</div> @@ -267,7 +273,12 @@ export default function Charts({ runs }: ChartsProps) { return ( <div className="card"> <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}> - <h3 style={{ margin: 0 }}>Score Distribution by Model</h3> + <div> + <h3 style={{ margin: 0 }}>Score Distribution by Model</h3> + <div 
style={{ fontSize: "10px", color: "var(--text-muted, hsl(213 14% 65%))", fontFamily: "'JetBrains Mono', monospace", marginTop: "2px" }}> + (n={filteredRuns.length} runs across {modelData.reduce((sum, d) => sum + d.cellCount, 0)} cells) + </div> + </div> <ModelSelector allModels={allModels} selectedModels={selectedModels} diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx @@ -230,6 +230,7 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { <td> <div style={{ fontSize: "0.75rem", display: "flex", alignItems: "center", gap: "6px" }}> <a href={`/c/${g.runs[0]?.meta.short_cell_id || g.cell_id}`} style={{ color: "var(--accent)", fontSize: "0.65rem", textTransform: "uppercase", letterSpacing: "0.5px", opacity: 0.7 }} title="View cell detail">cell</a> + <span style={{ color: "var(--text-muted)", fontSize: "0.6rem", fontFamily: "var(--font-mono)", opacity: g.runs.length < 3 ? 0.5 : 0.8 }}>n={g.runs.length}</span> {g.runs.map((r, i) => ( <span key={r.meta.run_id}> {i > 0 && " "} diff --git a/dashboard/src/components/TopBottomConfigs.tsx b/dashboard/src/components/TopBottomConfigs.tsx @@ -112,6 +112,8 @@ function BarRow({ const pct = Math.round(entry.avgScore * 100); const barWidth = maxScore > 0 ? (entry.avgScore / maxScore) * 100 : 0; const modelColor = getModelColor(entry.model); + const lowN = entry.runCount < 3; + const singleRun = entry.runCount === 1; return ( <div @@ -122,6 +124,7 @@ function BarRow({ marginBottom: 3, fontFamily: "'JetBrains Mono', monospace", fontSize: 11, + opacity: lowN ? 0.4 : 1, }} > {/* Bar */} @@ -153,7 +156,7 @@ function BarRow({ left: 0, height: "100%", width: `${barWidth}%`, - borderLeft: `2px solid ${accentColor}`, + borderLeft: singleRun ? 
`2px dashed ${accentColor}` : `2px solid ${accentColor}`, boxSizing: "border-box", }} /> @@ -172,6 +175,19 @@ function BarRow({ </span> </div> + {/* n= indicator */} + <span + style={{ + color: SMUI.muted, + fontSize: 9, + flexShrink: 0, + minWidth: 22, + fontFamily: "'JetBrains Mono', monospace", + }} + > + n={entry.runCount} + </span> + {/* Model name */} <span style={{ @@ -217,9 +233,9 @@ function BarRow({ } export default function TopBottomConfigs({ runs }: TopBottomConfigsProps) { - const { top10, bottom10 } = useMemo(() => { + const { top10, bottom10, totalCells, totalRuns } = useMemo(() => { const cells = groupIntoCells(runs); - if (cells.length === 0) return { top10: [], bottom10: [] }; + if (cells.length === 0) return { top10: [], bottom10: [], totalCells: 0, totalRuns: 0 }; const { varyingAxes, defaults } = computeDefaults(cells); @@ -237,8 +253,9 @@ export default function TopBottomConfigs({ runs }: TopBottomConfigsProps) { const top10 = entries.slice(0, 10); const bottom10 = entries.slice(-10).reverse(); // worst first (lowest at bottom) + const totalRuns = entries.reduce((sum, e) => sum + e.runCount, 0); - return { top10, bottom10 }; + return { top10, bottom10, totalCells: entries.length, totalRuns }; }, [runs]); if (top10.length === 0) { @@ -264,7 +281,10 @@ export default function TopBottomConfigs({ runs }: TopBottomConfigsProps) { return ( <div className="card"> - <h3 style={{ margin: "0 0 16px 0" }}>Best & Worst Configurations</h3> + <h3 style={{ margin: 0 }}>Best & Worst Configurations</h3> + <div style={{ fontSize: "10px", color: "var(--text-muted, hsl(213 14% 65%))", fontFamily: "'JetBrains Mono', monospace", marginTop: "2px", marginBottom: "16px" }}> + (n={totalRuns} runs across {totalCells} cells) + </div> <div style={{ display: "flex", gap: 24, flexWrap: "wrap" }}> {/* Top 10 */} <div style={{ flex: 1, minWidth: 200 }}>

Impressum · Datenschutz