loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 9248c6ccdcb5b59ddbde631e68d2c952050f1d04
parent 2d012184ea2e786f842637b1f6501ccc3dd0802b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 18:06:24 +0200

Add n= confidence to Insights page

- Insights: overall n= subtitle with filtered counts
- Heatmap: dim cells with <3 runs, yellow n= label for low-n
- Variability: n= per model in box plots and reliability table,
  dim entries with <3 cells
- TornadoChart and ScatterPlot already had n= indicators

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Heatmap.tsx     | 16 +++++++++++++---
M dashboard/src/components/Insights.tsx    | 13 ++++++++++---
M dashboard/src/components/Variability.tsx | 31 ++++++++++++++++++++++++++-----
3 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/dashboard/src/components/Heatmap.tsx b/dashboard/src/components/Heatmap.tsx @@ -3,9 +3,11 @@ import type { InteractionResult } from "../lib/analysis"; interface HeatmapProps { data: InteractionResult; metric: string; + totalRuns?: number; + totalCells?: number; } -export default function Heatmap({ data, metric }: HeatmapProps) { +export default function Heatmap({ data, metric, totalRuns, totalCells }: HeatmapProps) { const { axisA, axisB, table } = data; const aValues = Object.keys(table).sort(); @@ -50,6 +52,11 @@ export default function Heatmap({ data, metric }: HeatmapProps) { <h3 style={{ marginBottom: "4px" }}> {axisA} x {axisB} </h3> + {totalRuns != null && totalCells != null && ( + <div style={{ fontSize: "10px", fontFamily: "'JetBrains Mono', monospace", color: "var(--text-muted, hsl(213 14% 65%))", marginBottom: "4px" }}> + (n={totalRuns} runs across {totalCells} cells) + </div> + )} <p style={{ color: "var(--text-muted)", @@ -128,6 +135,7 @@ export default function Heatmap({ data, metric }: HeatmapProps) { </td> ); } + const isLowN = cell.n < 3; return ( <td key={b} @@ -139,14 +147,16 @@ export default function Heatmap({ data, metric }: HeatmapProps) { fontSize: "0.8rem", fontWeight: 600, borderRadius: "2px", + opacity: isLowN ? 0.4 : 1, + ...(isLowN ? { borderStyle: "dashed", borderWidth: "1px", borderColor: "var(--text-muted)" } : {}), }} > {(cell.mean * 100).toFixed(0)}% <div style={{ fontSize: "0.6rem", - fontWeight: 400, - color: "var(--text-muted)", + color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)", + fontWeight: isLowN ? 
600 : 400, }} > n={cell.n} diff --git a/dashboard/src/components/Insights.tsx b/dashboard/src/components/Insights.tsx @@ -1,6 +1,6 @@ import { useState, useMemo } from "react"; import type { Run } from "../lib/types"; -import { computeMainEffects, computeInteraction } from "../lib/analysis"; +import { computeMainEffects, computeInteraction, groupIntoCells } from "../lib/analysis"; import { modelSortOrder } from "../lib/colors"; import TornadoChart from "./TornadoChart"; import Heatmap from "./Heatmap"; @@ -49,6 +49,8 @@ export default function Insights({ runs }: InsightsProps) { [filteredRuns, metric] ); + const filteredCells = useMemo(() => groupIntoCells(filteredRuns), [filteredRuns]); + // Auto-pick top 2 axes for interaction if not selected const topAxes = useMemo(() => effects.slice(0, 6).map((e) => e.axis), [effects]); @@ -61,6 +63,11 @@ export default function Insights({ runs }: InsightsProps) { return ( <div style={{ display: "flex", flexDirection: "column", gap: "24px" }}> + {/* Overall sample size subtitle */} + <div style={{ fontSize: "10px", fontFamily: "'JetBrains Mono', monospace", color: "var(--text-muted, hsl(213 14% 65%))" }}> + (n={filteredRuns.length} runs across {filteredCells.length} cells) + </div> + {/* Metric selector */} <div style={{ display: "flex", gap: "8px", alignItems: "center", flexWrap: "wrap" }}> <span style={{ fontSize: "0.8rem", color: "var(--text-muted)" }}> @@ -102,7 +109,7 @@ export default function Insights({ runs }: InsightsProps) { </div> {/* Tornado chart */} - <TornadoChart effects={effects} metric={metric} /> + <TornadoChart effects={effects} metric={metric} totalRuns={filteredRuns.length} totalCells={filteredCells.length} /> {/* Interaction explorer */} <div className="card"> @@ -138,7 +145,7 @@ export default function Insights({ runs }: InsightsProps) { </div> </div> - {interaction && <Heatmap data={interaction} metric={metric} />} + {interaction && <Heatmap data={interaction} metric={metric} 
totalRuns={filteredRuns.length} totalCells={filteredCells.length} />} </div> </div> ); diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx @@ -129,7 +129,9 @@ function BoxPlotSection({ cells }: { cells: Cell[] }) { <span style={axisLabelStyle}>{(globalMax * 100).toFixed(0)}%</span> </div> - {stats.map((s) => ( + {stats.map((s) => { + const isLowN = s.cellCount < 3; + return ( <div key={s.model} style={{ @@ -137,13 +139,14 @@ function BoxPlotSection({ cells }: { cells: Cell[] }) { alignItems: "center", marginBottom: "16px", gap: "12px", + opacity: isLowN ? 0.4 : 1, }} > {/* Label */} <div style={{ width: "120px", textAlign: "right", flexShrink: 0 }}> <div style={labelPrimaryStyle}>{s.model}</div> - <div style={labelSecondaryStyle}> - median {(s.median * 100).toFixed(1)}% / {s.cellCount} cells + <div style={{ ...labelSecondaryStyle, color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)" }}> + median {(s.median * 100).toFixed(1)}% / n={s.cellCount} cell{s.cellCount !== 1 ? "s" : ""} </div> </div> @@ -259,7 +262,8 @@ function BoxPlotSection({ cells }: { cells: Cell[] }) { ))} </div> </div> - ))} + ); + })} </div> ); } @@ -329,7 +333,7 @@ function ReliabilitySection({ cells }: { cells: Cell[] }) { <table style={{ borderCollapse: "collapse", width: "100%" }}> <thead> <tr> - {["VARIABLE", "VALUE", "AVG SCORE", "AVG RANGE", "RELIABILITY"].map( + {["VARIABLE", "VALUE", "N", "AVG SCORE", "AVG RANGE", "RELIABILITY"].map( (h) => ( <th key={h} style={thStyle}> {h} @@ -342,6 +346,7 @@ function ReliabilitySection({ cells }: { cells: Cell[] }) { {rows.map((row, i) => { const barWidth = (row.avgRange / maxRange) * 100; const color = reliabilityColor(row.avgRange); + const isLowN = row.n < 3; return ( <tr key={`${row.axis}-${row.value}`} @@ -349,6 +354,7 @@ function ReliabilitySection({ cells }: { cells: Cell[] }) { borderBottom: "1px solid hsl(var(--border))", background: i % 2 === 0 ? 
"transparent" : "hsl(var(--border) / 0.1)", + opacity: isLowN ? 0.4 : 1, }} > <td style={tdStyle}> @@ -362,6 +368,17 @@ function ReliabilitySection({ cells }: { cells: Cell[] }) { ...tdStyle, fontFamily: "var(--font-mono)", textAlign: "right", + color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)", + fontWeight: isLowN ? 600 : 400, + }} + > + {row.n} + </td> + <td + style={{ + ...tdStyle, + fontFamily: "var(--font-mono)", + textAlign: "right", }} > {(row.avgScore * 100).toFixed(1)}% @@ -687,6 +704,10 @@ export default function Variability({ runs }: VariabilityProps) { return ( <div> + {/* Overall sample size subtitle */} + <div style={{ fontSize: "10px", fontFamily: "'JetBrains Mono', monospace", color: "var(--text-muted, hsl(213 14% 65%))", marginBottom: "8px" }}> + (n={runs.length} runs across {cells.length} cells) + </div> {/* Section 1: Box Plots */} <div style={cardStyle}> <div style={sectionHeaderStyle}>CONSISTENCY</div>

Impressum · Datenschutz