loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 7925fb44e0bb06e82b6284b9aae10888b58a0128
parent c846bee44baf5e45fbb78d3b92494c10e718c9cb
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 17:14:54 +0200

Add variability violin chart to Compare page

Beeswarm-style chart showing CV% (coefficient of variation) per cell, grouped
by model: box plots overlaid with jittered dots. Uses the shared color palette.
Lower CV% = more consistent.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/VariabilityViolin.tsx | 398 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/compare.astro | 5 +++++
2 files changed, 403 insertions(+), 0 deletions(-)

diff --git a/dashboard/src/components/VariabilityViolin.tsx b/dashboard/src/components/VariabilityViolin.tsx @@ -0,0 +1,398 @@ +import { useMemo } from "react"; +import { getModelColor, modelSortOrder } from "../lib/colors"; +import { + ComposedChart, + XAxis, + YAxis, + CartesianGrid, + Tooltip, + ResponsiveContainer, + Scatter, + Cell, + ZAxis, + Bar, +} from "recharts"; +import type { Run } from "../lib/types"; +import { groupIntoCells } from "../lib/analysis"; + +interface VariabilityViolinProps { + runs: Run[]; +} + +const SMUI = { + surface0: "hsl(213 16% 12%)", + surface1: "hsl(217 16% 15.5%)", + surface2: "hsl(216 15% 19%)", + border: "hsl(217 17% 28%)", + muted: "hsl(213 14% 65%)", + frost1: "hsl(176 25% 65%)", + frost2: "hsl(193 44% 67%)", + frost3: "hsl(210 34% 63%)", + frost4: "hsl(213 32% 52%)", + green: "hsl(92 28% 65%)", + yellow: "hsl(40 71% 73%)", + red: "hsl(355 52% 64%)", + purple: "hsl(311 24% 63%)", +}; + +// Colors and sort order from shared palette + +const TOOLTIP_STYLE: React.CSSProperties = { + background: SMUI.surface1, + border: `1px solid ${SMUI.border}`, + borderRadius: "0", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + padding: "8px 12px", +}; + +interface ModelCVData { + label: string; + model: string; + cvValues: number[]; + min: number; + q1: number; + median: number; + q3: number; + max: number; + base: number; + iqr: number; + color: string; + cellCount: number; +} + +interface ScatterPoint { + label: string; + cv: number; + color: string; + jitter: number; +} + +function quantile(sorted: number[], q: number): number { + if (sorted.length === 0) return 0; + if (sorted.length === 1) return sorted[0]; + const pos = q * (sorted.length - 1); + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sorted[lo]; + return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]); +} + +function computeModelCV(runs: Run[]): ModelCVData[] { + const cells = groupIntoCells(runs); + + const byModel: 
Record<string, number[]> = {}; + + for (const cell of cells) { + if (cell.n < 2) continue; + const scores = cell.runs + .map((r) => r.eval_results?.score) + .filter((s): s is number => s != null); + if (scores.length < 2) continue; + const mean = scores.reduce((a, b) => a + b, 0) / scores.length; + if (mean === 0) continue; + const stdDev = Math.sqrt( + scores.reduce((s, v) => s + (v - mean) ** 2, 0) / scores.length + ); + const cv = (stdDev / mean) * 100; + const model = cell.meta.actual_model || cell.meta.model; + (byModel[model] ??= []).push(cv); + } + + const sortedEntries = Object.entries(byModel).sort( + ([a], [b]) => + modelSortOrder(a) - modelSortOrder(b) || a.localeCompare(b) + ); + + return sortedEntries.map(([model, cvs]) => { + const sorted = [...cvs].sort((a, b) => a - b); + const q1 = quantile(sorted, 0.25); + const q3 = quantile(sorted, 0.75); + return { + label: `${model}|(n=${cvs.length})`, + model, + cvValues: cvs, + min: sorted[0], + q1, + median: quantile(sorted, 0.5), + q3, + max: sorted[sorted.length - 1], + base: q1, + iqr: q3 - q1, + color: getModelColor(model), + cellCount: cvs.length, + }; + }); +} + +function buildScatterData(data: ModelCVData[]): ScatterPoint[] { + const points: ScatterPoint[] = []; + // Deterministic jitter based on index within each model group + for (const d of data) { + const sorted = [...d.cvValues].sort((a, b) => a - b); + for (let i = 0; i < sorted.length; i++) { + // Simple deterministic jitter: alternate sides, proportional to index + const side = i % 2 === 0 ? 
1 : -1; + const magnitude = ((i % 5) + 1) * 0.06; + points.push({ + label: d.label, + cv: sorted[i], + color: d.color, + jitter: side * magnitude, + }); + } + } + return points; +} + +// Custom shape for the box + whiskers +function CVBoxPlotShape(props: any) { + const { x, y, width, height, payload } = props as { + x: number; + y: number; + width: number; + height: number; + payload: ModelCVData; + }; + if (!payload || height === undefined) return null; + + const { min, median, max, q1, q3, color } = payload; + const boxTop = y; + const boxBottom = y + height; + const centerX = x + width / 2; + + const dataToY = (val: number): number => { + if (q3 === q1) return boxTop; + return boxTop + ((q3 - val) / (q3 - q1)) * (boxBottom - boxTop); + }; + + const minY = dataToY(min); + const maxY = dataToY(max); + const medianY = dataToY(median); + const whiskerHalfW = width * 0.3; + + return ( + <g> + {/* Whisker line: min to max */} + <line + x1={centerX} + y1={minY} + x2={centerX} + y2={maxY} + stroke={SMUI.muted} + strokeWidth={1} + /> + {/* Min whisker cap */} + <line + x1={centerX - whiskerHalfW} + y1={minY} + x2={centerX + whiskerHalfW} + y2={minY} + stroke={SMUI.muted} + strokeWidth={1} + /> + {/* Max whisker cap */} + <line + x1={centerX - whiskerHalfW} + y1={maxY} + x2={centerX + whiskerHalfW} + y2={maxY} + stroke={SMUI.muted} + strokeWidth={1} + /> + {/* Box (IQR) */} + <rect + x={x} + y={boxTop} + width={width} + height={Math.max(height, 1)} + fill={color} + fillOpacity={0.3} + stroke={color} + strokeWidth={1} + /> + {/* Median line */} + <line + x1={x} + y1={medianY} + x2={x + width} + y2={medianY} + stroke={color} + strokeWidth={2} + /> + </g> + ); +} + +function CVTooltipContent({ + active, + payload, +}: { + active?: boolean; + payload?: Array<{ payload: ModelCVData }>; + label?: string; +}) { + if (!active || !payload || payload.length === 0) return null; + const d = payload[0].payload; + if (!d.model) return null; + return ( + <div style={TOOLTIP_STYLE}> + 
<div style={{ marginBottom: 4, fontWeight: 600 }}>{d.model}</div> + <div>Cells: {d.cellCount}</div> + <div>Max CV: {d.max.toFixed(1)}%</div> + <div>Q3: {d.q3.toFixed(1)}%</div> + <div>Median: {d.median.toFixed(1)}%</div> + <div>Q1: {d.q1.toFixed(1)}%</div> + <div>Min CV: {d.min.toFixed(1)}%</div> + </div> + ); +} + +export default function VariabilityViolin({ runs }: VariabilityViolinProps) { + const data = useMemo(() => computeModelCV(runs), [runs]); + const scatterData = useMemo(() => buildScatterData(data), [data]); + + if (data.length === 0) { + return ( + <div + className="card" + style={{ + textAlign: "center", + padding: "40px", + color: SMUI.muted, + fontFamily: "'JetBrains Mono', monospace", + }} + > + Not enough multi-run cells to compute variability. + </div> + ); + } + + const maxCV = Math.max(...data.map((d) => d.max), 10); + const yMax = Math.ceil(maxCV / 10) * 10; + + return ( + <div className="card"> + <h3 + style={{ + marginBottom: "4px", + fontFamily: "'JetBrains Mono', monospace", + }} + > + Score Variability by Model (CV%) + </h3> + <p + style={{ + color: SMUI.muted, + fontSize: "11px", + fontFamily: "'JetBrains Mono', monospace", + marginBottom: "16px", + }} + > + Lower = more consistent. Each dot is one cell's coefficient of + variation. 
+ </p> + <ResponsiveContainer width="100%" height={320}> + <ComposedChart data={data} barCategoryGap="25%"> + <CartesianGrid + strokeDasharray="3 3" + stroke={SMUI.border} + vertical={false} + /> + <XAxis + dataKey="label" + stroke={SMUI.muted} + tickLine={false} + axisLine={{ stroke: SMUI.border }} + interval={0} + tick={({ x, y, payload }: any) => { + const [name, count] = (payload.value as string).split("|"); + return ( + <g> + <text + x={x} + y={y + 12} + textAnchor="middle" + fill={SMUI.muted} + fontSize={10} + fontFamily="'JetBrains Mono', monospace" + > + {name} + </text> + <text + x={x} + y={y + 24} + textAnchor="middle" + fill={SMUI.muted} + fontSize={8} + fontFamily="'JetBrains Mono', monospace" + opacity={0.6} + > + {count} + </text> + </g> + ); + }} + height={40} + /> + <YAxis + stroke={SMUI.muted} + fontSize={11} + fontFamily="'JetBrains Mono', monospace" + domain={[0, yMax]} + tickLine={false} + axisLine={false} + yAxisId="cv" + label={{ + value: "CV%", + angle: -90, + position: "insideLeft", + style: { + fill: SMUI.muted, + fontSize: 10, + fontFamily: "'JetBrains Mono', monospace", + }, + }} + /> + <Tooltip + content={<CVTooltipContent />} + cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} + /> + {/* Invisible base bar to push the visible box up to q1 */} + <Bar + dataKey="base" + stackId="box" + fill="transparent" + barSize={40} + yAxisId="cv" + /> + {/* Visible IQR box with custom shape for whiskers and median */} + <Bar + dataKey="iqr" + stackId="box" + barSize={40} + yAxisId="cv" + shape={<CVBoxPlotShape />} + > + {data.map((entry) => ( + <Cell key={entry.label} fill={entry.color} /> + ))} + </Bar> + {/* Jittered scatter dots for individual cell CV values */} + <Scatter + data={scatterData} + dataKey="cv" + yAxisId="cv" + fill={SMUI.frost2} + fillOpacity={0.6} + > + <ZAxis range={[50, 50]} /> + {scatterData.map((pt, i) => ( + <Cell key={i} fill={pt.color} fillOpacity={0.6} /> + ))} + </Scatter> + </ComposedChart> + </ResponsiveContainer> + </div> + ); 
+} diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro @@ -2,6 +2,7 @@ import Base from "../layouts/Base.astro"; import { loadAllRuns, getAxisValues, getTaskNames, AXIS_NAMES } from "../lib/data"; import type { Run, AxisName } from "../lib/data"; +import VariabilityViolin from "../components/VariabilityViolin"; const runs = loadAllRuns(); const axisValues = getAxisValues(runs); @@ -198,4 +199,8 @@ for (const axis of AXIS_NAMES) { </table> </div> )} + + <div style="margin-top: 24px;"> + <VariabilityViolin client:load runs={runs} /> + </div> </Base>

Impressum · Datenschutz