loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit f8ffb79af032ef659f675c9670f6d83092244df1
parent b1e91277b76e9c69c234d533f9f088d5b4e84d57
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 08:47:26 +0200

Grid: per-task summary with cells/runs/score/cost. Cell: variance stats. Box plots: model order fix.

Grid page:
- Replaced generic stat cards with per-task summary cards
- Shows: task name, cell count, run count, avg score, pass rate, avg cost
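- Cell-based aggregation: score and cost are averaged within each cell first,
  then across cells; pass rate is still computed over all runs of the task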

Cell detail page:
- Added variance statistics: range, spread (pp), std dev (pp), median,
  cost range, turns range
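- Coefficient of variation shown as CV% (relabeled from "range/avg"; the value
  is still computed as range / average score, not std dev / average)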

Variability box plots:
- Model order: haiku, opus, sonnet, now set by an explicit rank map instead of
  the default alphabetical sort; models not in the map are placed last

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mdashboard/src/components/CellDetail.tsx | 33++++++++++++++++++++++++++++++++-
Mdashboard/src/components/Variability.tsx | 4+++-
Mdashboard/src/pages/index.astro | 117+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
3 files changed, 122 insertions(+), 32 deletions(-)

diff --git a/dashboard/src/components/CellDetail.tsx b/dashboard/src/components/CellDetail.tsx @@ -178,6 +178,11 @@ export default function CellDetail({ runs, axisValues }: CellDetailProps) { const range = maxScore - minScore; const coefficientOfVariation = avgScore > 0 ? (range / avgScore) * 100 : 0; + const stdDev = scores.length > 1 + ? Math.sqrt(scores.reduce((sum, s) => sum + (s - avgScore) ** 2, 0) / (scores.length - 1)) + : 0; + const coefficientOfVariation = avgScore > 0 ? (range / avgScore) * 100 : 0; + let consistencyLabel: string; let consistencyColor: string; if (coefficientOfVariation < 10) { @@ -338,9 +343,35 @@ export default function CellDetail({ runs, axisValues }: CellDetailProps) { {consistencyLabel} </span> <span style={{ fontSize: "0.65rem", color: "var(--text-muted)" }}> - {coefficientOfVariation.toFixed(0)}% range/avg + {coefficientOfVariation.toFixed(0)}% CV </span> </div> + <div style={{ marginTop: "8px", fontSize: "0.65rem", display: "flex", flexDirection: "column", gap: "2px" }}> + <div style={{ display: "flex", justifyContent: "space-between" }}> + <span style={{ color: "var(--text-muted)" }}>Range</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{formatPct(minScore)} - {formatPct(maxScore)}</span> + </div> + <div style={{ display: "flex", justifyContent: "space-between" }}> + <span style={{ color: "var(--text-muted)" }}>Spread</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{(range * 100).toFixed(1)}pp</span> + </div> + <div style={{ display: "flex", justifyContent: "space-between" }}> + <span style={{ color: "var(--text-muted)" }}>Std Dev</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{(stdDev * 100).toFixed(1)}pp</span> + </div> + <div style={{ display: "flex", justifyContent: "space-between" }}> + <span style={{ color: "var(--text-muted)" }}>Median</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{formatPct(medianScore)}</span> + </div> + <div style={{ display: "flex", justifyContent: 
"space-between" }}> + <span style={{ color: "var(--text-muted)" }}>Cost range</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{formatCost(costs.length > 0 ? Math.min(...costs) : null)} - {formatCost(costs.length > 0 ? Math.max(...costs) : null)}</span> + </div> + <div style={{ display: "flex", justifyContent: "space-between" }}> + <span style={{ color: "var(--text-muted)" }}>Turns range</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{turnsList.length > 0 ? Math.min(...turnsList) : "-"} - {turnsList.length > 0 ? Math.max(...turnsList) : "-"}</span> + </div> + </div> </div> </div> diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx @@ -59,7 +59,9 @@ interface BoxPlotStats { } function computeBoxPlots(cells: Cell[]): BoxPlotStats[] { - const models = Array.from(new Set(cells.map((c) => c.meta.model))).sort(); + const MODEL_ORDER: Record<string, number> = { haiku: 1, opus: 2, sonnet: 3 }; + const models = Array.from(new Set(cells.map((c) => c.meta.model))) + .sort((a, b) => (MODEL_ORDER[a] || 99) - (MODEL_ORDER[b] || 99)); const results: BoxPlotStats[] = []; for (const model of models) { diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro @@ -1,54 +1,111 @@ --- import Base from "../layouts/Base.astro"; -import { loadAllRuns, getAxisValues, getTaskNames, aggregateRuns } from "../lib/data"; +import { loadAllRuns, getAxisValues, getTaskNames } from "../lib/data"; +import type { Run } from "../lib/types"; import Grid from "../components/Grid"; import Charts from "../components/Charts"; const runs = loadAllRuns(); const axisValues = getAxisValues(runs); const tasks = getTaskNames(runs); -const stats = aggregateRuns(runs); + +// Compute per-task cell-based stats +interface TaskSummary { + task: string; + cells: number; + runs: number; + avg_score: number | null; + pass_rate: number | null; + avg_cost: number | null; +} + +function computeTaskSummaries(runs: Run[]): TaskSummary[] { + 
const byTask: Record<string, Run[]> = {}; + for (const run of runs) { + (byTask[run.meta.task] ??= []).push(run); + } + + return Object.entries(byTask).map(([task, taskRuns]) => { + // Group into cells + const cells = new Map<string, Run[]>(); + for (const run of taskRuns) { + const id = run.meta.cell_id; + if (!cells.has(id)) cells.set(id, []); + cells.get(id)!.push(run); + } + + // Compute cell averages + const cellScores: number[] = []; + const cellCosts: number[] = []; + let totalPasses = 0; + + for (const [, cellRuns] of cells) { + const scores = cellRuns.map(r => r.eval_results?.score).filter((s): s is number => s != null); + if (scores.length > 0) cellScores.push(scores.reduce((a, b) => a + b, 0) / scores.length); + + const costs = cellRuns.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null); + if (costs.length > 0) cellCosts.push(costs.reduce((a, b) => a + b, 0) / costs.length); + + totalPasses += cellRuns.filter(r => r.eval_results?.functional?.pass === true).length; + } + + const avg = (arr: number[]) => arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null; + + return { + task, + cells: cells.size, + runs: taskRuns.length, + avg_score: avg(cellScores), + pass_rate: taskRuns.length > 0 ? totalPasses / taskRuns.length : null, + avg_cost: avg(cellCosts), + }; + }); +} + +const taskSummaries = computeTaskSummaries(runs); +const totalCells = new Set(runs.map(r => r.meta.cell_id)).size; --- <Base title="Grid Overview"> <h1 style="margin-bottom: 8px;">Benchmark Results</h1> - <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;"> + <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;"> Comparing agentic coding loop configurations across tasks and setups. 
</p> - <div class="stats-grid" style="margin-bottom: 32px;"> - <div class="stat-card"> - <div class="stat-value">{runs.length}</div> - <div class="stat-label">Total Runs</div> - </div> - <div class="stat-card"> - <div class="stat-value">{tasks.length}</div> - <div class="stat-label">Tasks</div> - </div> - <div class="stat-card"> - <div class="stat-value"> - {stats.avg_score != null ? (stats.avg_score * 100).toFixed(0) + "%" : "-"} - </div> - <div class="stat-label">Avg Score</div> - </div> - <div class="stat-card"> - <div class="stat-value"> - {stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-"} - </div> - <div class="stat-label">Pass Rate</div> - </div> - <div class="stat-card"> - <div class="stat-value"> - {stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-"} + {taskSummaries.map((ts) => ( + <div class="card" style="margin-bottom: 16px; padding: 16px;"> + <div style="display: flex; align-items: center; gap: 24px; flex-wrap: wrap;"> + <h3 style="margin: 0; min-width: 100px;">{ts.task}</h3> + <div style="display: flex; gap: 24px; font-size: 13px;"> + <div> + <span style="color: var(--text-muted);">cells </span> + <span style="font-weight: 600;">{ts.cells}</span> + </div> + <div> + <span style="color: var(--text-muted);">runs </span> + <span style="font-weight: 600;">{ts.runs}</span> + </div> + <div> + <span style="color: var(--text-muted);">avg score </span> + <span style="font-weight: 600;">{ts.avg_score != null ? (ts.avg_score * 100).toFixed(0) + "%" : "-"}</span> + </div> + <div> + <span style="color: var(--text-muted);">pass rate </span> + <span style="font-weight: 600;">{ts.pass_rate != null ? (ts.pass_rate * 100).toFixed(0) + "%" : "-"}</span> + </div> + <div> + <span style="color: var(--text-muted);">avg cost </span> + <span style="font-weight: 600;">{ts.avg_cost != null ? 
"$" + ts.avg_cost.toFixed(2) : "-"}</span> + </div> + </div> </div> - <div class="stat-label">Avg Cost</div> </div> - </div> + ))} <Charts client:load runs={runs} /> <div style="margin-top: 32px;"> - <h2 style="margin-bottom: 16px;">All Runs</h2> + <h2 style="margin-bottom: 16px;">All Cells</h2> <Grid client:load runs={runs} axisValues={axisValues} tasks={tasks} /> </div> </Base>

Impressum · Datenschutz