loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable at a time: what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 43d776d66e46c1b135bd5b3b01aac17ded5ee2e5
parent f944fc2552eac32747f26c24e00b9d8f8cc7829f
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 09:04:27 +0200

Add scatter plots and surprise detector to insights page

Scatter plots:
- Cost vs Score: shows efficiency frontier, colored by model
- Turns vs Score: shows iteration efficiency

Surprise detector:
- Finds cases where weaker models beat stronger ones
- Finds cases where simple prompts beat detailed ones
- Cards with yellow left border, sorted by magnitude
- Shows both sides with score comparison

All 67 runs now have full eval results (code analysis, transcript
analysis, gameplay bot).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/ScatterPlot.tsx | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A dashboard/src/components/Surprises.tsx | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/insights.astro | 18 ++++++++++++++----
3 files changed, 304 insertions(+), 4 deletions(-)

diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -0,0 +1,122 @@
+import {
+  ScatterChart,
+  Scatter,
+  XAxis,
+  YAxis,
+  CartesianGrid,
+  Tooltip,
+  ResponsiveContainer,
+  Legend,
+} from "recharts";
+import type { Run } from "../lib/data";
+
+interface ScatterPlotProps {
+  runs: Run[];
+  xMetric: string; // key into METRIC_CONFIG for the horizontal axis
+  yMetric: string; // key into METRIC_CONFIG for the vertical axis
+}
+/** Per-metric axis label, value extractor (null when the run lacks the value) and tick/tooltip formatter. */
+const METRIC_CONFIG: Record<string, { label: string; extract: (r: Run) => number | null; format: (v: number) => string }> = {
+  cost: {
+    label: "Cost ($)",
+    extract: (r) => r.claude_output?.total_cost_usd ?? null,
+    format: (v) => `$${v.toFixed(2)}`,
+  },
+  score: {
+    label: "Score (%)",
+    extract: (r) => r.eval_results?.score != null ? r.eval_results.score * 100 : null, // scaled x100 to match the "%" label
+    format: (v) => `${v.toFixed(0)}%`,
+  },
+  turns: {
+    label: "Turns",
+    extract: (r) => r.claude_output?.num_turns ?? null,
+    format: (v) => `${v}`,
+  },
+  wall_time: {
+    label: "Time (s)",
+    extract: (r) => r.meta.wall_time_seconds ?? null,
+    format: (v) => `${v}s`,
+  },
+};
+/** Fill colour per model name; models not listed here get the fallback fill given on <Scatter> below. */
+const MODEL_COLORS: Record<string, string> = {
+  haiku: "hsl(193 44% 67%)", // frost cyan
+  sonnet: "hsl(40 71% 73%)", // aurora yellow
+  opus: "hsl(311 24% 63%)", // aurora purple
+};
+/** Scatter chart of one metric against another, one series per model; renders nothing if either metric key is unknown. */
+export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps) {
+  const xConf = METRIC_CONFIG[xMetric];
+  const yConf = METRIC_CONFIG[yMetric];
+  if (!xConf || !yConf) return null;
+
+  // Group plottable points by model so that each model becomes its own series
+  const byModel: Record<string, Array<{ x: number; y: number; run_id: string; prompt: string }>> = {};
+
+  for (const run of runs) {
+    const x = xConf.extract(run);
+    const y = yConf.extract(run);
+    if (x === null || y === null) continue; // skip runs missing either metric
+
+    const model = run.meta.model;
+    if (!byModel[model]) byModel[model] = [];
+    byModel[model].push({
+      x,
+      y,
+      run_id: run.meta.run_id,
+      prompt: run.meta.prompt_style,
+    });
+  }
+
+  const models = Object.keys(byModel).sort(); // alphabetical series order
+
+  return (
+    <div className="card">
+      <h3 style={{ marginBottom: "16px" }}>
+        {xConf.label} vs {yConf.label}
+      </h3>
+      <ResponsiveContainer width="100%" height={350}>
+        <ScatterChart margin={{ top: 10, right: 20, bottom: 10, left: 10 }}>
+          <CartesianGrid strokeDasharray="3 3" stroke="hsl(217 17% 28%)" />
+          <XAxis
+            dataKey="x"
+            name={xConf.label}
+            stroke="hsl(213 14% 65%)"
+            fontSize={11}
+            tickFormatter={(v) => xConf.format(v)}
+          />
+          <YAxis
+            dataKey="y"
+            name={yConf.label}
+            stroke="hsl(213 14% 65%)"
+            fontSize={11}
+            tickFormatter={(v) => yConf.format(v)}
+          />
+          <Tooltip
+            contentStyle={{
+              background: "hsl(217 16% 15.5%)",
+              border: "1px solid hsl(217 17% 28%)",
+              borderRadius: "2px",
+              fontFamily: "'JetBrains Mono', monospace",
+              fontSize: "11px",
+            }}
+            formatter={(value: number, name: string) => {
+              if (name === xConf.label) return [xConf.format(value), name]; // NOTE(review): assumes Recharts passes the axis `name` prop here — confirm
+              if (name === yConf.label) return [yConf.format(value), name]; // NOTE(review): unreachable when both axes share a label (same metric on x and y)
+              return [value, name]; // any other name is passed through unformatted
+            }}
+          />
+          <Legend />
+          {models.map((model) => (
+            <Scatter
+              key={model}
+              name={model}
+              data={byModel[model]}
+              fill={MODEL_COLORS[model] || "hsl(213 14% 65%)"}
+            />
+          ))}
+        </ScatterChart>
+      </ResponsiveContainer>
+    </div>
+  );
+}
diff --git a/dashboard/src/components/Surprises.tsx b/dashboard/src/components/Surprises.tsx
@@ -0,0 +1,168 @@
+import type { Run } from "../lib/data";
+
+interface SurprisesProps {
+  runs: Run[];
+}
+/** One upset: the side expected to lose (`weaker`) out-scored `stronger` by `magnitude` (difference of average scores). */
+interface Surprise {
+  title: string;
+  detail: string;
+  weaker: { model: string; config: string; score: number; cost: number };
+  stronger: { model: string; config: string; score: number; cost: number };
+  magnitude: number;
+}
+/** Assumed capability ordering of models (higher = stronger); findSurprises ranks unlisted models as 0. */
+const MODEL_RANK: Record<string, number> = {
+  haiku: 1,
+  sonnet: 2,
+  opus: 3,
+};
+/** Finds model-vs-model and simple-vs-detailed-prompt upsets in `runs`, returned with the largest score gap first. */
+function findSurprises(runs: Run[]): Surprise[] {
+  const surprises: Surprise[] = [];
+
+  // Group runs by config (every listed setting except the model), so groups differ only in model
+  const configGroups: Record<string, Run[]> = {};
+  for (const run of runs) {
+    if (run.eval_results?.score == null) continue; // unscored runs cannot be compared
+    // Build config key without model
+    const m = run.meta;
+    const key = [
+      m.prompt_style, m.language, m.effort,
+      m.linter, m.playwright, m.context_file,
+      m.sub_agents, m.web_search, m.max_budget,
+    ].join("|");
+    (configGroups[key] ??= []).push(run);
+  }
+
+  // Within each config group, compare the average score of every pair of models
+  for (const [, group] of Object.entries(configGroups)) {
+    const byModel: Record<string, { scores: number[]; costs: number[] }> = {};
+    for (const run of group) {
+      const model = run.meta.model;
+      if (!byModel[model]) byModel[model] = { scores: [], costs: [] };
+      if (run.eval_results?.score != null) { // always true here: unscored runs were skipped when grouping
+        byModel[model].scores.push(run.eval_results.score);
+      }
+      if (run.claude_output?.total_cost_usd != null) {
+        byModel[model].costs.push(run.claude_output.total_cost_usd);
+      }
+    }
+
+    const models = Object.keys(byModel);
+    for (let i = 0; i < models.length; i++) {
+      for (let j = i + 1; j < models.length; j++) {
+        const a = models[i];
+        const b = models[j];
+        const rankA = MODEL_RANK[a] || 0; // NOTE(review): an unlisted model gets 0, i.e. is treated as weaker than every listed model
+        const rankB = MODEL_RANK[b] || 0;
+
+        const avgScoreA = byModel[a].scores.reduce((s, v) => s + v, 0) / byModel[a].scores.length; // scores is non-empty: every grouped run has a score
+        const avgScoreB = byModel[b].scores.reduce((s, v) => s + v, 0) / byModel[b].scores.length;
+        const avgCostA = byModel[a].costs.length > 0 ? byModel[a].costs.reduce((s, v) => s + v, 0) / byModel[a].costs.length : 0; // NOTE(review): 0 also stands for "no cost reported"
+        const avgCostB = byModel[b].costs.length > 0 ? byModel[b].costs.reduce((s, v) => s + v, 0) / byModel[b].costs.length : 0;
+
+        // Surprise: the lower-ranked model's average score is higher by more than 0.02
+        if (rankA < rankB && avgScoreA > avgScoreB + 0.02) {
+          surprises.push({
+            title: `${a} beat ${b}`,
+            detail: `${a} scored ${(avgScoreA * 100).toFixed(0)}% vs ${b} at ${(avgScoreB * 100).toFixed(0)}%`,
+            weaker: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA }, // `config` records only the prompt style, not the full config key
+            stronger: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB },
+            magnitude: avgScoreA - avgScoreB,
+          });
+        } else if (rankB < rankA && avgScoreB > avgScoreA + 0.02) { // mirror case: b is the lower-ranked model
+          surprises.push({
+            title: `${b} beat ${a}`,
+            detail: `${b} scored ${(avgScoreB * 100).toFixed(0)}% vs ${a} at ${(avgScoreA * 100).toFixed(0)}%`,
+            weaker: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB },
+            stronger: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA },
+            magnitude: avgScoreB - avgScoreA,
+          });
+        }
+      }
+    }
+  }
+
+  // Also find cases where simple prompt beats detailed, holding the model fixed
+  const promptGroups: Record<string, Run[]> = {};
+  for (const run of runs) {
+    if (run.eval_results?.score == null) continue;
+    const m = run.meta;
+    const key = [ // NOTE(review): unlike configGroups, omits sub_agents, web_search and max_budget, so runs differing in those are averaged together — confirm intended
+      m.model, m.language, m.effort,
+      m.linter, m.playwright, m.context_file,
+    ].join("|");
+    (promptGroups[key] ??= []).push(run);
+  }
+
+  for (const [, group] of Object.entries(promptGroups)) {
+    const byPrompt: Record<string, number[]> = {};
+    for (const run of group) {
+      (byPrompt[run.meta.prompt_style] ??= []).push(run.eval_results!.score!); // non-null: unscored runs were skipped when grouping
+    }
+    if (byPrompt.simple && byPrompt.detailed) { // only groups that contain both prompt styles
+      const avgSimple = byPrompt.simple.reduce((a, b) => a + b, 0) / byPrompt.simple.length;
+      const avgDetailed = byPrompt.detailed.reduce((a, b) => a + b, 0) / byPrompt.detailed.length;
+      if (avgSimple > avgDetailed + 0.05) { // stricter threshold (0.05) than the 0.02 used for model upsets
+        surprises.push({
+          title: "Simple prompt beat detailed",
+          detail: `${group[0].meta.model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`,
+          weaker: { model: group[0].meta.model, config: "simple", score: avgSimple, cost: 0 }, // NOTE(review): cost is not computed for prompt comparisons; 0 is a placeholder
+          stronger: { model: group[0].meta.model, config: "detailed", score: avgDetailed, cost: 0 },
+          magnitude: avgSimple - avgDetailed,
+        });
+      }
+    }
+  }
+
+  return surprises.sort((a, b) => b.magnitude - a.magnitude); // largest gap first
+}
+/** Card grid of the surprises found in `runs`, or a placeholder card when none are found. */
+export default function Surprises({ runs }: SurprisesProps) {
+  const surprises = findSurprises(runs);
+
+  if (surprises.length === 0) {
+    return (
+      <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
+        No surprises yet. Run more experiments with different models to find upsets.
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      <h3 style={{ marginBottom: "12px" }}>Surprises</h3>
+      <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+        Where weaker configs outperformed stronger ones
+      </p>
+      <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
+        {surprises.map((s, i) => (
+          <div key={i} className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)" }}>
+            <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+              {s.title}
+            </div>
+            <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
+              {s.detail}
+            </div>
+            <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
+              <div>
+                <span style={{ color: "var(--green)" }}>{s.weaker.model}</span>
+                <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+                  {(s.weaker.score * 100).toFixed(0)}%
+                </span>
+              </div>
+              <div style={{ color: "var(--text-muted)" }}>vs</div>
+              <div>
+                <span style={{ color: "var(--red)" }}>{s.stronger.model}</span>
+                <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+                  {(s.stronger.score * 100).toFixed(0)}%
+                </span>
+              </div>
+            </div>
+          </div>
+        ))}
+      </div>
+    </div>
+  );
+}
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -2,16 +2,26 @@
 import Base from "../layouts/Base.astro";
 import { loadAllRuns } from "../lib/data";
 import Insights from "../components/Insights";
+import ScatterPlot from "../components/ScatterPlot";
+import Surprises from "../components/Surprises";
 
 const runs = loadAllRuns();
 ---
 
 <Base title="Insights">
   <h1 style="margin-bottom: 8px;">Insights</h1>
-  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
-    Which variables actually move the needle? Tornado charts show main effects,
-    heatmaps reveal interactions.
+  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
+    Which variables move the needle? Where do weaker configs win?
   </p>
 
-  <Insights client:load runs={runs} />
+  <Surprises client:load runs={runs} />
+
+  <div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+    <ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
+    <ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />
+  </div>
+
+  <div style="margin-top: 32px;">
+    <Insights client:load runs={runs} />
+  </div>
 </Base>

Impressum · Datenschutz