loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to find what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 85f722fa4936da91e87193ac60a407948670d8c6
parent 1964a38e54bd15acd879301fb499d1ccea864778
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 07:32:10 +0200

Surprise cards now clickable with run details and outlier detection

Click a surprise card to see:
- Which config dimensions differed
- Every run involved with model, score, cost, and link to run page

New outlier detection:
- Flags individual sonnet/opus runs that scored far below haiku average
- These show exactly which config caused the poor performance

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Surprises.tsx | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 198 insertions(+), 63 deletions(-)

diff --git a/dashboard/src/components/Surprises.tsx b/dashboard/src/components/Surprises.tsx @@ -1,15 +1,26 @@ +import { useState } from "react"; import type { Run } from "../lib/types"; interface SurprisesProps { runs: Run[]; } +interface RunRef { + run_id: string; + model: string; + score: number; + cost: number; + config: Record<string, string>; +} + interface Surprise { title: string; detail: string; weaker: { model: string; config: string; score: number; cost: number }; stronger: { model: string; config: string; score: number; cost: number }; magnitude: number; + runs: RunRef[]; + configDiffs: string[]; } const MODEL_RANK: Record<string, number> = { @@ -18,6 +29,34 @@ const MODEL_RANK: Record<string, number> = { opus: 3, }; +const CONFIG_KEYS = [ + "prompt_style", "language", "effort", "human_language", + "linter", "playwright", "context_file", "sub_agents", + "web_search", "max_budget", "tool_read", "tool_write", + "tool_edit", "tool_glob", "tool_grep", +]; + +function getConfigKey(meta: Run["meta"]): string { + return CONFIG_KEYS.map(k => `${k}=${(meta as Record<string, unknown>)[k]}`).join("|"); +} + +function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] { + // Find which config values differ between the two groups + const diffs: string[] = []; + const metaA = runsA[0]?.meta; + const metaB = runsB[0]?.meta; + if (!metaA || !metaB) return diffs; + + for (const key of CONFIG_KEYS) { + const va = String((metaA as Record<string, unknown>)[key]); + const vb = String((metaB as Record<string, unknown>)[key]); + if (va !== vb) { + diffs.push(`${key}: ${va} vs ${vb}`); + } + } + return diffs; +} + function findSurprises(runs: Run[]): Surprise[] { const surprises: Surprise[] = []; @@ -25,28 +64,16 @@ function findSurprises(runs: Run[]): Surprise[] { const configGroups: Record<string, Run[]> = {}; for (const run of runs) { if (run.eval_results?.score == null) continue; - // Build config key without model const m = run.meta; - const key = [ - m.prompt_style, 
m.language, m.effort, - m.linter, m.playwright, m.context_file, - m.sub_agents, m.web_search, m.max_budget, - ].join("|"); + const key = CONFIG_KEYS.map(k => (m as Record<string, unknown>)[k]).join("|"); (configGroups[key] ??= []).push(run); } // Within each config group, compare models for (const [, group] of Object.entries(configGroups)) { - const byModel: Record<string, { scores: number[]; costs: number[] }> = {}; + const byModel: Record<string, Run[]> = {}; for (const run of group) { - const model = run.meta.model; - if (!byModel[model]) byModel[model] = { scores: [], costs: [] }; - if (run.eval_results?.score != null) { - byModel[model].scores.push(run.eval_results.score); - } - if (run.claude_output?.total_cost_usd != null) { - byModel[model].costs.push(run.claude_output.total_cost_usd); - } + (byModel[run.meta.model] ??= []).push(run); } const models = Object.keys(byModel); @@ -57,60 +84,127 @@ function findSurprises(runs: Run[]): Surprise[] { const rankA = MODEL_RANK[a] || 0; const rankB = MODEL_RANK[b] || 0; - const avgScoreA = byModel[a].scores.reduce((s, v) => s + v, 0) / byModel[a].scores.length; - const avgScoreB = byModel[b].scores.reduce((s, v) => s + v, 0) / byModel[b].scores.length; - const avgCostA = byModel[a].costs.length > 0 ? byModel[a].costs.reduce((s, v) => s + v, 0) / byModel[a].costs.length : 0; - const avgCostB = byModel[b].costs.length > 0 ? byModel[b].costs.reduce((s, v) => s + v, 0) / byModel[b].costs.length : 0; + const runsA = byModel[a]; + const runsB = byModel[b]; + const scoresA = runsA.map(r => r.eval_results!.score!); + const scoresB = runsB.map(r => r.eval_results!.score!); + const avgA = scoresA.reduce((s, v) => s + v, 0) / scoresA.length; + const avgB = scoresB.reduce((s, v) => s + v, 0) / scoresB.length; + + const costsA = runsA.map(r => r.claude_output?.total_cost_usd ?? 0); + const costsB = runsB.map(r => r.claude_output?.total_cost_usd ?? 
0); + const avgCostA = costsA.reduce((s, v) => s + v, 0) / costsA.length; + const avgCostB = costsB.reduce((s, v) => s + v, 0) / costsB.length; - // Surprise: weaker model scores higher - if (rankA < rankB && avgScoreA > avgScoreB + 0.02) { + const allRuns = [ + ...runsA.map(r => ({ + run_id: r.meta.run_id, model: a, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ...runsB.map(r => ({ + run_id: r.meta.run_id, model: b, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ]; + + if (rankA < rankB && avgA > avgB + 0.02) { surprises.push({ title: `${a} beat ${b}`, - detail: `${a} scored ${(avgScoreA * 100).toFixed(0)}% vs ${b} at ${(avgScoreB * 100).toFixed(0)}%`, - weaker: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA }, - stronger: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB }, - magnitude: avgScoreA - avgScoreB, + detail: `${a} scored ${(avgA * 100).toFixed(0)}% vs ${b} at ${(avgB * 100).toFixed(0)}%`, + weaker: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA }, + stronger: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB }, + magnitude: avgA - avgB, + runs: allRuns, + configDiffs: getConfigDiffs(runsA, runsB), }); - } else if (rankB < rankA && avgScoreB > avgScoreA + 0.02) { + } else if (rankB < rankA && avgB > avgA + 0.02) { surprises.push({ title: `${b} beat ${a}`, - detail: `${b} scored ${(avgScoreB * 100).toFixed(0)}% vs ${a} at ${(avgScoreA * 100).toFixed(0)}%`, - weaker: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB }, - stronger: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA }, - 
magnitude: avgScoreB - avgScoreA, + detail: `${b} scored ${(avgB * 100).toFixed(0)}% vs ${a} at ${(avgA * 100).toFixed(0)}%`, + weaker: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB }, + stronger: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA }, + magnitude: avgB - avgA, + runs: allRuns, + configDiffs: getConfigDiffs(runsB, runsA), }); } } } } - // Also find cases where simple prompt beats detailed + // Find individual outlier runs where sonnet scored far below haiku + const haikuScores = runs.filter(r => r.meta.model === "haiku" && r.eval_results?.score != null).map(r => r.eval_results!.score!); + const haikuMean = haikuScores.length > 0 ? haikuScores.reduce((a, b) => a + b, 0) / haikuScores.length : 0; + + for (const run of runs) { + if (run.eval_results?.score == null) continue; + const model = run.meta.model; + const score = run.eval_results.score; + const rank = MODEL_RANK[model] || 0; + + // Flag if a "stronger" model scored significantly below haiku average + if (rank > 1 && score < haikuMean - 0.15) { + surprises.push({ + title: `${model} run scored far below haiku avg`, + detail: `This ${model} run scored ${(score * 100).toFixed(0)}% vs haiku average of ${(haikuMean * 100).toFixed(0)}%`, + weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 }, + stronger: { model, config: run.meta.prompt_style, score, cost: run.claude_output?.total_cost_usd ?? 0 }, + magnitude: haikuMean - score, + runs: [{ + run_id: run.meta.run_id, model, + score, cost: run.claude_output?.total_cost_usd ?? 
0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((run.meta as Record<string, unknown>)[k])])), + }], + configDiffs: CONFIG_KEYS.filter(k => { + const v = String((run.meta as Record<string, unknown>)[k]); + return v !== "on" && v !== "typescript" && v !== "en" && v !== "high" && v !== "simple"; + }).map(k => `${k}: ${(run.meta as Record<string, unknown>)[k]}`), + }); + } + } + + // Simple prompt beats detailed const promptGroups: Record<string, Run[]> = {}; for (const run of runs) { if (run.eval_results?.score == null) continue; const m = run.meta; - const key = [ - m.model, m.language, m.effort, - m.linter, m.playwright, m.context_file, - ].join("|"); + const key = [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|"); (promptGroups[key] ??= []).push(run); } for (const [, group] of Object.entries(promptGroups)) { - const byPrompt: Record<string, number[]> = {}; + const byPrompt: Record<string, Run[]> = {}; for (const run of group) { - (byPrompt[run.meta.prompt_style] ??= []).push(run.eval_results!.score!); + (byPrompt[run.meta.prompt_style] ??= []).push(run); } if (byPrompt.simple && byPrompt.detailed) { - const avgSimple = byPrompt.simple.reduce((a, b) => a + b, 0) / byPrompt.simple.length; - const avgDetailed = byPrompt.detailed.reduce((a, b) => a + b, 0) / byPrompt.detailed.length; + const simpleRuns = byPrompt.simple; + const detailedRuns = byPrompt.detailed; + const avgSimple = simpleRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / simpleRuns.length; + const avgDetailed = detailedRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / detailedRuns.length; if (avgSimple > avgDetailed + 0.05) { + const allRuns = [ + ...simpleRuns.map(r => ({ + run_id: r.meta.run_id, model: r.meta.model, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 
0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ...detailedRuns.map(r => ({ + run_id: r.meta.run_id, model: r.meta.model, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ]; surprises.push({ title: "Simple prompt beat detailed", detail: `${group[0].meta.model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`, weaker: { model: group[0].meta.model, config: "simple", score: avgSimple, cost: 0 }, stronger: { model: group[0].meta.model, config: "detailed", score: avgDetailed, cost: 0 }, magnitude: avgSimple - avgDetailed, + runs: allRuns, + configDiffs: ["prompt_style: simple vs detailed"], }); } } @@ -119,6 +213,69 @@ function findSurprises(runs: Run[]): Surprise[] { return surprises.sort((a, b) => b.magnitude - a.magnitude); } +function SurpriseCard({ surprise }: { surprise: Surprise }) { + const [expanded, setExpanded] = useState(false); + + return ( + <div className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)", cursor: "pointer" }} onClick={() => setExpanded(!expanded)}> + <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}> + {surprise.title} + </div> + <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}> + {surprise.detail} + </div> + <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}> + <div> + <span style={{ color: "var(--green)" }}>{surprise.weaker.model}</span> + <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}> + {(surprise.weaker.score * 100).toFixed(0)}% + </span> + </div> + <div style={{ color: "var(--text-muted)" }}>vs</div> + <div> + <span style={{ color: "var(--red)" }}>{surprise.stronger.model}</span> + <span style={{ 
color: "var(--text-muted)", marginLeft: "6px" }}> + {(surprise.stronger.score * 100).toFixed(0)}% + </span> + </div> + </div> + + {expanded && ( + <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}> + {surprise.configDiffs.length > 0 && ( + <div style={{ marginBottom: "8px" }}> + <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>Config differences</div> + {surprise.configDiffs.map((diff, i) => ( + <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div> + ))} + </div> + )} + + <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}> + Runs ({surprise.runs.length}) + </div> + {surprise.runs.map((r) => ( + <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}> + <span style={{ color: r.model === surprise.weaker.model ? "var(--green)" : "var(--red)", width: "50px" }}> + {r.model} + </span> + <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}> + {(r.score * 100).toFixed(0)}% + </span> + <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}> + ${r.cost.toFixed(2)} + </span> + <a href={`/run/${r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }}> + view + </a> + </div> + ))} + </div> + )} + </div> + ); +} + export default function Surprises({ runs }: SurprisesProps) { const surprises = findSurprises(runs); @@ -134,33 +291,11 @@ export default function Surprises({ runs }: SurprisesProps) { <div> <h3 style={{ marginBottom: "12px" }}>Surprises</h3> <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}> - Where weaker configs outperformed stronger ones + Click to expand. Where weaker configs outperformed stronger ones. 
</p> <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}> {surprises.map((s, i) => ( - <div key={i} className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)" }}> - <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}> - {s.title} - </div> - <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}> - {s.detail} - </div> - <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}> - <div> - <span style={{ color: "var(--green)" }}>{s.weaker.model}</span> - <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}> - {(s.weaker.score * 100).toFixed(0)}% - </span> - </div> - <div style={{ color: "var(--text-muted)" }}>vs</div> - <div> - <span style={{ color: "var(--red)" }}>{s.stronger.model}</span> - <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}> - {(s.stronger.score * 100).toFixed(0)}% - </span> - </div> - </div> - </div> + <SurpriseCard key={i} surprise={s} /> ))} </div> </div>

Impressum · Datenschutz