loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 54744304ea79848d404b42ebee7a6a45a6228496
parent 82f6a5b0b78b3ac64bfad4963937ddcd7687a317
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 08:10:15 +0200

Add cell detail page with run comparison and artifact gallery

New /cell/{id} page showing everything about one configuration:
- Summary stats: avg/min/max/median score, consistency rating
- Config pills: all axis values with active highlighted
- Score breakdown: per-dimension bars with min-max range bands
- Run comparison table: all runs side by side, best/worst highlighted
  green/red for each metric (score, cost, turns, time, all eval dims)
- Artifact gallery: side-by-side Tetris iframes for each run
- Agent behavior: tool calls, wasted turns, productivity per run

Grid grouped view now links to cell pages via "cell" link.
154 pages total (39 cells + runs + index pages).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/CellDetail.tsx | 561 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/components/Grid.tsx | 3 ++-
A dashboard/src/pages/cell/[id].astro | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 612 insertions(+), 1 deletion(-)

diff --git a/dashboard/src/components/CellDetail.tsx b/dashboard/src/components/CellDetail.tsx @@ -0,0 +1,561 @@ +import type { Run, AxisName } from "../lib/types"; + /** Props for CellDetail: the runs to display and, per axis name, the list of values rendered as pills. */ +interface CellDetailProps { + runs: Run[]; + axisValues: Record<AxisName, string[]>; +} + /** Axis keys looked up on the first run's meta, paired with the label shown in the Configuration card. */ +const AXIS_CONFIG: Array<{ key: string; label: string }> = [ + { key: "model", label: "Model" }, + { key: "effort", label: "Effort" }, + { key: "prompt_style", label: "Prompt" }, + { key: "language", label: "Language" }, + { key: "human_language", label: "Human Lang" }, + { key: "tool_read", label: "Read" }, + { key: "tool_write", label: "Write" }, + { key: "tool_edit", label: "Edit" }, + { key: "tool_glob", label: "Glob" }, + { key: "tool_grep", label: "Grep" }, + { key: "linter", label: "Linter" }, + { key: "playwright", label: "Playwright" }, + { key: "context_file", label: "Context" }, + { key: "sub_agents", label: "Sub-agents" }, + { key: "web_search", label: "Web Search" }, + { key: "max_budget", label: "Budget" }, +]; + /** Keys read from each run's eval_results (the code reads a .score under each key), paired with display labels. */ +const EVAL_DIMENSIONS: Array<{ key: string; label: string }> = [ + { key: "structural", label: "Structural" }, + { key: "functional", label: "Functional" }, + { key: "quality", label: "Quality" }, + { key: "code_analysis", label: "Code Analysis" }, + { key: "gameplay_bot", label: "Gameplay Bot" }, + { key: "transcript_analysis", label: "Transcript" }, +]; + +// -- Helpers -- + /** Arithmetic mean of arr; returns 0 for an empty array. */ +function avg(arr: number[]): number { + return arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : 0; +} + /** Median of arr, computed on a sorted copy so the input is not mutated; returns 0 for an empty array. */ +function median(arr: number[]): number { + if (arr.length === 0) return 0; + const sorted = [...arr].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 !== 0 ? 
sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2; +} + /** Multiplies v by 100, rounds to a whole number and appends "%" (0.5 becomes "50%"). */ +function formatPct(v: number): string { + return Math.round(v * 100) + "%"; +} + /** Formats v as "$" followed by two decimals; returns "-" for null or undefined. */ +function formatCost(v: number | null | undefined): string { + if (v === null || v === undefined) return "-"; + return "$" + v.toFixed(2); +} + /** Rounds to whole seconds and formats as "Ns" below one minute, otherwise "Mm Ss"; returns "-" for null or undefined. */ +function formatTime(seconds: number | null | undefined): string { + if (seconds === null || seconds === undefined) return "-"; + const s = Math.round(seconds); + if (s < 60) return s + "s"; + return Math.floor(s / 60) + "m " + (s % 60) + "s"; +} + /** Maps a 0-100 percentage to a CSS variable: green at 70 and above, yellow at 40 and above, red otherwise. */ +function scoreColor(pct: number): string { + if (pct >= 70) return "var(--green)"; + if (pct >= 40) return "var(--yellow)"; + return "var(--red)"; +} + +// -- Sub-components -- + /** One labelled row of pills for a single axis; the pill equal to activeValue is highlighted, all others are dimmed. */ +function ConfigPills({ + label, + activeValue, + allValues, +}: { + label: string; + activeValue: string; + allValues: string[]; +}) { + return ( + <div style={{ display: "flex", alignItems: "center", gap: "8px", marginBottom: "4px" }}> + <div style={{ + width: "80px", fontSize: "0.65rem", color: "var(--text-muted)", + textTransform: "uppercase", letterSpacing: "0.03em", textAlign: "right", flexShrink: 0, + }}> + {label} + </div> + <div style={{ display: "flex", gap: "3px", flexWrap: "wrap" }}> + {allValues.map((val) => ( + <span key={val} style={{ + padding: "1px 6px", + fontSize: "0.65rem", + fontFamily: "var(--font-mono)", + background: val === activeValue ? "rgba(255, 255, 255, 0.1)" : "transparent", + color: val === activeValue ? "#fff" : "rgba(255, 255, 255, 0.2)", + border: val === activeValue ? 
"1px solid rgba(255, 255, 255, 0.3)" : "1px solid rgba(255, 255, 255, 0.05)", + }}> + {val} + </span> + ))} + </div> + </div> + ); +} + /** Bar for one eval dimension: a fill from 0 to the average percentage plus, when the rounded min and max differ, a fainter band spanning min to max. The three scores are multiplied by 100 before use. */ +function RangeBar({ + label, + avgScore, + minScore, + maxScore, +}: { + label: string; + avgScore: number; + minScore: number; + maxScore: number; +}) { + const avgPct = Math.round(avgScore * 100); + const minPct = Math.round(minScore * 100); + const maxPct = Math.round(maxScore * 100); + const color = scoreColor(avgPct); + + return ( + <div style={{ marginBottom: "8px" }}> + <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.75rem", marginBottom: "2px" }}> + <span>{label}</span> + <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, color }}> + {avgPct}% + {minPct !== maxPct && ( + <span style={{ fontSize: "0.65rem", color: "var(--text-muted)", marginLeft: "4px" }}> + {minPct}-{maxPct}% + </span> + )} + </span> + </div> + <div style={{ position: "relative", background: "var(--bg)", height: "6px", overflow: "hidden" }}> + {/* Range band */} + {minPct !== maxPct && ( + <div style={{ + position: "absolute", + left: `${minPct}%`, + width: `${maxPct - minPct}%`, + height: "100%", + background: color, + opacity: 0.2, + }} /> + )} + {/* Average marker */} + <div style={{ + position: "absolute", + left: 0, + width: `${avgPct}%`, + height: "100%", + background: color, + opacity: 0.7, + }} /> + </div> + </div> + ); +} + +// -- Main component -- + /** Detail view for one cell: summary card, configuration pills, per-dimension score breakdown, run comparison table, artifact previews and an agent behavior table. */ +export default function CellDetail({ runs, axisValues }: CellDetailProps) { + const meta = runs[0].meta; /* NOTE(review): throws when runs is empty; the [id].astro page added in this patch only builds cells holding at least one run, other callers are not visible here */ + + // Aggregate scores + const scores = runs.map(r => r.eval_results?.score).filter((s): s is number => s != null); + const costs = runs.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null); + const times = runs.map(r => r.meta.wall_time_seconds).filter((t): t is number => t != null); + const turnsList = runs.map(r => r.claude_output?.num_turns).filter((t): t is number => t != null); + + const avgScore = avg(scores); + const 
minScore = scores.length > 0 ? Math.min(...scores) : 0; + const maxScore = scores.length > 0 ? Math.max(...scores) : 0; + const medianScore = median(scores); + const range = maxScore - minScore; + const coefficientOfVariation = avgScore > 0 ? (range / avgScore) * 100 : 0; /* despite the name this is (max - min) / mean as a percentage, not standard deviation / mean; the UI labels it "range/avg" */ + + let consistencyLabel: string; + let consistencyColor: string; + if (coefficientOfVariation < 10) { + consistencyLabel = "Highly consistent"; + consistencyColor = "var(--green)"; + } else if (coefficientOfVariation <= 25) { + consistencyLabel = "Moderate"; + consistencyColor = "var(--yellow)"; + } else { + consistencyLabel = "Volatile"; + consistencyColor = "var(--red)"; + } + + // Per-dimension aggregation + const dimStats = EVAL_DIMENSIONS.map(({ key, label }) => { + const dimScores = runs + .map(r => (r.eval_results as Record<string, any>)?.[key]?.score) + .filter((s): s is number => s != null); + return { + key, + label, + avg: avg(dimScores), + min: dimScores.length > 0 ? Math.min(...dimScores) : 0, + max: dimScores.length > 0 ? Math.max(...dimScores) : 0, + count: dimScores.length, + }; + }); + + // Comparison table metrics + const comparisonRows: Array<{ + label: string; + values: (string | number | null)[]; + type: "pct" | "cost" | "time" | "number"; + }> = [ + { + label: "Overall Score", + values: runs.map(r => r.eval_results?.score ?? null), + type: "pct", + }, + ...EVAL_DIMENSIONS.map(({ key, label }) => ({ + label, + values: runs.map(r => (r.eval_results as Record<string, any>)?.[key]?.score ?? null), + type: "pct" as const, + })), + { + label: "Cost", + values: runs.map(r => r.claude_output?.total_cost_usd ?? null), + type: "cost" as const, + }, + { + label: "Turns", + values: runs.map(r => r.claude_output?.num_turns ?? null), + type: "number" as const, + }, + { + label: "Wall Time", + values: runs.map(r => r.meta.wall_time_seconds ?? 
null), + type: "time" as const, + }, + ]; + + // Agent behavior data + const agentData = runs.map(r => { + const ta = (r.eval_results as Record<string, any>)?.transcript_analysis; + return { + run_id: r.meta.run_id, + run_number: r.meta.run_number, + tool_calls: ta?.tool_calls?.total ?? null, + bash: ta?.tool_calls?.bash ?? null, + write: ta?.tool_calls?.write ?? null, + edit: ta?.tool_calls?.edit ?? null, + wasted_turns: ta?.wasted_turns?.total ?? null, + productivity: ta?.productivity_ratio ?? null, + self_tested: ta?.self_tested ?? null, + errors: ta?.errors_encountered ?? null, + thinking_blocks: ta?.thinking_blocks ?? null, + }; + }); + + // Artifact check + const hasArtifact = meta.task === "tetris" || meta.task === "bookmarks-api"; + + return ( + <div style={{ display: "flex", flexDirection: "column", gap: "20px" }}> + + {/* A. Header row: three cards */} + <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr 1fr", gap: "16px" }}> + + {/* Summary stats card */} + <div className="card" style={{ padding: "16px" }}> + <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Summary</h3> + + {/* Score range visual */} + <div style={{ marginBottom: "16px" }}> + <div style={{ display: "flex", justifyContent: "space-between", alignItems: "baseline", marginBottom: "4px" }}> + <span style={{ fontSize: "0.65rem", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px" }}> + Score + </span> + <span style={{ + fontFamily: "var(--font-mono)", fontWeight: 700, fontSize: "1.75rem", + color: scoreColor(Math.round(avgScore * 100)), + }}> + {formatPct(avgScore)} + </span> + </div> + <div style={{ position: "relative", background: "var(--bg)", height: "8px", marginBottom: "4px" }}> + {/* Range band */} + <div style={{ + position: "absolute", + left: `${Math.round(minScore * 100)}%`, + width: `${Math.round((maxScore - minScore) * 100)}%`, + height: "100%", + background: scoreColor(Math.round(avgScore * 100)), + opacity: 0.3, + }} /> + {/* Avg 
marker */} + <div style={{ + position: "absolute", + left: `${Math.round(avgScore * 100)}%`, + top: "-2px", + width: "2px", + height: "12px", + background: scoreColor(Math.round(avgScore * 100)), + }} /> + </div> + <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.6rem", color: "var(--text-muted)" }}> + <span>min {formatPct(minScore)}</span> + <span>med {formatPct(medianScore)}</span> + <span>max {formatPct(maxScore)}</span> + </div> + </div> + + <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "8px", fontSize: "0.75rem" }}> + <div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem", textTransform: "uppercase", letterSpacing: "0.5px" }}>Avg Cost</div> + <div style={{ fontFamily: "var(--font-mono)", fontWeight: 600 }}>{formatCost(avg(costs)) /* NOTE(review): avg() yields 0 for an empty list, so this prints "$0.00" rather than "-" when no run has cost data; Avg Turns below guards that case */}</div> + </div> + <div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem", textTransform: "uppercase", letterSpacing: "0.5px" }}>Avg Turns</div> + <div style={{ fontFamily: "var(--font-mono)", fontWeight: 600 }}>{turnsList.length > 0 ? 
Math.round(avg(turnsList)) : "-"}</div> + </div> + <div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem", textTransform: "uppercase", letterSpacing: "0.5px" }}>Avg Time</div> + <div style={{ fontFamily: "var(--font-mono)", fontWeight: 600 }}>{formatTime(avg(times)) /* NOTE(review): same empty-list case as Avg Cost; prints "0s" when no run has a wall time */}</div> + </div> + <div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem", textTransform: "uppercase", letterSpacing: "0.5px" }}>Runs</div> + <div style={{ fontFamily: "var(--font-mono)", fontWeight: 600 }}>{runs.length}</div> + </div> + </div> + + <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "8px" }}> + <div style={{ fontSize: "0.6rem", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "2px" }}> + Consistency + </div> + <div style={{ display: "flex", alignItems: "baseline", gap: "6px" }}> + <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "0.8rem", color: consistencyColor }}> + {consistencyLabel} + </span> + <span style={{ fontSize: "0.65rem", color: "var(--text-muted)" }}> + {coefficientOfVariation.toFixed(0)}% range/avg + </span> + </div> + </div> + </div> + + {/* Configuration card */} + <div className="card" style={{ padding: "16px" }}> + <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Configuration</h3> + {AXIS_CONFIG.map(({ key, label }) => { + const active = String((meta as unknown as Record<string, unknown>)[key] ?? ""); + const all = (axisValues as Record<string, string[]>)[key] || [active]; + if (!active) return null; + return <ConfigPills key={key} label={label} activeValue={active} allValues={all} />; + })} + </div> + + {/* Variance card */} + <div className="card" style={{ padding: "16px" }}> + <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Score Breakdown</h3> + {dimStats.map((d) => ( + d.count > 0 ? 
( + <RangeBar + key={d.key} + label={d.label} + avgScore={d.avg} + minScore={d.min} + maxScore={d.max} + /> + ) : ( + <div key={d.key} style={{ marginBottom: "8px" }}> + <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.75rem", marginBottom: "2px" }}> + <span>{d.label}</span> + <span style={{ color: "var(--text-muted)" }}>N/A</span> + </div> + </div> + ) + ))} + </div> + </div> + + {/* B. Run comparison table */} + <div className="card" style={{ padding: "16px", overflowX: "auto" }}> + <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Run Comparison</h3> + <table> + <thead> + <tr> + <th style={{ textAlign: "left" }}>Metric</th> + {runs.map((r) => ( + <th key={r.meta.run_id} style={{ textAlign: "center" }}> + <a href={`/run/${r.meta.run_id}`} style={{ color: "var(--accent)" }}> + Run #{r.meta.run_number} + </a> + </th> + ))} + </tr> + </thead> + <tbody> + {comparisonRows.map((row) => { + const numericVals = row.values.filter((v): v is number => v != null); + const best = numericVals.length > 0 ? ( + row.type === "cost" || row.type === "time" ? Math.min(...numericVals) : Math.max(...numericVals) + ) : null; + const worst = numericVals.length > 0 ? ( + row.type === "cost" || row.type === "time" ? 
Math.max(...numericVals) : Math.min(...numericVals) + ) : null; + const allSame = best !== null && best === worst; /* lower counts as better only for "cost" and "time" rows; the "number" row (Turns) marks the highest value green. NOTE(review): confirm that is intended */ + + return ( + <tr key={row.label}> + <td style={{ fontSize: "0.75rem" }}>{row.label}</td> + {row.values.map((v, i) => { + let display: string; + if (v === null || v === undefined) { + display = "-"; + } else if (row.type === "pct") { + display = formatPct(v as number); + } else if (row.type === "cost") { + display = formatCost(v as number); + } else if (row.type === "time") { + display = formatTime(v as number); + } else { + display = String(v); + } + + let cellColor: string | undefined; + if (v !== null && !allSame) { + if (v === best) cellColor = "var(--green)"; + else if (v === worst) cellColor = "var(--red)"; + } + + return ( + <td key={i} style={{ + textAlign: "center", + fontFamily: "var(--font-mono)", + fontSize: "0.75rem", + color: cellColor, + fontWeight: cellColor ? 600 : undefined, + }}> + {display} + </td> + ); + })} + </tr> + ); + })} + </tbody> + </table> + </div> + + {/* C. Artifact gallery */} + {hasArtifact && ( + <div> + <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Artifacts</h3> + <div style={{ display: "grid", gridTemplateColumns: `repeat(${Math.min(runs.length, 3)}, 1fr)`, gap: "16px" }}> + {runs.map((r) => { + const artifactUrl = `/artifacts/${r.meta.run_id}/index.html`; + const score = r.eval_results?.score; + return ( + <div key={r.meta.run_id} className="card" style={{ padding: "0", overflow: "hidden" }}> + <div style={{ + padding: "8px 12px", + borderBottom: "1px solid var(--border)", + fontSize: "0.75rem", + display: "flex", + justifyContent: "space-between", + alignItems: "center", + }}> + <a href={`/run/${r.meta.run_id}`} style={{ color: "var(--accent)", fontWeight: 600 }}> + Run #{r.meta.run_number} + </a> + <span style={{ + fontFamily: "var(--font-mono)", + fontWeight: 600, + color: score != null ? scoreColor(Math.round(score * 100)) : "var(--text-muted)", + }}> + {score != null ? 
formatPct(score) : "-"} + </span> + </div> + <iframe + srcDoc={/* wrapper document whose only content is a second iframe, sandboxed with allow-scripts alone, that loads artifactUrl */ `<!DOCTYPE html><html style="height:100%"><head><meta charset="UTF-8"></head><body style="margin:0;height:100%"><iframe src="${artifactUrl}" style="width:100%;height:100%;border:none" sandbox="allow-scripts"></iframe></body></html>`} + style={{ + width: "100%", + height: "50vh", + border: "none", + background: "#fff", + }} + sandbox="allow-scripts allow-same-origin" + title={`Run #${r.meta.run_number} preview`} + /> + </div> + ); + })} + </div> + </div> + )} + + {/* D. Agent behavior comparison */} + {agentData.some(a => a.tool_calls !== null) && ( + <div className="card" style={{ padding: "16px", overflowX: "auto" }}> + <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Agent Behavior</h3> + <table> + <thead> + <tr> + <th style={{ textAlign: "left" }}>Metric</th> + {agentData.map((a) => ( + <th key={a.run_id} style={{ textAlign: "center" }}> + <a href={`/run/${a.run_id}`} style={{ color: "var(--accent)" }}> + Run #{a.run_number} + </a> + </th> + ))} + </tr> + </thead> + <tbody> + {[ + { label: "Tool calls", key: "tool_calls" as const }, + { label: "Bash", key: "bash" as const }, + { label: "Write", key: "write" as const }, + { label: "Edit", key: "edit" as const }, + { label: "Wasted turns", key: "wasted_turns" as const }, + { label: "Errors", key: "errors" as const }, + { label: "Thinking blocks", key: "thinking_blocks" as const }, + ].map((row) => ( + <tr key={row.label}> + <td style={{ fontSize: "0.75rem" }}>{row.label}</td> + {agentData.map((a, i) => ( + <td key={i} style={{ textAlign: "center", fontFamily: "var(--font-mono)", fontSize: "0.75rem" }}> + {a[row.key] ?? "-"} + </td> + ))} + </tr> + ))} + <tr> + <td style={{ fontSize: "0.75rem" }}>Productivity</td> + {agentData.map((a, i) => ( + <td key={i} style={{ textAlign: "center", fontFamily: "var(--font-mono)", fontSize: "0.75rem" }}> + {a.productivity != null ? 
Math.round(a.productivity * 100) + "%" : "-"} + </td> + ))} + </tr> + <tr> + <td style={{ fontSize: "0.75rem" }}>Self-tested</td> + {agentData.map((a, i) => ( + <td key={i} style={{ + textAlign: "center", + fontFamily: "var(--font-mono)", + fontSize: "0.75rem", + color: a.self_tested === true ? "var(--green)" : a.self_tested === false ? "var(--text-muted)" : undefined, + }}> + {a.self_tested === true ? "yes" : a.self_tested === false ? "no" : "-"} + </td> + ))} + </tr> + </tbody> + </table> + </div> + )} + </div> + ); +} diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx @@ -228,7 +228,8 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { cellGroups.map((g) => ( <tr key={g.cell_id}> <td> - <div style={{ fontSize: "0.75rem" }}> + <div style={{ fontSize: "0.75rem", display: "flex", alignItems: "center", gap: "6px" }}> + <a href={`/cell/${g.cell_id}`} style={{ color: "var(--accent)", fontSize: "0.65rem", textTransform: "uppercase", letterSpacing: "0.5px", opacity: 0.7 }} title="View cell detail">cell</a> {g.runs.map((r, i) => ( <span key={r.meta.run_id}> {i > 0 && " "} diff --git a/dashboard/src/pages/cell/[id].astro b/dashboard/src/pages/cell/[id].astro @@ -0,0 +1,49 @@ +--- +import Base from "../../layouts/Base.astro"; +import { loadAllRuns, getAxisValues } from "../../lib/data"; +import CellDetail from "../../components/CellDetail"; + /** Builds one static path per distinct run.meta.cell_id; each page receives that cell's runs plus the axis values computed from all runs. */ +export function getStaticPaths() { + const runs = loadAllRuns(); + const axisValues = getAxisValues(runs); + + // Group runs by cell_id + const cells = new Map(); + for (const run of runs) { + const id = run.meta.cell_id; + if (!cells.has(id)) cells.set(id, []); + cells.get(id).push(run); + } + + return [...cells.entries()].map(([cellId, cellRuns]) => ({ + params: { id: cellId }, + props: { cellRuns, axisValues }, + })); +} + +const { cellRuns, axisValues } = Astro.props; +const firstRun = cellRuns[0]; +const configSummary = `${firstRun.meta.model} / ${firstRun.meta.prompt_style} 
/ ${firstRun.meta.language}`; +--- + +<Base title={`${firstRun.meta.task} - ${configSummary}`}> + <div style="margin-bottom: 16px;"> + <a href="/" style="font-size: 0.875rem;">Back to Grid</a> + </div> + <div style="display: flex; align-items: baseline; gap: 12px; margin-bottom: 4px;"> + <h1 style="font-size: 1.5rem;">{firstRun.meta.task}</h1> + <span style="color: var(--text-muted); font-size: 0.875rem;">{cellRuns.length} runs</span> + </div> + <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.8rem;"> + {configSummary} + </p> + + <CellDetail client:load runs={cellRuns} axisValues={axisValues} /> +</Base> + +<style> + :global(.container) { + max-width: none !important; + padding: 0 32px !important; + } +</style>

Impressum · Datenschutz