loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 42135ccf8f0b74d916836155da84957a9875e4f3
parent 85f722fa4936da91e87193ac60a407948670d8c6
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 07:39:56 +0200

Grid table: grouped view with score/cost ranges per config cell

Default view now groups runs by config (cell_id), showing:
- Score as avg with min-max range (e.g., "72% 68%-80%")
- Ranges highlighted yellow when variance is high
- Pass rate as "2/3" instead of individual PASS/FAIL
- Cost as avg with min-max range
- Links to all individual runs (#1 #2 #3)
- Toggle button to switch between grouped and individual views

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Grid.tsx | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 165 insertions(+), 43 deletions(-)

diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx @@ -59,10 +59,63 @@ function getSortValue(run: Run, key: SortKey): string | number { } } +interface CellGroup { + cell_id: string; + runs: Run[]; + meta: Run["meta"]; + scores: number[]; + costs: number[]; + times: number[]; + turns: number[]; + avg: { score: number; cost: number; time: number; turns: number }; + min: { score: number; cost: number }; + max: { score: number; cost: number }; +} + +function groupByCellId(runs: Run[]): CellGroup[] { + const groups: Record<string, Run[]> = {}; + for (const run of runs) { + const cellId = run.meta.cell_id; + (groups[cellId] ??= []).push(run); + } + return Object.entries(groups).map(([cell_id, cellRuns]) => { + const scores = cellRuns.map(r => r.eval_results?.score).filter((s): s is number => s != null); + const costs = cellRuns.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null); + const times = cellRuns.map(r => r.meta.wall_time_seconds).filter((t): t is number => t != null); + const turnsList = cellRuns.map(r => r.claude_output?.num_turns).filter((t): t is number => t != null); + const avg = (arr: number[]) => arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : 0; + return { + cell_id, + runs: cellRuns, + meta: cellRuns[0].meta, + scores, costs, times, turns: turnsList, + avg: { score: avg(scores), cost: avg(costs), time: avg(times), turns: avg(turnsList) }, + min: { score: scores.length > 0 ? Math.min(...scores) : 0, cost: costs.length > 0 ? Math.min(...costs) : 0 }, + max: { score: scores.length > 0 ? Math.max(...scores) : 0, cost: costs.length > 0 ? Math.max(...costs) : 0 }, + }; + }); +} + +function RangeCell({ min, max, avg, format }: { min: number; max: number; avg: number; format: (v: number) => string }) { + const spread = max - min; + const isWide = format === formatScore ? 
spread > 0.1 : spread > avg * 0.3; + return ( + <span style={{ fontFamily: "var(--font-mono)" }}> + <span style={{ fontWeight: 600 }}>{format(avg)}</span> + {min !== max && ( + <span style={{ fontSize: "0.65rem", color: isWide ? "var(--yellow)" : "var(--text-muted)", marginLeft: "4px" }}> + {format(min)}-{format(max)} + </span> + )} + </span> + ); +} + export default function Grid({ runs, axisValues, tasks }: GridProps) { const [filters, setFilters] = useState<Record<string, string>>({}); const [sortKey, setSortKey] = useState<SortKey>("score"); const [sortAsc, setSortAsc] = useState(false); + const [grouped, setGrouped] = useState(true); const handleSort = (key: SortKey) => { if (sortKey === key) { @@ -94,13 +147,36 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { }); }, [runs, filters, sortKey, sortAsc]); + const cellGroups = useMemo(() => { + const groups = groupByCellId(filteredRuns); + return groups.sort((a, b) => { + const va = sortKey === "score" ? a.avg.score : sortKey === "cost" ? a.avg.cost : sortKey === "time" ? a.avg.time : sortKey === "turns" ? a.avg.turns : 0; + const vb = sortKey === "score" ? b.avg.score : sortKey === "cost" ? b.avg.cost : sortKey === "time" ? b.avg.time : sortKey === "turns" ? b.avg.turns : 0; + const cmp = va < vb ? -1 : va > vb ? 1 : 0; + return sortAsc ? 
cmp : -cmp; + }); + }, [filteredRuns, sortKey, sortAsc]); + return ( <div> - <Filters - axisValues={axisValues} - tasks={tasks} - onFilterChange={setFilters} - /> + <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "12px" }}> + <Filters + axisValues={axisValues} + tasks={tasks} + onFilterChange={setFilters} + /> + <button + onClick={() => setGrouped(!grouped)} + style={{ + padding: "4px 10px", fontSize: "11px", fontFamily: "var(--font-mono)", + background: "transparent", border: "1px solid hsl(var(--border))", + color: "hsl(var(--foreground))", cursor: "pointer", textTransform: "uppercase", + letterSpacing: "0.5px", flexShrink: 0, + }} + > + {grouped ? "Show individual runs" : "Group by config"} + </button> + </div> <div className="card" style={{ overflowX: "auto" }}> <table> @@ -141,51 +217,97 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { </tr> </thead> <tbody> - {filteredRuns.length === 0 ? ( - <tr> - <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}> - {runs.length === 0 - ? "No benchmark results yet. Run the harness to generate data." - : "No results match the current filters."} - </td> - </tr> - ) : ( - filteredRuns.map((run) => ( - <tr key={run.meta.run_id}> - <td> - <a href={`/run/${run.meta.run_id}`} style={{ fontSize: "0.75rem" }}> - {formatRunId(run)} - </a> - </td> - <td>{run.meta.task}</td> - <td> - <span className="badge badge-neutral">{run.meta.model}</span> - </td> - <td>{run.meta.effort}</td> - <td>{run.meta.prompt_style}</td> - <td>{run.meta.language}</td> - <td className={`score-cell ${scoreClass(run.eval_results?.score)}`}> - {formatScore(run.eval_results?.score)} + {grouped ? ( + cellGroups.length === 0 ? ( + <tr> + <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}> + {runs.length === 0 ? "No benchmark results yet." 
: "No results match the current filters."} </td> - <td> - {run.eval_results?.functional?.pass === true ? ( - <span className="badge badge-pass">PASS</span> - ) : run.eval_results?.functional?.pass === false ? ( - <span className="badge badge-fail">FAIL</span> - ) : ( - <span className="badge badge-neutral">-</span> - )} + </tr> + ) : ( + cellGroups.map((g) => ( + <tr key={g.cell_id}> + <td> + <div style={{ fontSize: "0.75rem" }}> + {g.runs.map((r, i) => ( + <span key={r.meta.run_id}> + {i > 0 && " "} + <a href={`/run/${r.meta.run_id}`} style={{ color: "var(--accent)" }}>#{r.meta.run_number}</a> + </span> + ))} + </div> + </td> + <td>{g.meta.task}</td> + <td><span className="badge badge-neutral">{g.meta.model}</span></td> + <td>{g.meta.effort}</td> + <td>{g.meta.prompt_style}</td> + <td>{g.meta.language}</td> + <td className={`score-cell ${scoreClass(g.avg.score)}`}> + <RangeCell min={g.min.score} max={g.max.score} avg={g.avg.score} format={formatScore} /> + </td> + <td> + {(() => { + const passes = g.runs.filter(r => r.eval_results?.functional?.pass === true).length; + const fails = g.runs.filter(r => r.eval_results?.functional?.pass === false).length; + const total = g.runs.length; + if (passes === total) return <span className="badge badge-pass">{passes}/{total}</span>; + if (fails === total) return <span className="badge badge-fail">0/{total}</span>; + return <span className="badge badge-neutral">{passes}/{total}</span>; + })()} + </td> + <td> + <RangeCell min={g.min.cost} max={g.max.cost} avg={g.avg.cost} format={formatCost} /> + </td> + <td>{formatTime(Math.round(g.avg.time))}</td> + <td>{Math.round(g.avg.turns)}</td> + </tr> + )) + ) + ) : ( + filteredRuns.length === 0 ? ( + <tr> + <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}> + {runs.length === 0 ? "No benchmark results yet." 
: "No results match the current filters."} </td> - <td>{formatCost(run.claude_output?.total_cost_usd)}</td> - <td>{formatTime(run.meta.wall_time_seconds)}</td> - <td>{run.claude_output?.num_turns ?? "-"}</td> </tr> - )) + ) : ( + filteredRuns.map((run) => ( + <tr key={run.meta.run_id}> + <td> + <a href={`/run/${run.meta.run_id}`} style={{ fontSize: "0.75rem" }}> + {formatRunId(run)} + </a> + </td> + <td>{run.meta.task}</td> + <td><span className="badge badge-neutral">{run.meta.model}</span></td> + <td>{run.meta.effort}</td> + <td>{run.meta.prompt_style}</td> + <td>{run.meta.language}</td> + <td className={`score-cell ${scoreClass(run.eval_results?.score)}`}> + {formatScore(run.eval_results?.score)} + </td> + <td> + {run.eval_results?.functional?.pass === true ? ( + <span className="badge badge-pass">PASS</span> + ) : run.eval_results?.functional?.pass === false ? ( + <span className="badge badge-fail">FAIL</span> + ) : ( + <span className="badge badge-neutral">-</span> + )} + </td> + <td>{formatCost(run.claude_output?.total_cost_usd)}</td> + <td>{formatTime(run.meta.wall_time_seconds)}</td> + <td>{run.claude_output?.num_turns ?? "-"}</td> + </tr> + )) + ) )} </tbody> </table> <div style={{ padding: "12px", color: "var(--text-muted)", fontSize: "0.75rem" }}> - Showing {filteredRuns.length} of {runs.length} runs + {grouped + ? `${cellGroups.length} configs (${filteredRuns.length} runs)` + : `${filteredRuns.length} of ${runs.length} runs`} </div> </div> </div>

Impressum · Datenschutz