commit 42135ccf8f0b74d916836155da84957a9875e4f3
parent 85f722fa4936da91e87193ac60a407948670d8c6
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 07:39:56 +0200
Grid table: grouped view with score/cost ranges per config cell
Default view now groups runs by config (cell_id), showing:
- Score as avg with min-max range (e.g., "72% 68%-80%")
- Ranges highlighted yellow when variance is high
- Pass rate as "2/3" instead of individual PASS/FAIL
- Cost as avg with min-max range
- Links to all individual runs (#1 #2 #3)
- Toggle button to switch between grouped and individual views
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 165 insertions(+), 43 deletions(-)
diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx
@@ -59,10 +59,63 @@ function getSortValue(run: Run, key: SortKey): string | number {
}
}
+/** Aggregated view of every run that shares one `cell_id`. */
+interface CellGroup {
+  cell_id: string;
+  /** Metadata copied from the first run of the group. */
+  meta: Run["meta"];
+  /** All runs that belong to this cell. */
+  runs: Run[];
+
+  // Raw per-run samples; runs with a missing value are left out of the
+  // corresponding list, so these may be shorter than `runs`.
+  scores: number[];
+  costs: number[];
+  times: number[];
+  turns: number[];
+
+  /** Arithmetic mean of each sample list (0 when the list is empty). */
+  avg: {
+    score: number;
+    cost: number;
+    time: number;
+    turns: number;
+  };
+  /** Smallest score / cost in the group (0 when there are no samples). */
+  min: {
+    score: number;
+    cost: number;
+  };
+  /** Largest score / cost in the group (0 when there are no samples). */
+  max: {
+    score: number;
+    cost: number;
+  };
+}
+
+/**
+ * Buckets runs by `meta.cell_id` and computes per-bucket aggregates.
+ *
+ * Only non-null metric values are collected, so a run with a missing score,
+ * cost, wall time or turn count is still listed in `runs` but does not
+ * contribute to that metric. A metric with no values at all reports 0 for
+ * its average, minimum and maximum.
+ *
+ * @param runs Runs to group; the array itself is not modified.
+ * @returns One group per distinct cell id, in order of first appearance.
+ */
+function groupByCellId(runs: Run[]): CellGroup[] {
+  // A Map rather than a plain object: with `{}` a cell id such as
+  // "constructor" or "__proto__" resolves to an inherited Object.prototype
+  // member, so `.push` would throw or the bucket would silently be lost.
+  const buckets = new Map<string, Run[]>();
+  for (const run of runs) {
+    const cellId = run.meta.cell_id;
+    const bucket = buckets.get(cellId);
+    if (bucket) {
+      bucket.push(run);
+    } else {
+      buckets.set(cellId, [run]);
+    }
+  }
+
+  // Mean of a list, defined as 0 for an empty list so callers never see NaN.
+  const avg = (arr: number[]) => (arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : 0);
+
+  return Array.from(buckets, ([cell_id, cellRuns]) => {
+    const scores = cellRuns.map(r => r.eval_results?.score).filter((s): s is number => s != null);
+    const costs = cellRuns.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null);
+    const times = cellRuns.map(r => r.meta.wall_time_seconds).filter((t): t is number => t != null);
+    const turnsList = cellRuns.map(r => r.claude_output?.num_turns).filter((t): t is number => t != null);
+    return {
+      cell_id,
+      runs: cellRuns,
+      // Every bucket holds at least one run, so index 0 always exists.
+      meta: cellRuns[0].meta,
+      scores, costs, times, turns: turnsList,
+      avg: { score: avg(scores), cost: avg(costs), time: avg(times), turns: avg(turnsList) },
+      min: { score: scores.length > 0 ? Math.min(...scores) : 0, cost: costs.length > 0 ? Math.min(...costs) : 0 },
+      max: { score: scores.length > 0 ? Math.max(...scores) : 0, cost: costs.length > 0 ? Math.max(...costs) : 0 },
+    };
+  });
+}
+
+/**
+ * Shows an average value followed, when `min` and `max` differ, by the
+ * formatted "min-max" range in smaller type.
+ *
+ * The range is coloured yellow when the spread is considered wide: more than
+ * 0.1 when `format` is `formatScore`, otherwise more than 30% of `avg`.
+ */
+function RangeCell({
+  min,
+  max,
+  avg,
+  format,
+}: {
+  min: number;
+  max: number;
+  avg: number;
+  format: (v: number) => string;
+}) {
+  // Scores get a fixed limit; every other metric is judged relative to its average.
+  const wideLimit = format === formatScore ? 0.1 : avg * 0.3;
+  const rangeColor = max - min > wideLimit ? "var(--yellow)" : "var(--text-muted)";
+  const hasRange = min !== max;
+
+  return (
+    <span style={{ fontFamily: "var(--font-mono)" }}>
+      <span style={{ fontWeight: 600 }}>{format(avg)}</span>
+      {hasRange && (
+        <span style={{ fontSize: "0.65rem", color: rangeColor, marginLeft: "4px" }}>
+          {format(min)}-{format(max)}
+        </span>
+      )}
+    </span>
+  );
+}
+
export default function Grid({ runs, axisValues, tasks }: GridProps) {
const [filters, setFilters] = useState<Record<string, string>>({});
const [sortKey, setSortKey] = useState<SortKey>("score");
const [sortAsc, setSortAsc] = useState(false);
+ const [grouped, setGrouped] = useState(true);
const handleSort = (key: SortKey) => {
if (sortKey === key) {
@@ -94,13 +147,36 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) {
});
}, [runs, filters, sortKey, sortAsc]);
+ const cellGroups = useMemo(() => {
+ const groups = groupByCellId(filteredRuns);
+ return groups.sort((a, b) => {
+ const va = sortKey === "score" ? a.avg.score : sortKey === "cost" ? a.avg.cost : sortKey === "time" ? a.avg.time : sortKey === "turns" ? a.avg.turns : 0;
+ const vb = sortKey === "score" ? b.avg.score : sortKey === "cost" ? b.avg.cost : sortKey === "time" ? b.avg.time : sortKey === "turns" ? b.avg.turns : 0;
+ const cmp = va < vb ? -1 : va > vb ? 1 : 0;
+ return sortAsc ? cmp : -cmp;
+ });
+ }, [filteredRuns, sortKey, sortAsc]);
+
return (
<div>
- <Filters
- axisValues={axisValues}
- tasks={tasks}
- onFilterChange={setFilters}
- />
+ <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "12px" }}>
+ <Filters
+ axisValues={axisValues}
+ tasks={tasks}
+ onFilterChange={setFilters}
+ />
+ <button
+ onClick={() => setGrouped(!grouped)}
+ style={{
+ padding: "4px 10px", fontSize: "11px", fontFamily: "var(--font-mono)",
+ background: "transparent", border: "1px solid hsl(var(--border))",
+ color: "hsl(var(--foreground))", cursor: "pointer", textTransform: "uppercase",
+ letterSpacing: "0.5px", flexShrink: 0,
+ }}
+ >
+ {grouped ? "Show individual runs" : "Group by config"}
+ </button>
+ </div>
<div className="card" style={{ overflowX: "auto" }}>
<table>
@@ -141,51 +217,97 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) {
</tr>
</thead>
<tbody>
- {filteredRuns.length === 0 ? (
- <tr>
- <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}>
- {runs.length === 0
- ? "No benchmark results yet. Run the harness to generate data."
- : "No results match the current filters."}
- </td>
- </tr>
- ) : (
- filteredRuns.map((run) => (
- <tr key={run.meta.run_id}>
- <td>
- <a href={`/run/${run.meta.run_id}`} style={{ fontSize: "0.75rem" }}>
- {formatRunId(run)}
- </a>
- </td>
- <td>{run.meta.task}</td>
- <td>
- <span className="badge badge-neutral">{run.meta.model}</span>
- </td>
- <td>{run.meta.effort}</td>
- <td>{run.meta.prompt_style}</td>
- <td>{run.meta.language}</td>
- <td className={`score-cell ${scoreClass(run.eval_results?.score)}`}>
- {formatScore(run.eval_results?.score)}
+ {grouped ? (
+ cellGroups.length === 0 ? (
+ <tr>
+ <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}>
+ {runs.length === 0 ? "No benchmark results yet." : "No results match the current filters."}
</td>
- <td>
- {run.eval_results?.functional?.pass === true ? (
- <span className="badge badge-pass">PASS</span>
- ) : run.eval_results?.functional?.pass === false ? (
- <span className="badge badge-fail">FAIL</span>
- ) : (
- <span className="badge badge-neutral">-</span>
- )}
+ </tr>
+ ) : (
+ cellGroups.map((g) => (
+ <tr key={g.cell_id}>
+ <td>
+ <div style={{ fontSize: "0.75rem" }}>
+ {g.runs.map((r, i) => (
+ <span key={r.meta.run_id}>
+ {i > 0 && " "}
+ <a href={`/run/${r.meta.run_id}`} style={{ color: "var(--accent)" }}>#{r.meta.run_number}</a>
+ </span>
+ ))}
+ </div>
+ </td>
+ <td>{g.meta.task}</td>
+ <td><span className="badge badge-neutral">{g.meta.model}</span></td>
+ <td>{g.meta.effort}</td>
+ <td>{g.meta.prompt_style}</td>
+ <td>{g.meta.language}</td>
+ <td className={`score-cell ${scoreClass(g.avg.score)}`}>
+ <RangeCell min={g.min.score} max={g.max.score} avg={g.avg.score} format={formatScore} />
+ </td>
+ <td>
+ {(() => {
+ const passes = g.runs.filter(r => r.eval_results?.functional?.pass === true).length;
+ const fails = g.runs.filter(r => r.eval_results?.functional?.pass === false).length;
+ const total = g.runs.length;
+ if (passes === total) return <span className="badge badge-pass">{passes}/{total}</span>;
+ if (fails === total) return <span className="badge badge-fail">0/{total}</span>;
+ return <span className="badge badge-neutral">{passes}/{total}</span>;
+ })()}
+ </td>
+ <td>
+ <RangeCell min={g.min.cost} max={g.max.cost} avg={g.avg.cost} format={formatCost} />
+ </td>
+ <td>{formatTime(Math.round(g.avg.time))}</td>
+ <td>{Math.round(g.avg.turns)}</td>
+ </tr>
+ ))
+ )
+ ) : (
+ filteredRuns.length === 0 ? (
+ <tr>
+ <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}>
+ {runs.length === 0 ? "No benchmark results yet." : "No results match the current filters."}
</td>
- <td>{formatCost(run.claude_output?.total_cost_usd)}</td>
- <td>{formatTime(run.meta.wall_time_seconds)}</td>
- <td>{run.claude_output?.num_turns ?? "-"}</td>
</tr>
- ))
+ ) : (
+ filteredRuns.map((run) => (
+ <tr key={run.meta.run_id}>
+ <td>
+ <a href={`/run/${run.meta.run_id}`} style={{ fontSize: "0.75rem" }}>
+ {formatRunId(run)}
+ </a>
+ </td>
+ <td>{run.meta.task}</td>
+ <td><span className="badge badge-neutral">{run.meta.model}</span></td>
+ <td>{run.meta.effort}</td>
+ <td>{run.meta.prompt_style}</td>
+ <td>{run.meta.language}</td>
+ <td className={`score-cell ${scoreClass(run.eval_results?.score)}`}>
+ {formatScore(run.eval_results?.score)}
+ </td>
+ <td>
+ {run.eval_results?.functional?.pass === true ? (
+ <span className="badge badge-pass">PASS</span>
+ ) : run.eval_results?.functional?.pass === false ? (
+ <span className="badge badge-fail">FAIL</span>
+ ) : (
+ <span className="badge badge-neutral">-</span>
+ )}
+ </td>
+ <td>{formatCost(run.claude_output?.total_cost_usd)}</td>
+ <td>{formatTime(run.meta.wall_time_seconds)}</td>
+ <td>{run.claude_output?.num_turns ?? "-"}</td>
+ </tr>
+ ))
+ )
)}
</tbody>
</table>
<div style={{ padding: "12px", color: "var(--text-muted)", fontSize: "0.75rem" }}>
- Showing {filteredRuns.length} of {runs.length} runs
+ {grouped
+ ? `${cellGroups.length} configs (${filteredRuns.length} runs)`
+ : `${filteredRuns.length} of ${runs.length} runs`}
</div>
</div>
</div>