commit f8ffb79af032ef659f675c9670f6d83092244df1
parent b1e91277b76e9c69c234d533f9f088d5b4e84d57
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 08:47:26 +0200
Grid: per-task summary with cells/runs/score/cost. Cell: variance stats. Box plots: model order fix.
Grid page:
- Replaced generic stat cards with per-task summary cards
- Shows: task name, cell count, run count, avg score, pass rate, avg cost
- Cell-based aggregation
Cell detail page:
- Added variance statistics: score range, spread and std dev (both in percentage points, pp), median,
  cost range, turns range
- Variation label shortened from "% range/avg" to "% CV" (the value is still range / average score)
Variability box plots:
- Model order: explicit rank haiku, opus, sonnet, with unlisted models last (was a plain alphabetical sort)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
3 files changed, 122 insertions(+), 32 deletions(-)
diff --git a/dashboard/src/components/CellDetail.tsx b/dashboard/src/components/CellDetail.tsx
@@ -178,6 +178,11 @@ export default function CellDetail({ runs, axisValues }: CellDetailProps) {
const range = maxScore - minScore;
const coefficientOfVariation = avgScore > 0 ? (range / avgScore) * 100 : 0;
+  // Sample standard deviation (n - 1 denominator) of `scores`; 0 when there are fewer than two values.
+  const stdDev = scores.length > 1
+    ? Math.sqrt(scores.reduce((sum, s) => sum + (s - avgScore) ** 2, 0) / (scores.length - 1))
+    : 0;
+
let consistencyLabel: string;
let consistencyColor: string;
if (coefficientOfVariation < 10) {
@@ -338,9 +343,35 @@ export default function CellDetail({ runs, axisValues }: CellDetailProps) {
{consistencyLabel}
</span>
<span style={{ fontSize: "0.65rem", color: "var(--text-muted)" }}>
- {coefficientOfVariation.toFixed(0)}% range/avg
+ {coefficientOfVariation.toFixed(0)}% CV
</span>
</div>
+ <div style={{ marginTop: "8px", fontSize: "0.65rem", display: "flex", flexDirection: "column", gap: "2px" }}>
+ <div style={{ display: "flex", justifyContent: "space-between" }}>
+ <span style={{ color: "var(--text-muted)" }}>Range</span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{formatPct(minScore)} - {formatPct(maxScore)}</span>
+ </div>
+ <div style={{ display: "flex", justifyContent: "space-between" }}>
+ <span style={{ color: "var(--text-muted)" }}>Spread</span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{(range * 100).toFixed(1)}pp</span>
+ </div>
+ <div style={{ display: "flex", justifyContent: "space-between" }}>
+ <span style={{ color: "var(--text-muted)" }}>Std Dev</span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{(stdDev * 100).toFixed(1)}pp</span>
+ </div>
+ <div style={{ display: "flex", justifyContent: "space-between" }}>
+ <span style={{ color: "var(--text-muted)" }}>Median</span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{formatPct(medianScore)}</span>
+ </div>
+ <div style={{ display: "flex", justifyContent: "space-between" }}>
+ <span style={{ color: "var(--text-muted)" }}>Cost range</span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{formatCost(costs.length > 0 ? Math.min(...costs) : null)} - {formatCost(costs.length > 0 ? Math.max(...costs) : null)}</span>
+ </div>
+ <div style={{ display: "flex", justifyContent: "space-between" }}>
+ <span style={{ color: "var(--text-muted)" }}>Turns range</span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{turnsList.length > 0 ? Math.min(...turnsList) : "-"} - {turnsList.length > 0 ? Math.max(...turnsList) : "-"}</span>
+ </div>
+ </div>
</div>
</div>
diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx
@@ -59,7 +59,9 @@ interface BoxPlotStats {
}
function computeBoxPlots(cells: Cell[]): BoxPlotStats[] {
- const models = Array.from(new Set(cells.map((c) => c.meta.model))).sort();
+  const MODEL_ORDER: Record<string, number> = { haiku: 1, opus: 2, sonnet: 3 }; // display rank; unlisted models rank 99 (last)
+  const models = Array.from(new Set(cells.map((c) => c.meta.model)))
+    .sort((a, b) => (MODEL_ORDER[a] ?? 99) - (MODEL_ORDER[b] ?? 99) || a.localeCompare(b)); // equal rank: alphabetical, not input order
const results: BoxPlotStats[] = [];
for (const model of models) {
diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro
@@ -1,54 +1,111 @@
---
import Base from "../layouts/Base.astro";
-import { loadAllRuns, getAxisValues, getTaskNames, aggregateRuns } from "../lib/data";
+import { loadAllRuns, getAxisValues, getTaskNames } from "../lib/data";
+import type { Run } from "../lib/types";
import Grid from "../components/Grid";
import Charts from "../components/Charts";
const runs = loadAllRuns();
const axisValues = getAxisValues(runs);
const tasks = getTaskNames(runs);
-const stats = aggregateRuns(runs);
+
+// Per-task summary row for the grid page: score and cost are averaged over cells, pass rate is counted over runs.
+interface TaskSummary {
+ task: string; // task name, taken from run.meta.task
+ cells: number; // number of distinct cell_id values among the task's runs
+ runs: number; // number of runs recorded for the task
+ avg_score: number | null; // mean of the per-cell mean scores; null when no run has a score
+ pass_rate: number | null; // share of the task's runs with eval_results.functional.pass === true
+ avg_cost: number | null; // mean of the per-cell mean total_cost_usd; null when no run has a cost
+}
+
+/** Groups runs by task and summarises each task from per-cell averages (pass rate is counted per run). */
+function computeTaskSummaries(runs: Run[]): TaskSummary[] {
+  // Map rather than a plain object: task names are data and must not collide with Object.prototype keys.
+  const byTask = new Map<string, Run[]>();
+  for (const run of runs) {
+    const bucket = byTask.get(run.meta.task);
+    if (bucket) bucket.push(run);
+    else byTask.set(run.meta.task, [run]);
+  }
+  const avg = (arr: number[]) => (arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null);
+
+  return Array.from(byTask, ([task, taskRuns]) => {
+    // Group into cells
+    const cells = new Map<string, Run[]>();
+    for (const run of taskRuns) {
+      const group = cells.get(run.meta.cell_id);
+      if (group) group.push(run);
+      else cells.set(run.meta.cell_id, [run]);
+    }
+
+    // Per-cell means, so a cell with many runs weighs the same as a cell with few
+    const cellScores: number[] = [];
+    const cellCosts: number[] = [];
+    let totalPasses = 0;
+    for (const cellRuns of cells.values()) {
+      const scores = cellRuns.map(r => r.eval_results?.score).filter((s): s is number => s != null);
+      if (scores.length > 0) cellScores.push(scores.reduce((a, b) => a + b, 0) / scores.length);
+      const costs = cellRuns.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null);
+      if (costs.length > 0) cellCosts.push(costs.reduce((a, b) => a + b, 0) / costs.length);
+      totalPasses += cellRuns.filter(r => r.eval_results?.functional?.pass === true).length;
+    }
+
+    return {
+      task,
+      cells: cells.size,
+      runs: taskRuns.length,
+      avg_score: avg(cellScores),
+      pass_rate: taskRuns.length > 0 ? totalPasses / taskRuns.length : null,
+      avg_cost: avg(cellCosts),
+    };
+  });
+}
+
+// Per-task stats; the template below renders one summary card per entry.
+const taskSummaries = computeTaskSummaries(runs);
---
<Base title="Grid Overview">
<h1 style="margin-bottom: 8px;">Benchmark Results</h1>
- <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
+ <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
Comparing agentic coding loop configurations across tasks and setups.
</p>
- <div class="stats-grid" style="margin-bottom: 32px;">
- <div class="stat-card">
- <div class="stat-value">{runs.length}</div>
- <div class="stat-label">Total Runs</div>
- </div>
- <div class="stat-card">
- <div class="stat-value">{tasks.length}</div>
- <div class="stat-label">Tasks</div>
- </div>
- <div class="stat-card">
- <div class="stat-value">
- {stats.avg_score != null ? (stats.avg_score * 100).toFixed(0) + "%" : "-"}
- </div>
- <div class="stat-label">Avg Score</div>
- </div>
- <div class="stat-card">
- <div class="stat-value">
- {stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-"}
- </div>
- <div class="stat-label">Pass Rate</div>
- </div>
- <div class="stat-card">
- <div class="stat-value">
- {stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-"}
+ {taskSummaries.map((ts) => (
+ <div class="card" style="margin-bottom: 16px; padding: 16px;">
+ <div style="display: flex; align-items: center; gap: 24px; flex-wrap: wrap;">
+ <h3 style="margin: 0; min-width: 100px;">{ts.task}</h3>
+ <div style="display: flex; gap: 24px; font-size: 13px;">
+ <div>
+ <span style="color: var(--text-muted);">cells </span>
+ <span style="font-weight: 600;">{ts.cells}</span>
+ </div>
+ <div>
+ <span style="color: var(--text-muted);">runs </span>
+ <span style="font-weight: 600;">{ts.runs}</span>
+ </div>
+ <div>
+ <span style="color: var(--text-muted);">avg score </span>
+ <span style="font-weight: 600;">{ts.avg_score != null ? (ts.avg_score * 100).toFixed(0) + "%" : "-"}</span>
+ </div>
+ <div>
+ <span style="color: var(--text-muted);">pass rate </span>
+ <span style="font-weight: 600;">{ts.pass_rate != null ? (ts.pass_rate * 100).toFixed(0) + "%" : "-"}</span>
+ </div>
+ <div>
+ <span style="color: var(--text-muted);">avg cost </span>
+ <span style="font-weight: 600;">{ts.avg_cost != null ? "$" + ts.avg_cost.toFixed(2) : "-"}</span>
+ </div>
+ </div>
</div>
- <div class="stat-label">Avg Cost</div>
</div>
- </div>
+ ))}
<Charts client:load runs={runs} />
<div style="margin-top: 32px;">
- <h2 style="margin-bottom: 16px;">All Runs</h2>
+ <h2 style="margin-bottom: 16px;">All Cells</h2>
<Grid client:load runs={runs} axisValues={axisValues} tasks={tasks} />
</div>
</Base>