Cell-based analytics across all dashboard views - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 5e358b275032b8351588c74c53ef7c5853c1b8b4
parent 42135ccf8f0b74d916836155da84957a9875e4f3
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 07:50:46 +0200

Cell-based analytics across all dashboard views

Everything now aggregates by cell (config), not by individual run.
Runs within a cell show variance/reliability of that config.

Analysis lib:
- Cell interface with avg/min/max for all metrics
- groupIntoCells() aggregation function
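- computeMainEffects now uses cell averages and tracks "variance" per axis
  value (the mean within-cell max-min range, not a statistical variance)
- computeInteraction includes the same range-based variance per cell combo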

Bar charts:
- Error bars showing min-max range of cell scores per model/task
- Labels show cell count (e.g., "haiku (n=19 cells)")

Tornado chart:
- Shaded variance bands behind effect bars
- Shows +/-variance percentage alongside effect
- "N cells" instead of "n=N"

Compare page:
- Cell count and run count columns
- Score and cost ranges (min-max across cells)
- Cell-first aggregation prevents configs with more repeats from dominating

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx  | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M dashboard/src/components/TornadoChart.tsx  | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M dashboard/src/lib/analysis.ts  | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M dashboard/src/pages/compare.astro  | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------

4 files changed, 443 insertions(+), 105 deletions(-)
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -8,6 +8,7 @@ import {
   ResponsiveContainer,
   Legend,
   Cell,
+  ErrorBar,
 } from "recharts";
 import type { Run } from "../lib/types";
 
@@ -18,8 +19,24 @@ interface ChartsProps {
 interface ModelScore {
   model: string;
   avg_score: number;
+  min_score: number;
+  max_score: number;
+  errorRange: [number, number];
   avg_cost: number;
-  count: number;
+  cellCount: number;
+}
+
+interface TaskScore {
+  task: string;
+  avg_score: number;
+  min_score: number;
+  max_score: number;
+  scoreErrorRange: [number, number];
+  pass_rate: number;
+  min_pass_rate: number;
+  max_pass_rate: number;
+  passRateErrorRange: [number, number];
+  cellCount: number;
 }
 
 const SMUI = {
@@ -53,70 +70,139 @@ const TOOLTIP_STYLE = {
   padding: "8px 12px",
 };
 
-function aggregateByModel(runs: Run[]): ModelScore[] {
-  const byModel: Record<string, { scores: number[]; costs: number[] }> = {};
+interface CellAggregate {
+  cell_id: string;
+  model: string;
+  task: string;
+  avgScore: number;
+  avgCost: number;
+  passRate: number;
+  runCount: number;
+}
+
+function aggregateCells(runs: Run[]): CellAggregate[] {
+  const byCell: Record<string, {
+    model: string;
+    task: string;
+    scores: number[];
+    costs: number[];
+    passes: number;
+    total: number;
+  }> = {};
 
   for (const run of runs) {
-    const model = run.meta.model;
-    if (!byModel[model]) byModel[model] = { scores: [], costs: [] };
+    const cellId = run.meta.cell_id;
+    if (!byCell[cellId]) {
+      byCell[cellId] = {
+        model: run.meta.model,
+        task: run.meta.task,
+        scores: [],
+        costs: [],
+        passes: 0,
+        total: 0,
+      };
+    }
 
+    byCell[cellId].total++;
     if (run.eval_results?.score != null) {
-      byModel[model].scores.push(run.eval_results.score);
+      byCell[cellId].scores.push(run.eval_results.score);
     }
     if (run.claude_output?.total_cost_usd != null) {
-      byModel[model].costs.push(run.claude_output.total_cost_usd);
+      byCell[cellId].costs.push(run.claude_output.total_cost_usd);
+    }
+    if (run.eval_results?.functional?.pass) {
+      byCell[cellId].passes++;
     }
   }
 
-  return Object.entries(byModel).map(([model, data]) => ({
-    model: `${model} (n=${data.scores.length})`,
-    avg_score: data.scores.length > 0
-      ? Math.round(
-          (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
-        )
+  return Object.entries(byCell).map(([cell_id, data]) => ({
+    cell_id,
+    model: data.model,
+    task: data.task,
+    avgScore: data.scores.length > 0
+      ? data.scores.reduce((a, b) => a + b, 0) / data.scores.length
       : 0,
-    avg_cost: data.costs.length > 0
-      ? Math.round(
-          (data.costs.reduce((a, b) => a + b, 0) / data.costs.length) * 100
-        ) / 100
+    avgCost: data.costs.length > 0
+      ? data.costs.reduce((a, b) => a + b, 0) / data.costs.length
       : 0,
-    count: data.scores.length,
+    passRate: data.total > 0
+      ? data.passes / data.total
+      : 0,
+    runCount: data.total,
   }));
 }
 
-interface TaskScore {
-  task: string;
-  avg_score: number;
-  pass_rate: number;
+function aggregateByModel(runs: Run[]): ModelScore[] {
+  const cells = aggregateCells(runs);
+  const byModel: Record<string, CellAggregate[]> = {};
+
+  for (const cell of cells) {
+    if (!byModel[cell.model]) byModel[cell.model] = [];
+    byModel[cell.model].push(cell);
+  }
+
+  return Object.entries(byModel).map(([model, modelCells]) => {
+    const scores = modelCells.map((c) => Math.round(c.avgScore * 100));
+    const costs = modelCells.map((c) => c.avgCost);
+    const avgScore = scores.length > 0
+      ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length)
+      : 0;
+    const minScore = scores.length > 0 ? Math.min(...scores) : 0;
+    const maxScore = scores.length > 0 ? Math.max(...scores) : 0;
+    const avgCost = costs.length > 0
+      ? Math.round((costs.reduce((a, b) => a + b, 0) / costs.length) * 100) / 100
+      : 0;
+
+    return {
+      model: `${model} (n=${modelCells.length} cells)`,
+      avg_score: avgScore,
+      min_score: minScore,
+      max_score: maxScore,
+      errorRange: [avgScore - minScore, maxScore - avgScore] as [number, number],
+      avg_cost: avgCost,
+      cellCount: modelCells.length,
+    };
+  });
 }
 
 function aggregateByTask(runs: Run[]): TaskScore[] {
-  const byTask: Record<string, { scores: number[]; passes: number; total: number }> = {};
+  const cells = aggregateCells(runs);
+  const byTask: Record<string, CellAggregate[]> = {};
 
-  for (const run of runs) {
-    const task = run.meta.task;
-    if (!byTask[task]) byTask[task] = { scores: [], passes: 0, total: 0 };
-
-    byTask[task].total++;
-    if (run.eval_results?.score != null) {
-      byTask[task].scores.push(run.eval_results.score);
-    }
-    if (run.eval_results?.functional?.pass) {
-      byTask[task].passes++;
-    }
+  for (const cell of cells) {
+    if (!byTask[cell.task]) byTask[cell.task] = [];
+    byTask[cell.task].push(cell);
   }
 
-  return Object.entries(byTask).map(([task, data]) => ({
-    task: `${task} (n=${data.total})`,
-    avg_score: data.scores.length > 0
-      ? Math.round(
-          (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
-        )
-      : 0,
-    pass_rate: data.total > 0
-      ? Math.round((data.passes / data.total) * 100)
-      : 0,
-  }));
+  return Object.entries(byTask).map(([task, taskCells]) => {
+    const scores = taskCells.map((c) => Math.round(c.avgScore * 100));
+    const passRates = taskCells.map((c) => Math.round(c.passRate * 100));
+
+    const avgScore = scores.length > 0
+      ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length)
+      : 0;
+    const minScore = scores.length > 0 ? Math.min(...scores) : 0;
+    const maxScore = scores.length > 0 ? Math.max(...scores) : 0;
+
+    const avgPassRate = passRates.length > 0
+      ? Math.round(passRates.reduce((a, b) => a + b, 0) / passRates.length)
+      : 0;
+    const minPassRate = passRates.length > 0 ? Math.min(...passRates) : 0;
+    const maxPassRate = passRates.length > 0 ? Math.max(...passRates) : 0;
+
+    return {
+      task: `${task} (n=${taskCells.length} cells)`,
+      avg_score: avgScore,
+      min_score: minScore,
+      max_score: maxScore,
+      scoreErrorRange: [avgScore - minScore, maxScore - avgScore] as [number, number],
+      pass_rate: avgPassRate,
+      min_pass_rate: minPassRate,
+      max_pass_rate: maxPassRate,
+      passRateErrorRange: [avgPassRate - minPassRate, maxPassRate - avgPassRate] as [number, number],
+      cellCount: taskCells.length,
+    };
+  });
 }
 
 export default function Charts({ runs }: ChartsProps) {
@@ -154,8 +240,17 @@ export default function Charts({ runs }: ChartsProps) {
               tickLine={false}
               axisLine={false}
             />
-            <Tooltip contentStyle={TOOLTIP_STYLE} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} />
+            <Tooltip
+              contentStyle={TOOLTIP_STYLE}
+              cursor={{ fill: "hsl(217 17% 28% / 0.3)" }}
+              formatter={(value: number, name: string) => {
+                if (name === "Avg Score %") return [`${value}%`, name];
+                return [value, name];
+              }}
+              labelFormatter={(label: string) => label}
+            />
             <Bar dataKey="avg_score" name="Avg Score %" radius={0}>
+              <ErrorBar dataKey="errorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} />
               {modelData.map((entry) => {
                 const baseModel = entry.model.split(" ")[0];
                 return <Cell key={entry.model} fill={MODEL_COLORS[baseModel] || SMUI.frost2} />;
@@ -186,7 +281,14 @@ export default function Charts({ runs }: ChartsProps) {
               tickLine={false}
               axisLine={false}
             />
-            <Tooltip contentStyle={TOOLTIP_STYLE} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} />
+            <Tooltip
+              contentStyle={TOOLTIP_STYLE}
+              cursor={{ fill: "hsl(217 17% 28% / 0.3)" }}
+              formatter={(value: number, name: string) => {
+                return [`${value}%`, name];
+              }}
+              labelFormatter={(label: string) => label}
+            />
             <Legend
               wrapperStyle={{
                 fontFamily: "'JetBrains Mono', monospace",
@@ -195,8 +297,12 @@ export default function Charts({ runs }: ChartsProps) {
                 letterSpacing: "0.5px",
               }}
             />
-            <Bar dataKey="avg_score" fill={SMUI.frost2} name="Avg Score %" radius={0} />
-            <Bar dataKey="pass_rate" fill={SMUI.green} name="Pass Rate %" radius={0} />
+            <Bar dataKey="avg_score" fill={SMUI.frost2} name="Avg Score %" radius={0}>
+              <ErrorBar dataKey="scoreErrorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} />
+            </Bar>
+            <Bar dataKey="pass_rate" fill={SMUI.green} name="Pass Rate %" radius={0}>
+              <ErrorBar dataKey="passRateErrorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} />
+            </Bar>
           </BarChart>
         </ResponsiveContainer>
       </div>
diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx
@@ -41,8 +41,13 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
     );
   }
 
-  const maxSpread = Math.max(...effects.map((e) => e.spread));
-  const scale = maxSpread > 0 ? 200 / maxSpread : 1; // max bar width = 200px
+  // Scale must account for variance bands extending beyond effect bars
+  const maxExtent = Math.max(
+    ...effects.flatMap((e) =>
+      e.values.map((v) => Math.abs(v.effect) + v.variance)
+    )
+  );
+  const scale = maxExtent > 0 ? 200 / maxExtent : 1;
 
   return (
     <div className="card">
@@ -54,7 +59,8 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
           marginBottom: "16px",
         }}
       >
-        Sorted by effect size. Wider bars = bigger impact on outcomes.
+        Sorted by effect size. Solid bars show effect (deviation from grand
+        mean). Shaded bands show within-cell variance.
       </p>
 
       {effects.map((effect) => (
@@ -89,7 +95,9 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
             }}
           >
             {effect.values.map((entry) => {
-              const width = Math.abs(entry.effect) * scale;
+              const effectWidth = Math.abs(entry.effect) * scale;
+              const varianceBandWidth =
+                (Math.abs(entry.effect) + entry.variance) * scale;
               const isPositive = entry.effect >= 0;
               return (
                 <div
@@ -114,34 +122,73 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
                   </div>
                   <div
                     style={{
+                      position: "relative",
                       height: "16px",
-                      width: `${Math.max(width, 2)}px`,
-                      background: isPositive
-                        ? "var(--green)"
-                        : "var(--red)",
-                      borderRadius: "2px",
-                      opacity: 0.8,
+                      width: `${Math.max(varianceBandWidth, effectWidth, 2)}px`,
                     }}
-                  />
+                  >
+                    {/* Variance band (behind, wider, semi-transparent) */}
+                    {entry.variance > 0 && (
+                      <div
+                        style={{
+                          position: "absolute",
+                          top: "1px",
+                          left: 0,
+                          height: "14px",
+                          width: `${Math.max(varianceBandWidth, 2)}px`,
+                          background: isPositive
+                            ? "var(--green)"
+                            : "var(--red)",
+                          opacity: 0.15,
+                          borderRadius: "2px",
+                        }}
+                      />
+                    )}
+                    {/* Effect bar (foreground, solid) */}
+                    <div
+                      style={{
+                        position: "absolute",
+                        top: 0,
+                        left: 0,
+                        height: "16px",
+                        width: `${Math.max(effectWidth, 2)}px`,
+                        background: isPositive
+                          ? "var(--green)"
+                          : "var(--red)",
+                        borderRadius: "2px",
+                        opacity: 0.8,
+                      }}
+                    />
+                  </div>
                   <div
                     style={{
                       fontSize: "0.7rem",
                       fontFamily: "var(--font-mono)",
-                      color: isPositive
-                        ? "var(--green)"
-                        : "var(--red)",
+                      color: isPositive ? "var(--green)" : "var(--red)",
+                      whiteSpace: "nowrap",
                     }}
                   >
                     {entry.effect >= 0 ? "+" : ""}
                     {(entry.effect * 100).toFixed(1)}%
+                    {entry.variance > 0 && (
+                      <span
+                        style={{
+                          color: "var(--text-muted)",
+                          marginLeft: "4px",
+                        }}
+                      >
+                        ±{(entry.variance * 100).toFixed(1)}%
+                      </span>
+                    )}
                   </div>
                   <div
                     style={{
                       fontSize: "0.65rem",
                       color: "var(--text-muted)",
+                      whiteSpace: "nowrap",
                     }}
                   >
-                    (n={entry.n})
+                    {entry.n} cells
                   </div>
                 </div>
               );
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -1,10 +1,24 @@
 import type { Run, AxisName } from "./types";
 import { AXIS_NAMES } from "./types";
 
+export interface Cell {
+  cell_id: string;
+  runs: Run[];
+  meta: Run["meta"]; // from first run
+  n: number;
+  score: { avg: number; min: number; max: number; range: number };
+  cost: { avg: number; min: number; max: number };
+  turns: { avg: number; min: number; max: number };
+  wall_time: { avg: number; min: number; max: number };
+  gameplay: { avg: number; min: number; max: number };
+  code_quality: { avg: number; min: number; max: number };
+}
+
 export interface EffectEntry {
   value: string;
   mean: number;
   effect: number;
+  variance: number;
   n: number;
 }
 
@@ -16,6 +30,7 @@ export interface AxisEffect {
 
 export interface InteractionCell {
   mean: number;
+  variance: number;
   n: number;
 }
 
@@ -55,6 +70,56 @@ const METRICS: Record<string, MetricExtractor> = {
   transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
 };
 
+function agg(values: number[]): { avg: number; min: number; max: number } {
+  if (values.length === 0) return { avg: 0, min: 0, max: 0 };
+  const sum = values.reduce((a, b) => a + b, 0);
+  return {
+    avg: sum / values.length,
+    min: Math.min(...values),
+    max: Math.max(...values),
+  };
+}
+
+export function groupIntoCells(runs: Run[]): Cell[] {
+  const byCell = new Map<string, Run[]>();
+  for (const run of runs) {
+    const id = run.meta.cell_id;
+    const list = byCell.get(id);
+    if (list) list.push(run);
+    else byCell.set(id, [run]);
+  }
+
+  const cells: Cell[] = [];
+  for (const [cell_id, cellRuns] of byCell) {
+    const extractVals = (extractor: MetricExtractor): number[] => {
+      const vals: number[] = [];
+      for (const r of cellRuns) {
+        const v = extractor(r);
+        if (v !== null) vals.push(v);
+      }
+      return vals;
+    };
+
+    const scoreVals = extractVals(METRICS.score);
+    const scoreAgg = agg(scoreVals);
+
+    cells.push({
+      cell_id,
+      runs: cellRuns,
+      meta: cellRuns[0].meta,
+      n: cellRuns.length,
+      score: { ...scoreAgg, range: scoreAgg.max - scoreAgg.min },
+      cost: agg(extractVals(METRICS.cost)),
+      turns: agg(extractVals(METRICS.turns)),
+      wall_time: agg(extractVals(METRICS.wall_time)),
+      gameplay: agg(extractVals(METRICS.gameplay)),
+      code_quality: agg(extractVals(METRICS.code_quality)),
+    });
+  }
+
+  return cells;
+}
+
 export function computeMainEffects(
   runs: Run[],
   metric: string = "score"
@@ -62,14 +127,24 @@ export function computeMainEffects(
   const extract = METRICS[metric];
   if (!extract) return [];
 
-  const scored: Array<{ meta: Run["meta"]; value: number }> = [];
-  for (const run of runs) {
-    const val = extract(run);
-    if (val !== null) scored.push({ meta: run.meta, value: val });
+  const cells = groupIntoCells(runs);
+
+  // Compute per-cell metric averages and ranges
+  const scored: Array<{ meta: Run["meta"]; avg: number; range: number }> = [];
+  for (const cell of cells) {
+    const vals: number[] = [];
+    for (const run of cell.runs) {
+      const v = extract(run);
+      if (v !== null) vals.push(v);
+    }
+    if (vals.length === 0) continue;
+    const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
+    const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0;
+    scored.push({ meta: cell.meta, avg: cellAvg, range: cellRange });
   }
   if (scored.length === 0) return [];
 
-  const grandMean = scored.reduce((s, r) => s + r.value, 0) / scored.length;
+  const grandMean = scored.reduce((s, c) => s + c.avg, 0) / scored.length;
 
   // Find axis keys from meta
   const axisKeys = Object.keys(scored[0].meta).filter(
@@ -79,22 +154,26 @@ export function computeMainEffects(
   const effects: AxisEffect[] = [];
 
   for (const axis of axisKeys) {
-    const groups: Record<string, number[]> = {};
-    for (const { meta, value } of scored) {
+    const groups: Record<string, { avgs: number[]; ranges: number[] }> = {};
+    for (const { meta, avg, range } of scored) {
       const key = String((meta as Record<string, unknown>)[axis] ?? "unknown");
-      (groups[key] ??= []).push(value);
+      const g = groups[key] ??= { avgs: [], ranges: [] };
+      g.avgs.push(avg);
+      g.ranges.push(range);
     }
 
     if (Object.keys(groups).length < 2) continue;
 
     const values: EffectEntry[] = [];
-    for (const [val, vals] of Object.entries(groups)) {
-      const mean = vals.reduce((a, b) => a + b, 0) / vals.length;
+    for (const [val, { avgs, ranges }] of Object.entries(groups)) {
+      const mean = avgs.reduce((a, b) => a + b, 0) / avgs.length;
+      const variance = ranges.reduce((a, b) => a + b, 0) / ranges.length;
       values.push({
         value: val,
         mean: Math.round(mean * 10000) / 10000,
         effect: Math.round((mean - grandMean) * 10000) / 10000,
-        n: vals.length,
+        variance: Math.round(variance * 10000) / 10000,
+        n: avgs.length,
       });
     }
 
@@ -121,31 +200,49 @@ export function computeInteraction(
   if (!extract)
     return { axisA, axisB, table: {}, maxInteraction: 0 };
 
-  const groups: Record<string, Record<string, number[]>> = {};
+  const cells = groupIntoCells(runs);
 
-  for (const run of runs) {
-    const val = extract(run);
-    if (val === null) continue;
-    const a = String((run.meta as Record<string, unknown>)[axisA] ?? "?");
-    const b = String((run.meta as Record<string, unknown>)[axisB] ?? "?");
-    ((groups[a] ??= {})[b] ??= []).push(val);
+  // Group cells by (axisA, axisB) combination
+  const groups: Record<string, Record<string, { avgs: number[]; ranges: number[] }>> = {};
+
+  for (const cell of cells) {
+    const vals: number[] = [];
+    for (const run of cell.runs) {
+      const v = extract(run);
+      if (v !== null) vals.push(v);
+    }
+    if (vals.length === 0) continue;
+
+    const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
+    const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0;
+
+    const a = String((cell.meta as Record<string, unknown>)[axisA] ?? "?");
+    const b = String((cell.meta as Record<string, unknown>)[axisB] ?? "?");
+    const g = ((groups[a] ??= {})[b] ??= { avgs: [], ranges: [] });
+    g.avgs.push(cellAvg);
+    g.ranges.push(cellRange);
   }
 
   const table: Record<string, Record<string, InteractionCell>> = {};
-  const allVals: number[] = [];
+  const allMeans: number[] = [];
 
   for (const [a, bGroups] of Object.entries(groups)) {
     table[a] = {};
-    for (const [b, vals] of Object.entries(bGroups)) {
-      const mean = vals.reduce((s, v) => s + v, 0) / vals.length;
-      table[a][b] = { mean: Math.round(mean * 10000) / 10000, n: vals.length };
-      allVals.push(mean);
+    for (const [b, { avgs, ranges }] of Object.entries(bGroups)) {
+      const mean = avgs.reduce((s, v) => s + v, 0) / avgs.length;
+      const variance = ranges.reduce((s, v) => s + v, 0) / ranges.length;
+      table[a][b] = {
+        mean: Math.round(mean * 10000) / 10000,
+        variance: Math.round(variance * 10000) / 10000,
+        n: avgs.length,
+      };
+      allMeans.push(mean);
     }
   }
 
   const grandMean =
-    allVals.length > 0
-      ? allVals.reduce((a, b) => a + b, 0) / allVals.length
+    allMeans.length > 0
+      ? allMeans.reduce((a, b) => a + b, 0) / allMeans.length
       : 0;
 
   // Row and column means
diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro
@@ -1,23 +1,80 @@
 ---
 import Base from "../layouts/Base.astro";
-import { loadAllRuns, getAxisValues, getTaskNames, aggregateRuns, AXIS_NAMES } from "../lib/data";
+import { loadAllRuns, getAxisValues, getTaskNames, AXIS_NAMES } from "../lib/data";
 import type { Run, AxisName } from "../lib/data";
 
 const runs = loadAllRuns();
 const axisValues = getAxisValues(runs);
 const tasks = getTaskNames(runs);
 
-// Build comparison data: for each axis, show how different values perform
+// Build comparison data using cell-based aggregation.
+// A "cell" is a unique configuration (cell_id). Multiple runs share a cell_id
+// when they are repeat trials of the same config. Averaging per-cell first,
+// then aggregating across cells, prevents configs with more repeats from
+// dominating the average.
+
 interface ComparisonRow {
   axis: string;
   value: string;
-  count: number;
+  cells: number;   // number of unique configs
+  runs: number;    // total runs
   avg_score: string;
-  pass_rate: string;
+  score_range: string;  // "68%-80%"
   avg_cost: string;
+  cost_range: string;   // "$0.15-$0.22"
   avg_time: string;
 }
 
+interface CellStats {
+  avg_score: number | null;
+  avg_cost: number | null;
+  avg_time: number | null;
+  run_count: number;
+}
+
+/** Compute per-cell averages from a list of runs. */
+function getCellStats(runs: Run[]): Map<string, CellStats> {
+  const cells = new Map<string, Run[]>();
+  for (const run of runs) {
+    const id = run.meta.cell_id;
+    if (!cells.has(id)) cells.set(id, []);
+    cells.get(id)!.push(run);
+  }
+
+  const result = new Map<string, CellStats>();
+  for (const [cellId, cellRuns] of cells) {
+    const scores = cellRuns
+      .map((r) => r.eval_results?.score)
+      .filter((s): s is number => s != null);
+    const costs = cellRuns
+      .map((r) => r.claude_output?.total_cost_usd)
+      .filter((c): c is number => c != null);
+    const times = cellRuns
+      .map((r) => r.meta.wall_time_seconds)
+      .filter((t): t is number => t != null);
+
+    const avg = (arr: number[]) =>
+      arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
+
+    result.set(cellId, {
+      avg_score: avg(scores),
+      avg_cost: avg(costs),
+      avg_time: avg(times),
+      run_count: cellRuns.length,
+    });
+  }
+  return result;
+}
+
+function formatRange(values: number[], formatter: (v: number) => string): string {
+  if (values.length === 0) return "-";
+  if (values.length === 1) return formatter(values[0]);
+  const min = Math.min(...values);
+  const max = Math.max(...values);
+  if (min === max) return formatter(min);
+  return formatter(min) + "-" + formatter(max);
+}
+
 const comparisons: ComparisonRow[] = [];
 
 const AXIS_LABELS: Record<AxisName, string> = {
@@ -39,23 +96,50 @@ const AXIS_LABELS: Record<AxisName, string> = {
   max_budget: "Budget",
 };
 
+// Pre-compute all cell stats once
+const allCellStats = getCellStats(runs);
+
 for (const axis of AXIS_NAMES) {
   for (const value of axisValues[axis]) {
     const filtered = runs.filter(
       (r: Run) => String(r.meta[axis as keyof typeof r.meta]) === value
     );
-    const stats = aggregateRuns(filtered);
+
+    // Find the unique cell_ids in these runs and gather their stats
+    const cellIds = new Set(filtered.map((r) => r.meta.cell_id));
+    const matchingCells: CellStats[] = [];
+    for (const id of cellIds) {
+      const cs = allCellStats.get(id);
+      if (cs) matchingCells.push(cs);
+    }
+
+    const cellScores = matchingCells
+      .map((c) => c.avg_score)
+      .filter((s): s is number => s != null);
+    const cellCosts = matchingCells
+      .map((c) => c.avg_cost)
+      .filter((c): c is number => c != null);
+    const cellTimes = matchingCells
+      .map((c) => c.avg_time)
+      .filter((t): t is number => t != null);
+
+    const avg = (arr: number[]) =>
+      arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
+
+    const avgScore = avg(cellScores);
+    const avgCost = avg(cellCosts);
+    const avgTime = avg(cellTimes);
+
     comparisons.push({
       axis: AXIS_LABELS[axis],
       value,
-      count: stats.count,
-      avg_score:
-        stats.avg_score != null ? (stats.avg_score * 100).toFixed(0) + "%" : "-",
-      pass_rate:
-        stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-",
-      avg_cost: stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-",
-      avg_time:
-        stats.avg_wall_time != null ? Math.round(stats.avg_wall_time) + "s" : "-",
+      cells: cellIds.size,
+      runs: filtered.length,
+      avg_score: avgScore != null ? (avgScore * 100).toFixed(0) + "%" : "-",
+      score_range: formatRange(cellScores, (v) => (v * 100).toFixed(0) + "%"),
+      avg_cost: avgCost != null ? "$" + avgCost.toFixed(2) : "-",
+      cost_range: formatRange(cellCosts, (v) => "$" + v.toFixed(2)),
+      avg_time: avgTime != null ? Math.round(avgTime) + "s" : "-",
     });
   }
 }
@@ -78,10 +162,12 @@ for (const axis of AXIS_NAMES) {
           <tr>
             <th>Axis</th>
             <th>Value</th>
+            <th>Cells</th>
             <th>Runs</th>
             <th>Avg Score</th>
-            <th>Pass Rate</th>
+            <th>Score Range</th>
             <th>Avg Cost</th>
+            <th>Cost Range</th>
             <th>Avg Time</th>
           </tr>
         </thead>
@@ -92,10 +178,12 @@ for (const axis of AXIS_NAMES) {
               <td>
                 <span class="badge badge-neutral">{row.value}</span>
               </td>
-              <td>{row.count}</td>
+              <td>{row.cells}</td>
+              <td>{row.runs}</td>
               <td class="score-cell">{row.avg_score}</td>
-              <td class="score-cell">{row.pass_rate}</td>
+              <td style="color: var(--text-muted); font-size: 0.85rem;">{row.score_range}</td>
               <td>{row.avg_cost}</td>
+              <td style="color: var(--text-muted); font-size: 0.85rem;">{row.cost_range}</td>
               <td>{row.avg_time}</td>
             </tr>
           ))}

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/components/Charts.tsx	\|	204	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M	dashboard/src/components/TornadoChart.tsx	\|	77	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M	dashboard/src/lib/analysis.ts	\|	147	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	dashboard/src/pages/compare.astro	\|	120	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------