Convert all charts to cell-based: every visualization now shows cells not runs - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit f213f5a1831e271fcf572e9de9073fe5c85985ba
parent 364e1e4595a31324e0d96750c5bff342c7bbaf76
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 21:22:35 +0200

Convert all charts to cell-based: every visualization now shows cells not runs

ScatterPlot: dots are cells with error bar crosshairs showing run ranges
CorrelationMatrix: spreads computed from cell averages per axis value
HeatmapMatrix: values are averages of cell averages, labels show "N cells"
BumpChart: model rankings from cell averages per condition
RadarComparison: dimension scores from cell averages
ConfigTreemap: size = cell count, color = avg cell score
EfficiencyFrontier: simplified to use groupIntoCells(), labels updated

All charts now consistently represent cells (unique configs). Variance
from repeat runs is shown as ranges/error bars where the chart draws it
(in this commit: ScatterPlot crosshairs and tooltip ranges).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/BumpChart.tsx  | 55 ++++++++++++++++++++++++++-----------------------------
M dashboard/src/components/ConfigTreemap.tsx  | 26 +++++++++++++-------------
M dashboard/src/components/CorrelationMatrix.tsx  | 24 +++++++++++++++++-------
M dashboard/src/components/EfficiencyFrontier.tsx  | 80 +++++++++++++++++++++++++++----------------------------------------------------
M dashboard/src/components/HeatmapMatrix.tsx  | 21 +++++++++++++--------
M dashboard/src/components/RadarComparison.tsx  | 33 ++++++++++++++-------------------
M dashboard/src/components/ScatterPlot.tsx  | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------

7 files changed, 263 insertions(+), 169 deletions(-)
diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx
@@ -11,6 +11,7 @@ import {
 } from "recharts";
 import type { Run } from "../lib/types";
 import { AXIS_NAMES, type AxisName } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
 
 interface BumpChartProps {
   runs: Run[];
@@ -65,15 +66,19 @@ function computeRankings(
   runs: Run[],
   axis: AxisName
 ): { ranked: Record<string, RankedPoint[]>; crossings: CrossingPoint[] } {
+  // Group runs into cells, then work with cell averages
+  const cells = groupIntoCells(runs);
+  const scoredCells = cells.filter((c) => c.score.avg > 0);
+
   // Get unique condition values for the selected axis
   const conditionValues = Array.from(
-    new Set(runs.map((r) => String(r.meta[axis])))
+    new Set(scoredCells.map((c) => String(c.meta[axis])))
   ).sort();
 
   // Get unique models
-  const models = Array.from(new Set(runs.map((r) => r.meta.model))).sort();
+  const models = Array.from(new Set(scoredCells.map((c) => c.meta.model))).sort();
 
-  // For each condition value, compute average score per model, then rank
+  // For each condition value, compute average of cell averages per model, then rank
   const ranked: Record<string, RankedPoint[]> = {};
   for (const model of models) {
     ranked[model] = [];
@@ -85,27 +90,24 @@ function computeRankings(
 
   for (let ci = 0; ci < conditionValues.length; ci++) {
     const cv = conditionValues[ci];
-    const runsForCondition = runs.filter(
-      (r) => String(r.meta[axis]) === cv
+    const cellsForCondition = scoredCells.filter(
+      (c) => String(c.meta[axis]) === cv
     );
 
-    // Compute average score per model for this condition
+    // Compute average of cell averages per model for this condition
     const modelScores: Array<{
       model: string;
       avgScore: number;
       n: number;
     }> = [];
     for (const model of models) {
-      const modelRuns = runsForCondition.filter(
-        (r) => r.meta.model === model
+      const modelCells = cellsForCondition.filter(
+        (c) => c.meta.model === model
       );
-      const scores = modelRuns
-        .map((r) => r.eval_results?.score)
-        .filter((s): s is number => s !== null && s !== undefined);
 
-      if (scores.length > 0) {
-        const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
-        modelScores.push({ model, avgScore: avg, n: scores.length });
+      if (modelCells.length > 0) {
+        const avg = modelCells.reduce((s, c) => s + c.score.avg, 0) / modelCells.length;
+        modelScores.push({ model, avgScore: avg, n: modelCells.length });
       }
     }
 
@@ -280,8 +282,7 @@ function CustomTooltipContent({
             {entry.point!.model}
           </span>
           <span style={{ color: "var(--text-muted)", marginLeft: "auto" }}>
-            {(entry.point!.avgScore * 100).toFixed(1)}% (n=
-            {entry.point!.n})
+            {(entry.point!.avgScore * 100).toFixed(1)}% ({entry.point!.n} cells)
           </span>
         </div>
       ))}
@@ -292,18 +293,17 @@ function CustomTooltipContent({
 export default function BumpChart({ runs }: BumpChartProps) {
   // Pre-compute which axes are useful: need 2+ condition values AND 2+ models with scores
   const validAxes = useMemo(() => {
-    const scoredRuns = runs.filter(
-      (r) => r.eval_results?.score !== null && r.eval_results?.score !== undefined
-    );
+    const cells = groupIntoCells(runs);
+    const scoredCells = cells.filter((c) => c.score.avg > 0);
     return CONDITION_AXES.filter((axis) => {
       const conditionValues = Array.from(
-        new Set(scoredRuns.map((r) => String(r.meta[axis])))
+        new Set(scoredCells.map((c) => String(c.meta[axis])))
       );
       if (conditionValues.length < 2) return false;
       // Check that at least one condition value has 2+ models with scores
       for (const cv of conditionValues) {
         const modelsWithScores = new Set(
-          scoredRuns.filter((r) => String(r.meta[axis]) === cv).map((r) => r.meta.model)
+          scoredCells.filter((c) => String(c.meta[axis]) === cv).map((c) => c.meta.model)
         );
         if (modelsWithScores.size >= 2) return true;
       }
@@ -354,12 +354,9 @@ export default function BumpChart({ runs }: BumpChartProps) {
 
   const maxRank = models.length;
 
-  const scoredRuns = runs.filter(
-    (r) =>
-      r.eval_results?.score !== null && r.eval_results?.score !== undefined
-  );
+  const scoredCells = groupIntoCells(runs).filter((c) => c.score.avg > 0);
 
-  if (scoredRuns.length === 0) {
+  if (scoredCells.length === 0) {
     return (
       <div
         className="card"
@@ -369,7 +366,7 @@ export default function BumpChart({ runs }: BumpChartProps) {
           color: "var(--text-muted)",
         }}
       >
-        No scored runs available for ranking.
+        No scored cells available for ranking.
       </div>
     );
   }
@@ -387,7 +384,7 @@ export default function BumpChart({ runs }: BumpChartProps) {
           }}
         >
           Not enough data to compare models. Rankings need at least 2 condition
-          values where 2 or more models have scored runs.
+          values where 2 or more models have scored cells.
         </div>
       </div>
     );
@@ -414,7 +411,7 @@ export default function BumpChart({ runs }: BumpChartProps) {
               margin: "4px 0 0",
             }}
           >
-            Rank 1 = best average score. Crossings indicate rank swaps.
+            Rank 1 = best average cell score. Crossings indicate rank swaps.
           </p>
         </div>
         <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
diff --git a/dashboard/src/components/ConfigTreemap.tsx b/dashboard/src/components/ConfigTreemap.tsx
@@ -2,6 +2,7 @@ import React, { useState, useCallback } from "react";
 import { Treemap, ResponsiveContainer, Tooltip } from "recharts";
 import type { TreemapNode } from "recharts/types/chart/Treemap";
 import type { Run, AxisName } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
 
 interface ConfigTreemapProps {
   runs: Run[];
@@ -47,14 +48,15 @@ interface GroupData {
 }
 
 function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] {
-  const byModel: Record<string, Record<string, Run[]>> = {};
+  const cells = groupIntoCells(runs);
+  const byModel: Record<string, Record<string, Cell[]>> = {};
 
-  for (const run of runs) {
-    const model = run.meta.model;
-    const secondary = String(run.meta[secondaryAxis]);
+  for (const cell of cells) {
+    const model = cell.meta.model;
+    const secondary = String(cell.meta[secondaryAxis]);
     if (!byModel[model]) byModel[model] = {};
     if (!byModel[model][secondary]) byModel[model][secondary] = [];
-    byModel[model][secondary].push(run);
+    byModel[model][secondary].push(cell);
   }
 
   return Object.entries(byModel)
@@ -63,19 +65,17 @@ function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] {
       name: model,
       children: Object.entries(configs)
         .sort(([a], [b]) => a.localeCompare(b))
-        .map(([configValue, configRuns]) => {
-          const scores = configRuns
-            .map((r) => r.eval_results?.score)
-            .filter((s): s is number => s !== null && s !== undefined);
+        .map(([configValue, configCells]) => {
+          const scoredCells = configCells.filter((c) => c.score.avg > 0);
           const avgScore =
-            scores.length > 0
-              ? scores.reduce((a, b) => a + b, 0) / scores.length
+            scoredCells.length > 0
+              ? scoredCells.reduce((s, c) => s + c.score.avg, 0) / scoredCells.length
               : null;
 
           return {
             name: `${model} / ${configValue}`,
             displayName: `${model} / ${configValue}`,
-            size: configRuns.length,
+            size: configCells.length,
             avgScore,
             avgScorePct:
               avgScore !== null ? `${(avgScore * 100).toFixed(0)}%` : "--",
@@ -198,7 +198,7 @@ function CustomTooltip({
         Score: {node.avgScorePct}
       </div>
       <div style={{ color: "hsl(213 14% 65%)" }}>
-        Runs: {node.size}
+        Cells: {node.size}
       </div>
     </div>
   );
diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx
@@ -1,4 +1,5 @@
 import type { Run } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
 
 interface CorrelationMatrixProps {
   runs: Run[];
@@ -36,13 +37,19 @@ const OUTCOME_METRICS: Array<{ key: string; label: string; extract: MetricExtrac
   { key: "time", label: "Wall Time", extract: (r) => r.meta.wall_time_seconds ?? null },
 ];
 
-function computeSpread(runs: Run[], axisKey: string, extract: MetricExtractor): number | null {
+function computeSpread(cells: Cell[], axisKey: string, extract: MetricExtractor): number | null {
+  // Compute per-cell metric averages, then group by axis value
   const groups: Record<string, number[]> = {};
-  for (const run of runs) {
-    const val = extract(run);
-    if (val === null) continue;
-    const groupKey = String((run.meta as Record<string, unknown>)[axisKey] ?? "unknown");
-    (groups[groupKey] ??= []).push(val);
+  for (const cell of cells) {
+    const vals: number[] = [];
+    for (const run of cell.runs) {
+      const v = extract(run);
+      if (v !== null) vals.push(v);
+    }
+    if (vals.length === 0) continue;
+    const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
+    const groupKey = String((cell.meta as Record<string, unknown>)[axisKey] ?? "unknown");
+    (groups[groupKey] ??= []).push(cellAvg);
   }
 
   const keys = Object.keys(groups);
@@ -72,6 +79,9 @@ export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) {
     );
   }
 
+  // Group runs into cells once, then compute spreads from cell averages
+  const cells = groupIntoCells(runs);
+
   // Compute the full matrix: rows = config axes, columns = metrics
   const matrix: Array<{
     key: string;
@@ -82,7 +92,7 @@ export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) {
 
   for (const axis of CONFIG_AXES) {
     const spreads = OUTCOME_METRICS.map((metric) =>
-      computeSpread(runs, axis.key, metric.extract)
+      computeSpread(cells, axis.key, metric.extract)
     );
     const validSpreads = spreads.filter((s): s is number => s !== null);
     const maxSpread = validSpreads.length > 0 ? Math.max(...validSpreads) : 0;
diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx
@@ -9,6 +9,7 @@ import {
   ResponsiveContainer,
 } from "recharts";
 import type { Run } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
 
 interface EfficiencyFrontierProps {
   runs: Run[];
@@ -42,55 +43,28 @@ function getModelColor(model: string): string {
 }
 
 function aggregateByConfig(runs: Run[]): ConfigPoint[] {
-  const groups: Record<
-    string,
-    {
-      scores: number[];
-      costs: number[];
-      model: string;
-      config: Record<string, string>;
-    }
-  > = {};
-
-  for (const run of runs) {
-    const id = run.meta.cell_id;
-    if (!groups[id]) {
-      groups[id] = {
-        scores: [],
-        costs: [],
-        model: run.meta.model,
-        config: {
-          model: run.meta.model,
-          effort: run.meta.effort,
-          prompt_style: run.meta.prompt_style,
-          language: run.meta.language,
-          linter: run.meta.linter,
-          playwright: run.meta.playwright,
-          context_file: run.meta.context_file,
-          sub_agents: run.meta.sub_agents,
-          web_search: run.meta.web_search,
-          max_budget: run.meta.max_budget,
-        },
-      };
-    }
-
-    if (run.eval_results?.score != null) {
-      groups[id].scores.push(run.eval_results.score);
-    }
-    if (run.claude_output?.total_cost_usd != null) {
-      groups[id].costs.push(run.claude_output.total_cost_usd);
-    }
-  }
-
-  return Object.entries(groups)
-    .filter(([, g]) => g.scores.length > 0 && g.costs.length > 0)
-    .map(([cell_id, g]) => ({
-      cell_id,
-      model: g.model,
-      avgCost: g.costs.reduce((a, b) => a + b, 0) / g.costs.length,
-      avgScore: g.scores.reduce((a, b) => a + b, 0) / g.scores.length,
-      runCount: g.scores.length,
-      config: g.config,
+  const cells = groupIntoCells(runs);
+
+  return cells
+    .filter((c) => c.score.avg > 0 && c.cost.avg > 0)
+    .map((c) => ({
+      cell_id: c.cell_id,
+      model: c.meta.model,
+      avgCost: c.cost.avg,
+      avgScore: c.score.avg,
+      runCount: c.n,
+      config: {
+        model: c.meta.model,
+        effort: c.meta.effort,
+        prompt_style: c.meta.prompt_style,
+        language: c.meta.language,
+        linter: c.meta.linter,
+        playwright: c.meta.playwright,
+        context_file: c.meta.context_file,
+        sub_agents: c.meta.sub_agents,
+        web_search: c.meta.web_search,
+        max_budget: c.meta.max_budget,
+      },
       isFrontier: false,
       label: "",
     }));
@@ -203,7 +177,7 @@ function CustomTooltip({
         <span style={{ fontWeight: 600 }}>${point.avgCost.toFixed(2)}</span>
       </div>
       <div style={{ marginBottom: "8px" }}>
-        <span style={{ color: "var(--text-muted)" }}>runs: </span>
+        <span style={{ color: "var(--text-muted)" }}>runs in cell: </span>
         <span>{point.runCount}</span>
       </div>
       {point.isFrontier && (
@@ -334,8 +308,8 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
           marginBottom: "16px",
         }}
       >
-        Cost vs score per config. Pareto frontier highlights configs not
-        dominated on both axes.
+        Cost vs score per cell (averaged across runs). Pareto frontier
+        highlights cells not dominated on both axes.
       </p>
 
       {/* Legend */}
@@ -420,7 +394,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
 
           {/* Non-frontier points (dimmed) */}
           <Scatter
-            name="configs"
+            name="cells"
             data={nonFrontierPoints}
             shape={nonFrontierShape}
             isAnimationActive={false}
diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx
@@ -1,6 +1,7 @@
 import { useState, useMemo } from "react";
 import type { Run, AxisName } from "../lib/types";
 import { AXIS_NAMES } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
 
 interface HeatmapMatrixProps {
   runs: Run[];
@@ -61,16 +62,20 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
   const [colAxis, setColAxis] = useState<AxisName>("prompt_style");
 
   const { rowValues, colValues, cells } = useMemo(() => {
+    const analysisCells = groupIntoCells(runs);
     const cellMap: Record<string, Record<string, CellData>> = {};
     const rowSet = new Set<string>();
     const colSet = new Set<string>();
 
-    for (const run of runs) {
-      const score = run.eval_results?.score;
-      if (score === null || score === undefined) continue;
+    for (const cell of analysisCells) {
+      // Skip cells where no run has a score
+      const hasScore = cell.runs.some((r) => r.eval_results?.score != null);
+      if (!hasScore) continue;
+      // Use the cell's average score as a single data point
+      const cellAvg = cell.score.avg;
 
-      const rv = String(run.meta[rowAxis]);
-      const cv = String(run.meta[colAxis]);
+      const rv = String(cell.meta[rowAxis]);
+      const cv = String(cell.meta[colAxis]);
 
       rowSet.add(rv);
       colSet.add(cv);
@@ -78,7 +83,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
       if (!cellMap[rv]) cellMap[rv] = {};
       if (!cellMap[rv][cv]) cellMap[rv][cv] = { totalScore: 0, count: 0 };
 
-      cellMap[rv][cv].totalScore += score;
+      cellMap[rv][cv].totalScore += cellAvg;
       cellMap[rv][cv].count += 1;
     }
 
@@ -171,7 +176,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
             fontFamily: "var(--font-mono)",
           }}
         >
-          No scored runs available for this axis combination.
+          No scored cells available for this axis combination.
         </div>
       ) : (
         <div style={{ overflowX: "auto" }}>
@@ -297,7 +302,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
                             lineHeight: 1.3,
                           }}
                         >
-                          n={cell.count}
+                          {cell.count} {cell.count === 1 ? "cell" : "cells"}
                         </div>
                       </td>
                     );
diff --git a/dashboard/src/components/RadarComparison.tsx b/dashboard/src/components/RadarComparison.tsx
@@ -9,6 +9,7 @@ import {
   Tooltip,
 } from "recharts";
 import type { Run } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
 
 interface RadarComparisonProps {
   runs: Run[];
@@ -50,31 +51,25 @@ function extractDimensionScore(run: Run, dim: Dimension): number | null {
 interface CellConfig {
   cell_id: string;
   label: string;
-  runs: Run[];
+  cell: Cell;
 }
 
 function buildCellConfigs(runs: Run[]): CellConfig[] {
-  const grouped: Record<string, Run[]> = {};
-  for (const run of runs) {
-    const id = run.meta.cell_id;
-    if (!grouped[id]) grouped[id] = [];
-    grouped[id].push(run);
-  }
-
-  return Object.entries(grouped)
-    .map(([cell_id, cellRuns]) => {
-      const m = cellRuns[0].meta;
+  const cells = groupIntoCells(runs);
+  return cells
+    .map((cell) => {
+      const m = cell.meta;
       const label = `${m.model} / ${m.language} / ${m.prompt_style} / ${m.effort}`;
-      return { cell_id, label, runs: cellRuns };
+      return { cell_id: cell.cell_id, label, cell };
     })
     .sort((a, b) => a.label.localeCompare(b.label));
 }
 
-function averageScores(
-  runs: Run[],
+function cellAverageScore(
+  cell: Cell,
   dim: Dimension
 ): number | null {
-  const scores = runs
+  const scores = cell.runs
     .map((r) => extractDimensionScore(r, dim))
     .filter((s): s is number => s !== null);
   if (scores.length === 0) return null;
@@ -148,8 +143,8 @@ export default function RadarComparison({ runs }: RadarComparisonProps) {
 
   const data: RadarDatum[] = useMemo(() => {
     return DIMENSIONS.map((dim) => {
-      const scoreA = configA ? averageScores(configA.runs, dim) : null;
-      const scoreB = configB ? averageScores(configB.runs, dim) : null;
+      const scoreA = configA ? cellAverageScore(configA.cell, dim) : null;
+      const scoreB = configB ? cellAverageScore(configB.cell, dim) : null;
       return {
         dimension: DIMENSION_LABELS[dim],
         scoreA: scoreA ?? 0,
@@ -203,7 +198,7 @@ export default function RadarComparison({ runs }: RadarComparisonProps) {
           >
             {cellConfigs.map((c) => (
               <option key={c.cell_id} value={c.cell_id}>
-                {c.label} (n={c.runs.length})
+                {c.label} ({c.cell.n} runs)
               </option>
             ))}
           </select>
@@ -217,7 +212,7 @@ export default function RadarComparison({ runs }: RadarComparisonProps) {
           >
             {cellConfigs.map((c) => (
               <option key={c.cell_id} value={c.cell_id}>
-                {c.label} (n={c.runs.length})
+                {c.label} ({c.cell.n} runs)
               </option>
             ))}
           </select>
diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -7,8 +7,10 @@ import {
   Tooltip,
   ResponsiveContainer,
   Legend,
+  ErrorBar,
 } from "recharts";
 import type { Run } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
 
 interface ScatterPlotProps {
   runs: Run[];
@@ -16,68 +18,177 @@ interface ScatterPlotProps {
   yMetric: string;
 }
 
-const METRIC_CONFIG: Record<string, { label: string; extract: (r: Run) => number | null; format: (v: number) => string }> = {
+type CellMetricKey = "cost" | "score" | "turns" | "wall_time";
+
+const METRIC_CONFIG: Record<
+  string,
+  {
+    label: string;
+    cellKey: CellMetricKey;
+    scale: number; // multiply avg/min/max by this for display
+    format: (v: number) => string;
+  }
+> = {
   cost: {
     label: "Cost ($)",
-    extract: (r) => r.claude_output?.total_cost_usd ?? null,
+    cellKey: "cost",
+    scale: 1,
     format: (v) => `$${v.toFixed(2)}`,
   },
   score: {
     label: "Score (%)",
-    extract: (r) => r.eval_results?.score != null ? r.eval_results.score * 100 : null,
+    cellKey: "score",
+    scale: 100,
     format: (v) => `${v.toFixed(0)}%`,
   },
   turns: {
     label: "Turns",
-    extract: (r) => r.claude_output?.num_turns ?? null,
-    format: (v) => `${v}`,
+    cellKey: "turns",
+    scale: 1,
+    format: (v) => `${Math.round(v)}`,
   },
   wall_time: {
     label: "Time (s)",
-    extract: (r) => r.meta.wall_time_seconds ?? null,
-    format: (v) => `${v}s`,
+    cellKey: "wall_time",
+    scale: 1,
+    format: (v) => `${Math.round(v)}s`,
   },
 };
 
 const MODEL_COLORS: Record<string, string> = {
-  haiku: "hsl(193 44% 67%)",   // frost cyan
-  sonnet: "hsl(40 71% 73%)",   // aurora yellow
-  opus: "hsl(311 24% 63%)",    // aurora purple
+  haiku: "hsl(193 44% 67%)", // frost cyan
+  sonnet: "hsl(40 71% 73%)", // aurora yellow
+  opus: "hsl(311 24% 63%)", // aurora purple
 };
 
-export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps) {
+function formatCellId(cellId: string): string {
+  return cellId.replace(/_/g, " ");
+}
+
+interface CellDatum {
+  x: number;
+  y: number;
+  xErrorRange: [number, number];
+  yErrorRange: [number, number];
+  cell_id: string;
+  xLabel: string;
+  yLabel: string;
+  xRange: string;
+  yRange: string;
+  n: number;
+}
+
+function CustomTooltip({ active, payload }: any) {
+  if (!active || !payload?.length) return null;
+  const d: CellDatum = payload[0].payload;
+  return (
+    <div
+      style={{
+        background: "hsl(217 16% 15.5%)",
+        border: "1px solid hsl(217 17% 28%)",
+        borderRadius: "2px",
+        fontFamily: "'JetBrains Mono', monospace",
+        fontSize: "11px",
+        padding: "8px 10px",
+        lineHeight: "1.6",
+        color: "hsl(213 14% 80%)",
+      }}
+    >
+      <div style={{ fontWeight: 600, marginBottom: 4 }}>
+        {formatCellId(d.cell_id)}
+      </div>
+      <div>
+        {d.xLabel}: {d.xRange}
+      </div>
+      <div>
+        {d.yLabel}: {d.yRange}
+      </div>
+      <div style={{ marginTop: 2, color: "hsl(213 14% 55%)" }}>
+        {d.n} run{d.n !== 1 ? "s" : ""} in cell
+      </div>
+    </div>
+  );
+}
+
+export default function ScatterPlot({
+  runs,
+  xMetric,
+  yMetric,
+}: ScatterPlotProps) {
   const xConf = METRIC_CONFIG[xMetric];
   const yConf = METRIC_CONFIG[yMetric];
   if (!xConf || !yConf) return null;
 
-  // Group by model
-  const byModel: Record<string, Array<{ x: number; y: number; run_id: string; prompt: string }>> = {};
+  const cells = groupIntoCells(runs);
+
+  // Group cells by model
+  const byModel: Record<string, CellDatum[]> = {};
+
+  for (const cell of cells) {
+    const xAgg = cell[xConf.cellKey];
+    const yAgg = cell[yConf.cellKey];
+    // Skip cells where either metric has no data
+    if (xAgg.avg === 0 && xAgg.min === 0 && xAgg.max === 0) continue;
+    if (yAgg.avg === 0 && yAgg.min === 0 && yAgg.max === 0) continue;
 
-  for (const run of runs) {
-    const x = xConf.extract(run);
-    const y = yConf.extract(run);
-    if (x === null || y === null) continue;
+    const xAvg = xAgg.avg * xConf.scale;
+    const xMin = xAgg.min * xConf.scale;
+    const xMax = xAgg.max * xConf.scale;
+    const yAvg = yAgg.avg * yConf.scale;
+    const yMin = yAgg.min * yConf.scale;
+    const yMax = yAgg.max * yConf.scale;
 
-    const model = run.meta.model;
+    const model = cell.meta.model;
     if (!byModel[model]) byModel[model] = [];
+
+    const xRangeStr =
+      cell.n > 1
+        ? `avg ${xConf.format(xAvg)} (${xConf.format(xMin)} - ${xConf.format(xMax)})`
+        : xConf.format(xAvg);
+    const yRangeStr =
+      cell.n > 1
+        ? `avg ${yConf.format(yAvg)} (${yConf.format(yMin)} - ${yConf.format(yMax)})`
+        : yConf.format(yAvg);
+
     byModel[model].push({
-      x,
-      y,
-      run_id: run.meta.run_id,
-      prompt: run.meta.prompt_style,
+      x: xAvg,
+      y: yAvg,
+      xErrorRange: [xAvg - xMin, xMax - xAvg],
+      yErrorRange: [yAvg - yMin, yMax - yAvg],
+      cell_id: cell.cell_id,
+      xLabel: xConf.label,
+      yLabel: yConf.label,
+      xRange: xRangeStr,
+      yRange: yRangeStr,
+      n: cell.n,
     });
   }
 
   const models = Object.keys(byModel).sort();
+  const totalCells = models.reduce((sum, m) => sum + byModel[m].length, 0);
 
   return (
     <div className="card">
       <h3 style={{ marginBottom: "16px" }}>
-        {xConf.label} vs {yConf.label}
+        {xConf.label} vs {yConf.label}{" "}
+        <span
+          style={{
+            fontSize: "12px",
+            fontWeight: 400,
+            color: "hsl(213 14% 55%)",
+          }}
+        >
+          ({totalCells} cells)
+        </span>
       </h3>
       <ResponsiveContainer width="100%" height={350}>
-        <ScatterChart margin={{ top: 10, right: 20, bottom: 10, left: 10 }}>
-          <CartesianGrid strokeDasharray="3 3" stroke="hsl(217 17% 28%)" />
+        <ScatterChart
+          margin={{ top: 10, right: 20, bottom: 10, left: 10 }}
+        >
+          <CartesianGrid
+            strokeDasharray="3 3"
+            stroke="hsl(217 17% 28%)"
+          />
           <XAxis
             dataKey="x"
             name={xConf.label}
@@ -92,20 +203,7 @@ export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps
             fontSize={11}
             tickFormatter={(v) => yConf.format(v)}
           />
-          <Tooltip
-            contentStyle={{
-              background: "hsl(217 16% 15.5%)",
-              border: "1px solid hsl(217 17% 28%)",
-              borderRadius: "2px",
-              fontFamily: "'JetBrains Mono', monospace",
-              fontSize: "11px",
-            }}
-            formatter={(value: number, name: string) => {
-              if (name === xConf.label) return [xConf.format(value), name];
-              if (name === yConf.label) return [yConf.format(value), name];
-              return [value, name];
-            }}
-          />
+          <Tooltip content={<CustomTooltip />} />
           <Legend />
           {models.map((model) => (
             <Scatter
@@ -113,7 +211,22 @@ export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps
               name={model}
               data={byModel[model]}
               fill={MODEL_COLORS[model] || "hsl(213 14% 65%)"}
-            />
+            >
+              <ErrorBar
+                dataKey="xErrorRange"
+                direction="x"
+                stroke="hsl(213 14% 45%)"
+                strokeWidth={1}
+                width={4}
+              />
+              <ErrorBar
+                dataKey="yErrorRange"
+                direction="y"
+                stroke="hsl(213 14% 45%)"
+                strokeWidth={1}
+                width={4}
+              />
+            </Scatter>
           ))}
         </ScatterChart>
       </ResponsiveContainer>

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/components/BumpChart.tsx	\|	55	++++++++++++++++++++++++++-----------------------------
M	dashboard/src/components/ConfigTreemap.tsx	\|	26	+++++++++++++-------------
M	dashboard/src/components/CorrelationMatrix.tsx	\|	24	+++++++++++++++++-------
M	dashboard/src/components/EfficiencyFrontier.tsx	\|	80	+++++++++++++++++++++++++++----------------------------------------------------
M	dashboard/src/components/HeatmapMatrix.tsx	\|	21	+++++++++++++--------
M	dashboard/src/components/RadarComparison.tsx	\|	33	++++++++++++++-------------------
M	dashboard/src/components/ScatterPlot.tsx	\|	193	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------