Add variability analysis to insights page - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 82f6a5b0b78b3ac64bfad4963937ddcd7687a317
parent 5e358b275032b8351588c74c53ef7c5853c1b8b4
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 08:03:28 +0200

Add variability analysis to insights page

Three new visualizations on /insights:

1. Score Distribution by Model (box plots)
   - Horizontal box plots showing min/Q1/median/Q3/max per model
   - Individual cell dots overlaid
   - Shows which model is most consistent

2. Reliability Ranking by Variable
   - For each variable value, shows average within-cell range
   - Sorted most reliable first (smallest range)
   - Green/yellow/red bars for consistency level

3. Variance Contribution (ANOVA decomposition)
   - Stacked bar showing: % from config choices vs % from randomness
   - Answers "does the loop config actually matter?"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/Variability.tsx  | 718 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/insights.astro  | 7 ++++++-

2 files changed, 724 insertions(+), 1 deletion(-)
diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx
@@ -0,0 +1,718 @@
+import { useMemo } from "react";
+import type { Run, AxisName } from "../lib/types";
+import { AXIS_NAMES } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
+import type { Cell } from "../lib/analysis";
+
+interface VariabilityProps {
+  runs: Run[];
+}
+
+/**
+ * Display labels for experiment axes, keyed by axis name. The reliability
+ * table looks labels up here and falls back to the raw axis key when an axis
+ * has no entry.
+ */
+const AXIS_LABELS: Record<string, string> = {
+  model: "Model",
+  effort: "Effort",
+  prompt_style: "Prompt Style",
+  language: "Language",
+  human_language: "Human Language",
+  tool_read: "Read Tool",
+  tool_write: "Write Tool",
+  tool_edit: "Edit Tool",
+  tool_glob: "Glob Tool",
+  tool_grep: "Grep Tool",
+  linter: "Linter",
+  playwright: "Playwright",
+  context_file: "Context File",
+  sub_agents: "Sub-agents",
+  web_search: "Web Search",
+  max_budget: "Budget",
+};
+
+/* ---------- helpers ---------- */
+
+/**
+ * Linearly interpolated quantile of an array that is already sorted ascending.
+ *
+ * @param sorted Values in ascending order (not re-sorted here).
+ * @param q      Quantile as a fraction, e.g. 0.25 for the first quartile.
+ * @returns 0 for an empty array, the only element for a one-element array,
+ *          otherwise the value interpolated at fractional index q * (n - 1).
+ */
+function quantile(sorted: number[], q: number): number {
+  const count = sorted.length;
+  if (count === 0) return 0;
+  if (count === 1) return sorted[0];
+
+  const position = q * (count - 1);
+  const lower = Math.floor(position);
+  const upper = Math.ceil(position);
+  const fraction = position - lower;
+
+  // When the position is a whole index there is nothing to interpolate.
+  return lower === upper
+    ? sorted[lower]
+    : sorted[lower] + fraction * (sorted[upper] - sorted[lower]);
+}
+
+/**
+ * Population variance (divides by n, not n - 1).
+ *
+ * @param values Samples to measure.
+ * @returns 0 when fewer than two values are given, otherwise the mean squared
+ *          deviation from the arithmetic mean.
+ */
+function variance(values: number[]): number {
+  const n = values.length;
+  if (n < 2) return 0;
+
+  let total = 0;
+  for (const v of values) total += v;
+  const mean = total / n;
+
+  let squaredDeviations = 0;
+  for (const v of values) squaredDeviations += (v - mean) ** 2;
+
+  return squaredDeviations / n;
+}
+
+/* ---------- Section 1: Box Plots ---------- */
+
+/** Box-plot summary of one model's per-cell average scores. */
+interface BoxPlotStats {
+  /** Model the statistics belong to (taken from `cell.meta.model`). */
+  model: string;
+  /** Lowest retained cell average. */
+  min: number;
+  /** First quartile (quantile 0.25). */
+  q1: number;
+  /** Quantile 0.5. */
+  median: number;
+  /** Third quartile (quantile 0.75). */
+  q3: number;
+  /** Highest retained cell average. */
+  max: number;
+  /** The retained cell averages, sorted ascending; rendered as dots. */
+  points: number[];
+  /** Number of retained cells (length of `points`). */
+  cellCount: number;
+}
+
+/**
+ * Summarise per-cell average scores as box-plot statistics, one entry per
+ * model.
+ *
+ * Cells whose average score is not greater than 0 are ignored, and a model
+ * with no remaining scores produces no entry. Entries come back in the
+ * default `sort()` order of the model names.
+ */
+function computeBoxPlots(cells: Cell[]): BoxPlotStats[] {
+  // Bucket the positive averages by model in one pass, keeping cell order.
+  const scoresByModel = new Map<string, number[]>();
+  for (const { meta, score } of cells) {
+    if (!(score.avg > 0)) continue;
+    const bucket = scoresByModel.get(meta.model);
+    if (bucket) {
+      bucket.push(score.avg);
+    } else {
+      scoresByModel.set(meta.model, [score.avg]);
+    }
+  }
+
+  return Array.from(scoresByModel.keys())
+    .sort()
+    .map((model) => {
+      const points = (scoresByModel.get(model) ?? []).sort((a, b) => a - b);
+      return {
+        model,
+        min: points[0],
+        q1: quantile(points, 0.25),
+        median: quantile(points, 0.5),
+        q3: quantile(points, 0.75),
+        max: points[points.length - 1],
+        points,
+        cellCount: points.length,
+      };
+    });
+}
+
+/**
+ * Renders one horizontal box plot per model on a shared horizontal scale.
+ *
+ * Each row shows the min-max whisker, the Q1-Q3 box, the median line and one
+ * dot per cell. Shows a placeholder message when no model has any scored
+ * cell.
+ */
+function BoxPlotSection({ cells }: { cells: Cell[] }) {
+  const stats = useMemo(() => computeBoxPlots(cells), [cells]);
+
+  if (stats.length === 0) {
+    return (
+      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
+        No scored cells available.
+      </div>
+    );
+  }
+
+  // Global scale across all models
+  const globalMin = Math.min(...stats.map((s) => s.min));
+  const globalMax = Math.max(...stats.map((s) => s.max));
+  // Fall back to 0.01 when every value is identical so toPercent never
+  // divides by zero.
+  const range = globalMax - globalMin || 0.01;
+
+  // Maps a score onto 0-100 (% of the track width) relative to the global
+  // min/max.
+  const toPercent = (v: number) => ((v - globalMin) / range) * 100;
+
+  return (
+    <div>
+      {/* Axis labels */}
+      <div
+        style={{
+          display: "flex",
+          justifyContent: "space-between",
+          marginBottom: "4px",
+          paddingLeft: "140px",
+          paddingRight: "12px",
+        }}
+      >
+        <span style={axisLabelStyle}>{(globalMin * 100).toFixed(0)}%</span>
+        <span style={axisLabelStyle}>{(globalMax * 100).toFixed(0)}%</span>
+      </div>
+
+      {stats.map((s) => (
+        <div
+          key={s.model}
+          style={{
+            display: "flex",
+            alignItems: "center",
+            marginBottom: "16px",
+            gap: "12px",
+          }}
+        >
+          {/* Label */}
+          <div style={{ width: "120px", textAlign: "right", flexShrink: 0 }}>
+            <div style={labelPrimaryStyle}>{s.model}</div>
+            <div style={labelSecondaryStyle}>
+              median {(s.median * 100).toFixed(1)}% / {s.cellCount} cells
+            </div>
+          </div>
+
+          {/* Box plot */}
+          <div
+            style={{
+              flex: 1,
+              position: "relative",
+              height: "32px",
+              marginRight: "12px",
+            }}
+          >
+            {/* Background track */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: 0,
+                right: 0,
+                height: "1px",
+                background: "hsl(var(--border))",
+                transform: "translateY(-50%)",
+              }}
+            />
+
+            {/* Whisker line: min to max */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.min)}%`,
+                width: `${toPercent(s.max) - toPercent(s.min)}%`,
+                height: "2px",
+                background: "var(--accent)",
+                opacity: 0.5,
+                transform: "translateY(-50%)",
+              }}
+            />
+
+            {/* Min whisker cap */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.min)}%`,
+                width: "1px",
+                height: "12px",
+                background: "var(--accent)",
+                opacity: 0.5,
+                transform: "translate(-50%, -50%)",
+              }}
+            />
+
+            {/* Max whisker cap */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.max)}%`,
+                width: "1px",
+                height: "12px",
+                background: "var(--accent)",
+                opacity: 0.5,
+                transform: "translate(-50%, -50%)",
+              }}
+            />
+
+            {/* IQR box: Q1 to Q3 */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.q1)}%`,
+                // Never narrower than 0.5% so the box still shows when Q1
+                // equals Q3.
+                width: `${Math.max(toPercent(s.q3) - toPercent(s.q1), 0.5)}%`,
+                height: "18px",
+                background: "var(--accent)",
+                opacity: 0.2,
+                border: "1px solid var(--accent)",
+                transform: "translateY(-50%)",
+              }}
+            />
+
+            {/* Median line */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.median)}%`,
+                width: "2px",
+                height: "22px",
+                background: "var(--accent)",
+                transform: "translate(-50%, -50%)",
+              }}
+            />
+
+            {/* Individual cell dots */}
+            {s.points.map((p, i) => (
+              <div
+                key={i}
+                style={{
+                  position: "absolute",
+                  top: "50%",
+                  left: `${toPercent(p)}%`,
+                  width: "5px",
+                  height: "5px",
+                  borderRadius: "50%",
+                  background: "var(--accent)",
+                  opacity: 0.6,
+                  transform: "translate(-50%, -50%)",
+                  zIndex: 1,
+                }}
+              />
+            ))}
+          </div>
+        </div>
+      ))}
+    </div>
+  );
+}
+
+/* ---------- Section 2: Reliability Ranking ---------- */
+
+/** One table row: the reliability of a single value of a single axis. */
+interface ReliabilityRow {
+  /** Axis name, one of the entries of AXIS_NAMES. */
+  axis: string;
+  /** Axis value converted to a string; "unknown" when the cell has none. */
+  value: string;
+  /** Mean of the cells' average scores (cells with avg > 0 only). */
+  avgScore: number;
+  /** Mean of the same cells' `score.range` values. */
+  avgRange: number;
+  /** Number of cells that contributed to the two means. */
+  n: number;
+}
+
+/**
+ * Build one reliability row per (axis, value) pair.
+ *
+ * For every axis in AXIS_NAMES the cells are grouped by their value on that
+ * axis (missing values become "unknown"). Only cells with an average score
+ * greater than 0 contribute, and groups with fewer than two such cells are
+ * dropped. Rows are returned sorted by ascending average range, i.e. most
+ * reliable first.
+ *
+ * NOTE(review): cells with a single run presumably report a range of 0 and
+ * would pull a group's average range down -- confirm against groupIntoCells
+ * and decide whether such cells should be excluded here.
+ */
+function computeReliability(cells: Cell[]): ReliabilityRow[] {
+  const rows: ReliabilityRow[] = [];
+  const mean = (xs: number[]): number =>
+    xs.reduce((a, b) => a + b, 0) / xs.length;
+
+  for (const axis of AXIS_NAMES) {
+    // A Map rather than a plain object: axis values such as "constructor" or
+    // "__proto__" would otherwise resolve to inherited Object.prototype
+    // members instead of a fresh group, and the push below would throw.
+    const groups = new Map<string, { scores: number[]; ranges: number[] }>();
+
+    for (const cell of cells) {
+      const val = String(
+        (cell.meta as Record<string, unknown>)[axis] ?? "unknown"
+      );
+      let group = groups.get(val);
+      if (group === undefined) {
+        group = { scores: [], ranges: [] };
+        groups.set(val, group);
+      }
+      if (cell.score.avg > 0) {
+        group.scores.push(cell.score.avg);
+        group.ranges.push(cell.score.range);
+      }
+    }
+
+    for (const [val, { scores, ranges }] of groups) {
+      // A single cell says nothing about an axis value; skip tiny groups.
+      if (scores.length < 2) continue;
+      rows.push({
+        axis,
+        value: val,
+        avgScore: mean(scores),
+        avgRange: mean(ranges),
+        n: scores.length,
+      });
+    }
+  }
+
+  return rows.sort((a, b) => a.avgRange - b.avgRange);
+}
+
+/**
+ * Map an average within-cell range to a traffic-light CSS colour variable:
+ * up to 0.05 is green, up to 0.12 is yellow, anything else is red.
+ */
+function reliabilityColor(avgRange: number): string {
+  const tiers: Array<[number, string]> = [
+    [0.05, "var(--green)"],
+    [0.12, "var(--yellow)"],
+  ];
+  for (const [limit, color] of tiers) {
+    if (avgRange <= limit) return color;
+  }
+  return "var(--red)";
+}
+
+/**
+ * Renders the reliability table: one row per axis value with its average
+ * score, its average within-cell range, and a bar whose length is that range
+ * relative to the largest range in the table. Shows a placeholder message
+ * when there are no rows.
+ */
+function ReliabilitySection({ cells }: { cells: Cell[] }) {
+  const rows = useMemo(() => computeReliability(cells), [cells]);
+
+  if (rows.length === 0) {
+    return (
+      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
+        Not enough multi-run cells to compute reliability.
+      </div>
+    );
+  }
+
+  // The 0.01 floor keeps the bar-width division below away from zero when
+  // every average range is 0.
+  const maxRange = Math.max(...rows.map((r) => r.avgRange), 0.01);
+
+  return (
+    <div style={{ overflowX: "auto" }}>
+      <table style={{ borderCollapse: "collapse", width: "100%" }}>
+        <thead>
+          <tr>
+            {["VARIABLE", "VALUE", "AVG SCORE", "AVG RANGE", "RELIABILITY"].map(
+              (h) => (
+                <th key={h} style={thStyle}>
+                  {h}
+                </th>
+              )
+            )}
+          </tr>
+        </thead>
+        <tbody>
+          {rows.map((row, i) => {
+            // Bar length as a percentage of the widest bar in the table.
+            const barWidth = (row.avgRange / maxRange) * 100;
+            const color = reliabilityColor(row.avgRange);
+            return (
+              <tr
+                key={`${row.axis}-${row.value}`}
+                style={{
+                  borderBottom: "1px solid hsl(var(--border))",
+                  // Odd rows get a faint tint (zebra striping).
+                  background:
+                    i % 2 === 0 ? "transparent" : "hsl(var(--border) / 0.1)",
+                }}
+              >
+                <td style={tdStyle}>
+                  {AXIS_LABELS[row.axis] || row.axis}
+                </td>
+                <td style={{ ...tdStyle, fontFamily: "var(--font-mono)" }}>
+                  {row.value}
+                </td>
+                <td
+                  style={{
+                    ...tdStyle,
+                    fontFamily: "var(--font-mono)",
+                    textAlign: "right",
+                  }}
+                >
+                  {(row.avgScore * 100).toFixed(1)}%
+                </td>
+                <td
+                  style={{
+                    ...tdStyle,
+                    fontFamily: "var(--font-mono)",
+                    textAlign: "right",
+                    color,
+                  }}
+                >
+                  {(row.avgRange * 100).toFixed(1)}%
+                </td>
+                <td style={{ ...tdStyle, width: "200px" }}>
+                  <div
+                    style={{
+                      position: "relative",
+                      height: "12px",
+                      background: "hsl(var(--border) / 0.2)",
+                    }}
+                  >
+                    <div
+                      style={{
+                        position: "absolute",
+                        top: 0,
+                        left: 0,
+                        height: "100%",
+                        // At least 1% wide so a zero range still draws a bar.
+                        width: `${Math.max(barWidth, 1)}%`,
+                        background: color,
+                        opacity: 0.7,
+                      }}
+                    />
+                  </div>
+                </td>
+              </tr>
+            );
+          })}
+        </tbody>
+      </table>
+    </div>
+  );
+}
+
+/* ---------- Section 3: Variance Contribution ---------- */
+
+/** Split of the run-score variance into between-cell and within-cell parts. */
+interface VarianceDecomp {
+  /** Population variance of all individual scored runs. */
+  totalVariance: number;
+  /** Variance attributed to run-to-run spread inside cells. */
+  withinVariance: number;
+  /** Total variance minus the within-cell part, never below 0. */
+  betweenVariance: number;
+  /** betweenVariance as a fraction of totalVariance. */
+  betweenPct: number;
+  /** withinVariance as a fraction of totalVariance. */
+  withinPct: number;
+}
+
+/**
+ * Split the variance of individual run scores into a between-cell part
+ * (config choices) and a within-cell part (run-to-run randomness).
+ *
+ * Follows the law of total variance with population variances: the
+ * within-cell component is the run-weighted mean of each cell's variance and
+ * the between-cell component is the remainder of the total. Weighting by run
+ * count (single-run cells contribute zero spread) keeps the within share at
+ * or below 100% and makes the two fractions sum to 1; an unweighted mean over
+ * multi-run cells only can exceed the total variance.
+ *
+ * Only scores that are non-null and greater than 0 are considered.
+ *
+ * @param runs  All runs; their scores define the total variance.
+ * @param cells The runs grouped by configuration. NOTE(review): assumed to
+ *   partition `runs` -- confirm against groupIntoCells.
+ * @returns The decomposition, or null when fewer than two scored runs exist
+ *   or the total variance is 0.
+ */
+function computeVarianceDecomp(
+  runs: Run[],
+  cells: Cell[]
+): VarianceDecomp | null {
+  const scoresOf = (rs: Run[]): number[] =>
+    rs
+      .map((r) => r.eval_results?.score ?? null)
+      .filter((s): s is number => s !== null && s > 0);
+
+  const allScores = scoresOf(runs);
+  if (allScores.length < 2) return null;
+
+  const totalVar = variance(allScores);
+  if (totalVar === 0) return null;
+
+  // Within-cell sum of squares: n * population variance for each cell.
+  // variance() returns 0 for fewer than two scores, so single-run cells add
+  // to the run count but not to the spread.
+  let withinSumSquares = 0;
+  let scoredRunCount = 0;
+  for (const cell of cells) {
+    const scores = scoresOf(cell.runs);
+    scoredRunCount += scores.length;
+    withinSumSquares += scores.length * variance(scores);
+  }
+
+  const rawWithinVar =
+    scoredRunCount > 0 ? withinSumSquares / scoredRunCount : 0;
+  // Clamp against rounding error (or cells that do not match `runs`) so the
+  // shares stay within [0, 1].
+  const withinVar = Math.min(rawWithinVar, totalVar);
+  const betweenVar = totalVar - withinVar;
+
+  return {
+    totalVariance: totalVar,
+    withinVariance: withinVar,
+    betweenVariance: betweenVar,
+    betweenPct: betweenVar / totalVar,
+    withinPct: withinVar / totalVar,
+  };
+}
+
+/**
+ * Renders the variance decomposition as a two-segment stacked bar
+ * (config choices vs randomness), a legend with both percentages, and a
+ * one-sentence interpretation. Shows a placeholder message when the
+ * decomposition is unavailable.
+ */
+function VarianceSection({
+  runs,
+  cells,
+}: {
+  runs: Run[];
+  cells: Cell[];
+}) {
+  const decomp = useMemo(
+    () => computeVarianceDecomp(runs, cells),
+    [runs, cells]
+  );
+
+  if (!decomp) {
+    return (
+      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
+        Not enough data to decompose variance.
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      <div
+        style={{
+          display: "flex",
+          height: "32px",
+          marginBottom: "12px",
+          border: "1px solid hsl(var(--border))",
+        }}
+      >
+        {/* Between-cell (config choices) */}
+        <div
+          style={{
+            width: `${decomp.betweenPct * 100}%`,
+            background: "var(--accent)",
+            opacity: 0.7,
+            display: "flex",
+            alignItems: "center",
+            justifyContent: "center",
+            fontSize: "11px",
+            fontFamily: "var(--font-mono)",
+            color: "var(--text)",
+            fontWeight: 600,
+            minWidth: decomp.betweenPct > 0.08 ? undefined : "0px",
+            overflow: "hidden",
+            whiteSpace: "nowrap",
+          }}
+        >
+          {decomp.betweenPct > 0.08 &&
+            `${(decomp.betweenPct * 100).toFixed(0)}%`}
+        </div>
+        {/* Within-cell (randomness) */}
+        <div
+          style={{
+            width: `${decomp.withinPct * 100}%`,
+            background: "var(--yellow)",
+            opacity: 0.5,
+            display: "flex",
+            alignItems: "center",
+            justifyContent: "center",
+            fontSize: "11px",
+            fontFamily: "var(--font-mono)",
+            color: "var(--text)",
+            fontWeight: 600,
+            minWidth: decomp.withinPct > 0.08 ? undefined : "0px",
+            overflow: "hidden",
+            whiteSpace: "nowrap",
+          }}
+        >
+          {decomp.withinPct > 0.08 &&
+            `${(decomp.withinPct * 100).toFixed(0)}%`}
+        </div>
+      </div>
+
+      <div
+        style={{
+          display: "flex",
+          gap: "24px",
+          flexWrap: "wrap",
+        }}
+      >
+        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
+          <div
+            style={{
+              width: "12px",
+              height: "12px",
+              background: "var(--accent)",
+              opacity: 0.7,
+            }}
+          />
+          <span style={legendStyle}>
+            CONFIG CHOICES: {(decomp.betweenPct * 100).toFixed(0)}%
+          </span>
+        </div>
+        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
+          <div
+            style={{
+              width: "12px",
+              height: "12px",
+              background: "var(--yellow)",
+              opacity: 0.5,
+            }}
+          />
+          <span style={legendStyle}>
+            RANDOMNESS: {(decomp.withinPct * 100).toFixed(0)}%
+          </span>
+        </div>
+      </div>
+
+      <p
+        style={{
+          marginTop: "12px",
+          fontSize: "12px",
+          color: "var(--text-muted)",
+          lineHeight: "1.5",
+        }}
+      >
+        {decomp.betweenPct >= 0.5
+          ? `Configuration choices explain ${(decomp.betweenPct * 100).toFixed(0)}% of score variance. The config matters more than run-to-run randomness.`
+          : decomp.betweenPct >= 0.3
+            ? `Configuration and randomness contribute roughly equally. Scores are moderately sensitive to config choices.`
+            : `Run-to-run randomness dominates (${(decomp.withinPct * 100).toFixed(0)}%). Config choices have limited impact on scores -- results are noisy.`}
+      </p>
+    </div>
+  );
+}
+
+/* ---------- shared styles ---------- */
+
+// Inline style objects shared by the components in this file. They are
+// declared after the components but are only read when a component renders.
+
+/** Small uppercase kicker shown above each section title. */
+const sectionHeaderStyle: React.CSSProperties = {
+  fontSize: "11px",
+  fontFamily: "var(--font-mono)",
+  textTransform: "uppercase",
+  letterSpacing: "0.08em",
+  color: "var(--text-muted)",
+  marginBottom: "4px",
+};
+
+/** Section title. */
+const sectionTitleStyle: React.CSSProperties = {
+  fontSize: "16px",
+  fontWeight: 600,
+  marginBottom: "4px",
+};
+
+/** Explanatory paragraph under a section title. */
+const sectionDescStyle: React.CSSProperties = {
+  fontSize: "12px",
+  color: "var(--text-muted)",
+  marginBottom: "16px",
+  lineHeight: "1.4",
+};
+
+/** Bordered card wrapping each section and the empty state. */
+const cardStyle: React.CSSProperties = {
+  border: "1px solid hsl(var(--border))",
+  padding: "20px",
+  marginBottom: "16px",
+  background: "var(--surface-1)",
+};
+
+/** Min/max percentage labels above the box plots. */
+const axisLabelStyle: React.CSSProperties = {
+  fontSize: "10px",
+  fontFamily: "var(--font-mono)",
+  color: "var(--text-muted)",
+  textTransform: "uppercase",
+  letterSpacing: "0.06em",
+};
+
+/** Model name to the left of a box plot. */
+const labelPrimaryStyle: React.CSSProperties = {
+  fontSize: "13px",
+  fontFamily: "var(--font-mono)",
+  fontWeight: 600,
+  color: "var(--text)",
+};
+
+/** Median / cell-count line under the model name. */
+const labelSecondaryStyle: React.CSSProperties = {
+  fontSize: "10px",
+  fontFamily: "var(--font-mono)",
+  color: "var(--text-muted)",
+  marginTop: "2px",
+};
+
+/** Header cells of the reliability table. */
+const thStyle: React.CSSProperties = {
+  fontSize: "11px",
+  fontFamily: "var(--font-mono)",
+  textTransform: "uppercase",
+  letterSpacing: "0.06em",
+  color: "var(--text-muted)",
+  padding: "8px 12px",
+  textAlign: "left",
+  borderBottom: "1px solid hsl(var(--border))",
+  fontWeight: 500,
+};
+
+/** Body cells of the reliability table; spread into per-column overrides. */
+const tdStyle: React.CSSProperties = {
+  fontSize: "12px",
+  padding: "6px 12px",
+  color: "var(--text)",
+};
+
+/** Legend text under the variance bar. */
+const legendStyle: React.CSSProperties = {
+  fontSize: "11px",
+  fontFamily: "var(--font-mono)",
+  textTransform: "uppercase",
+  letterSpacing: "0.06em",
+  color: "var(--text-muted)",
+};
+
+/* ---------- Main Component ---------- */
+
+/**
+ * Variability analysis for the insights page.
+ *
+ * Groups the given runs into cells and renders three cards: score
+ * distribution by model (box plots), reliability ranking by variable, and
+ * the variance contribution bar. Renders a single placeholder card when
+ * `runs` is empty.
+ */
+export default function Variability({ runs }: VariabilityProps) {
+  // Regrouped only when `runs` changes; computed before the empty-state
+  // early return below.
+  const cells = useMemo(() => groupIntoCells(runs), [runs]);
+
+  if (runs.length === 0) {
+    return (
+      <div
+        style={{
+          ...cardStyle,
+          textAlign: "center",
+          padding: "40px",
+          color: "var(--text-muted)",
+        }}
+      >
+        No runs available for variability analysis.
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      {/* Section 1: Box Plots */}
+      <div style={cardStyle}>
+        <div style={sectionHeaderStyle}>CONSISTENCY</div>
+        <div style={sectionTitleStyle}>Score Distribution by Model</div>
+        <p style={sectionDescStyle}>
+          Each dot is a cell (unique config). The box spans Q1-Q3; the line
+          marks the median. Tighter boxes mean more consistent results across
+          configs.
+        </p>
+        <BoxPlotSection cells={cells} />
+      </div>
+
+      {/* Section 2: Reliability Ranking */}
+      <div style={cardStyle}>
+        <div style={sectionHeaderStyle}>RELIABILITY</div>
+        <div style={sectionTitleStyle}>Reliability Ranking by Variable</div>
+        <p style={sectionDescStyle}>
+          How much do repeat runs of the same config vary? Sorted by average
+          range (smallest = most reliable). Green means scores are consistent
+          across re-runs; red means volatile.
+        </p>
+        <ReliabilitySection cells={cells} />
+      </div>
+
+      {/* Section 3: Variance Decomposition */}
+      <div style={cardStyle}>
+        <div style={sectionHeaderStyle}>VARIANCE</div>
+        <div style={sectionTitleStyle}>Variance Contribution</div>
+        <p style={sectionDescStyle}>
+          ANOVA-style decomposition: how much of the total score variance comes
+          from config choices (between cells) vs run-to-run randomness (within
+          cells)?
+        </p>
+        <VarianceSection runs={runs} cells={cells} />
+      </div>
+    </div>
+  );
+}
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -4,6 +4,7 @@ import { loadAllRuns } from "../lib/data";
 import Insights from "../components/Insights";
 import ScatterPlot from "../components/ScatterPlot";
 import Surprises from "../components/Surprises";
+import Variability from "../components/Variability";
 
 const runs = loadAllRuns();
 ---
@@ -11,11 +12,15 @@ const runs = loadAllRuns();
 <Base title="Insights">
   <h1 style="margin-bottom: 8px;">Insights</h1>
   <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
-    Which variables move the needle? Where do weaker configs win?
+    Which variables move the needle? Where do weaker configs win? How consistent are the results?
   </p>
 
   <Surprises client:load runs={runs} />
 
+  <div style="margin-top: 32px;">
+    <Variability client:load runs={runs} />
+  </div>
+
   <div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
     <ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
     <ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

A	dashboard/src/components/Variability.tsx	\|	718	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	dashboard/src/pages/insights.astro	\|	7	++++++-