commit e9c7251cd07c133098a32de1b00898bc7ea79d3f
parent 4c5457fbc3c2f5ff52de70289b518e2f956800f4
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Wed, 8 Apr 2026 07:58:36 +0200
Add 95% CI bands, statistical power card, tornado CI whiskers
- Box plot: CI band overlay with mean dot, tooltip shows CI range
- Statistical Power card: avg CI width, detectable effect, color status
- Tornado: CI whiskers on effect bars, non-significant dimmed with "n.s."
- confidenceInterval() function with t-distribution for small samples
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
6 files changed, 402 insertions(+), 7 deletions(-)
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -12,6 +12,7 @@ import {
} from "recharts";
import type { Run } from "../lib/types";
import { getModelColor, modelSortOrder } from "../lib/colors";
+import { confidenceInterval } from "../lib/analysis";
import ModelSelector from "./ModelSelector";
interface ChartsProps {
@@ -32,6 +33,10 @@ interface BoxPlotData {
base: number; // invisible bar height = q1
iqr: number; // visible box height = q3 - q1
color: string;
+ // 95% confidence interval of the mean
+ ciMean: number;
+ ciLower: number;
+ ciUpper: number;
}
@@ -165,6 +170,7 @@ function aggregateByModel(runs: Run[]): BoxPlotData[] {
const totalRuns = modelCells.reduce((sum, c) => sum + c.runCount, 0);
const stats = computeBoxStats(scores);
const baseModel = model;
+ const ci = confidenceInterval(scores);
return {
label: `${model}|(n=${totalRuns})`,
...stats,
@@ -174,6 +180,9 @@ function aggregateByModel(runs: Run[]): BoxPlotData[] {
runCount: totalRuns,
scores,
color: getModelColor(baseModel),
+ ciMean: ci.mean,
+ ciLower: ci.lower,
+ ciUpper: ci.upper,
};
});
}
@@ -187,7 +196,7 @@ function BoxPlotShape(props: any) {
};
if (!payload || height === undefined) return null;
- const { min, median, max, color, cellCount } = payload;
+ const { min, median, max, color, cellCount, ciLower, ciUpper, ciMean } = payload;
const lowN = cellCount < 3;
const boxOpacity = lowN ? 0.4 : 1;
// The bar is rendered from q1 (base) with height iqr (q3-q1).
@@ -222,6 +231,35 @@ function BoxPlotShape(props: any) {
<rect x={x} y={boxTop} width={width} height={Math.max(height, 1)} fill={color} fillOpacity={0.3} stroke={color} strokeWidth={1} strokeDasharray={lowN ? "4 2" : undefined} />
{/* Median line */}
<line x1={x} y1={medianY} x2={x + width} y2={medianY} stroke={color} strokeWidth={2} />
+ {/* 95% CI band on the mean */}
+ {ciLower !== ciUpper && (() => {
+ const ciTopY = dataToY(Math.min(ciUpper, max));
+ const ciBotY = dataToY(Math.max(ciLower, min));
+ const ciMeanY = dataToY(ciMean);
+ const ciHalfW = width * 0.45;
+ return (
+ <>
+ {/* Shaded CI band */}
+ <rect
+ x={centerX - ciHalfW}
+ y={ciTopY}
+ width={ciHalfW * 2}
+ height={Math.max(ciBotY - ciTopY, 1)}
+ fill={color}
+ fillOpacity={0.2}
+ stroke="none"
+ />
+ {/* CI vertical line */}
+ <line x1={centerX} y1={ciTopY} x2={centerX} y2={ciBotY} stroke={color} strokeWidth={1.5} strokeDasharray="2 2" />
+ {/* CI top cap */}
+ <line x1={centerX - 4} y1={ciTopY} x2={centerX + 4} y2={ciTopY} stroke={color} strokeWidth={1.5} />
+ {/* CI bottom cap */}
+ <line x1={centerX - 4} y1={ciBotY} x2={centerX + 4} y2={ciBotY} stroke={color} strokeWidth={1.5} />
+ {/* Mean dot */}
+ <circle cx={centerX} cy={ciMeanY} r={2.5} fill={color} stroke="none" />
+ </>
+ );
+ })()}
</g>
);
}
@@ -242,6 +280,12 @@ function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean;
<div>Median: {Math.round(d.median)}%</div>
<div>Q1: {Math.round(d.q1)}%</div>
<div>Min: {d.min}%</div>
+ {d.ciLower !== d.ciUpper && (
+ <div style={{ marginTop: 4, borderTop: `1px solid ${SMUI.border}`, paddingTop: 4 }}>
+ <div style={{ color: SMUI.frost2 }}>Mean: {Math.round(d.ciMean)}%</div>
+ <div style={{ color: SMUI.frost2 }}>95% CI: [{Math.round(d.ciLower)}% - {Math.round(d.ciUpper)}%]</div>
+ </div>
+ )}
</div>
);
}
@@ -270,6 +314,14 @@ export default function Charts({ runs }: ChartsProps) {
const filteredRuns = runs.filter((r) => selectedModels.has(r.meta.actual_model || r.meta.model));
const modelData = aggregateByModel(filteredRuns);
+ const detectableDifference = useMemo(() => {
+ const ciWidths = modelData
+ .filter((d) => d.ciLower !== d.ciUpper)
+ .map((d) => d.ciUpper - d.ciLower);
+ if (ciWidths.length === 0) return null;
+ return Math.round(Math.max(...ciWidths));
+ }, [modelData]);
+
return (
<div className="card">
<div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}>
@@ -327,6 +379,17 @@ export default function Charts({ runs }: ChartsProps) {
<Scatter data={[]} dataKey="score" yAxisId="score" fill="transparent" />
</ComposedChart>
</ResponsiveContainer>
+ {detectableDifference != null && (
+ <div style={{
+ fontSize: "10px",
+ fontFamily: "'JetBrains Mono', monospace",
+ color: SMUI.muted,
+ marginTop: "4px",
+ textAlign: "center",
+ }}>
+ Detectable difference: differences of ±{detectableDifference}% are statistically significant with current data
+ </div>
+ )}
</div>
);
}
diff --git a/dashboard/src/components/Insights.tsx b/dashboard/src/components/Insights.tsx
@@ -109,7 +109,7 @@ export default function Insights({ runs }: InsightsProps) {
</div>
{/* Tornado chart */}
- <TornadoChart effects={effects} metric={metric} totalRuns={filteredRuns.length} totalCells={filteredCells.length} />
+ <TornadoChart effects={effects} metric={metric} totalRuns={filteredRuns.length} totalCells={filteredCells.length} runs={filteredRuns} />
{/* Interaction explorer */}
<div className="card">
diff --git a/dashboard/src/components/StatisticalPowerCard.tsx b/dashboard/src/components/StatisticalPowerCard.tsx
@@ -0,0 +1,137 @@
+import { useMemo } from "react";
+import type { Run } from "../lib/types";
+import { groupIntoCells, confidenceInterval } from "../lib/analysis";
+
+interface StatisticalPowerCardProps { // Props accepted by the StatisticalPowerCard component.
+ runs: Run[]; // Runs to summarise; the component groups them into cells with groupIntoCells().
+}
+
+const SMUI = { // Colour tokens (HSL strings) for this card's inline styles.
+ surface1: "hsl(217 16% 15.5%)", // not referenced by this component
+ surface2: "hsl(216 15% 19%)", // not referenced by this component
+ border: "hsl(217 17% 28%)", // not referenced by this component
+ muted: "hsl(213 14% 65%)", // label text; also the badge colour when no CI could be computed
+ green: "hsl(92 28% 65%)", // badge colour for the "Strong" status
+ yellow: "hsl(40 71% 73%)", // badge colour for the "Moderate" status
+ red: "hsl(355 52% 64%)", // badge colour for the "Low power" status
+};
+
+/**
+ * Minimum number of scored runs a cell needs before its confidence interval
+ * is included. Three matches the low-n cutoff used by the box plot and the
+ * tornado chart (both treat n < 3 as low-n); with only two runs the 95%
+ * t critical value is 12.71, which would dominate the "min detectable effect".
+ */
+const MIN_RUNS_FOR_CI = 3;
+
+/**
+ * Summary card describing how much statistical power the current data has.
+ *
+ * Runs are grouped into cells. For every cell with at least
+ * {@link MIN_RUNS_FOR_CI} scored runs, the 95% confidence interval of the
+ * mean score (in percent) is computed. The card then shows:
+ *  - total runs, total cells and average runs per cell,
+ *  - the average CI half-width across those cells,
+ *  - the largest CI half-width ("min detectable effect"),
+ *  - a status badge derived from the average full CI width:
+ *    > 10 points "Low power", > 5 points "Moderate", otherwise "Strong",
+ *    or "Insufficient data" when no cell qualified.
+ *
+ * Renders nothing when there are no cells.
+ *
+ * @param runs Runs to analyse.
+ */
+export default function StatisticalPowerCard({ runs }: StatisticalPowerCardProps) {
+  const stats = useMemo(() => {
+    const cells = groupIntoCells(runs);
+    const totalRuns = runs.length;
+    const totalCells = cells.length;
+    if (totalCells === 0) return null;
+
+    const avgRunsPerCell = totalRuns / totalCells;
+
+    // Full CI width (upper - lower), in percentage points, for each cell
+    // that has enough scored runs.
+    const ciWidths: number[] = [];
+    for (const cell of cells) {
+      const scores = cell.runs
+        .map((r) => r.eval_results?.score)
+        .filter((s): s is number => s != null)
+        .map((s) => s * 100);
+      if (scores.length >= MIN_RUNS_FOR_CI) {
+        const ci = confidenceInterval(scores);
+        const width = ci.upper - ci.lower;
+        if (Number.isFinite(width)) ciWidths.push(width);
+      }
+    }
+
+    const avgCiWidth = ciWidths.length > 0
+      ? ciWidths.reduce((a, b) => a + b, 0) / ciWidths.length
+      : null;
+
+    // Minimum detectable effect = largest CI half-width across cells
+    const minDetectable = ciWidths.length > 0
+      ? Math.max(...ciWidths) / 2
+      : null;
+
+    return { totalRuns, totalCells, avgRunsPerCell, avgCiWidth, minDetectable };
+  }, [runs]);
+
+  if (!stats) return null;
+
+  const { totalRuns, totalCells, avgRunsPerCell, avgCiWidth, minDetectable } = stats;
+
+  // Status badge. Thresholds apply to the average FULL CI width, whereas the
+  // figure displayed below is the half-width (shown as a ± value).
+  let statusColor = SMUI.green;
+  let statusLabel = "Strong";
+  if (avgCiWidth == null) {
+    statusColor = SMUI.muted;
+    statusLabel = "Insufficient data";
+  } else if (avgCiWidth > 10) {
+    statusColor = SMUI.red;
+    statusLabel = "Low power";
+  } else if (avgCiWidth > 5) {
+    statusColor = SMUI.yellow;
+    statusLabel = "Moderate";
+  }
+
+  return (
+    <div
+      className="card"
+      style={{
+        padding: "16px",
+        marginBottom: "16px",
+      }}
+    >
+      <div style={{ display: "flex", alignItems: "center", gap: "12px", marginBottom: "12px" }}>
+        <h3 style={{ margin: 0 }}>Statistical Power</h3>
+        <span
+          style={{
+            fontSize: "10px",
+            fontFamily: "'JetBrains Mono', monospace",
+            fontWeight: 600,
+            color: statusColor,
+            border: `1px solid ${statusColor}`,
+            padding: "2px 8px",
+            letterSpacing: "0.5px",
+            textTransform: "uppercase",
+          }}
+        >
+          {statusLabel}
+        </span>
+      </div>
+      <div
+        style={{
+          display: "flex",
+          gap: "24px",
+          flexWrap: "wrap",
+          fontSize: "13px",
+          fontFamily: "'JetBrains Mono', monospace",
+        }}
+      >
+        <div>
+          <span style={{ color: SMUI.muted }}>runs </span>
+          <span style={{ fontWeight: 600 }}>{totalRuns}</span>
+        </div>
+        <div>
+          <span style={{ color: SMUI.muted }}>cells </span>
+          <span style={{ fontWeight: 600 }}>{totalCells}</span>
+        </div>
+        <div>
+          <span style={{ color: SMUI.muted }}>avg runs/cell </span>
+          <span style={{ fontWeight: 600 }}>{avgRunsPerCell.toFixed(1)}</span>
+        </div>
+        {avgCiWidth != null && (
+          <div>
+            <span style={{ color: SMUI.muted }}>avg 95% CI </span>
+            <span style={{ fontWeight: 600, color: statusColor }}>
+              ±{(avgCiWidth / 2).toFixed(1)}%
+            </span>
+          </div>
+        )}
+        {minDetectable != null && (
+          <div>
+            <span style={{ color: SMUI.muted }}>min detectable effect </span>
+            <span style={{ fontWeight: 600, color: statusColor }}>
+              ±{minDetectable.toFixed(1)}%
+            </span>
+          </div>
+        )}
+      </div>
+    </div>
+  );
+}
diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx
@@ -1,10 +1,13 @@
+import type { Run } from "../lib/types";
import type { AxisEffect } from "../lib/analysis";
+import { groupIntoCells, confidenceInterval } from "../lib/analysis";
interface TornadoChartProps {
effects: AxisEffect[];
metric: string;
totalRuns?: number;
totalCells?: number;
+ runs?: Run[];
}
const AXIS_LABELS: Record<string, string> = {
@@ -33,7 +36,90 @@ const AXIS_LABELS: Record<string, string> = {
provider: "Provider",
};
-export default function TornadoChart({ effects, metric, totalRuns, totalCells }: TornadoChartProps) {
+// Metric extractors matching analysis.ts: metric name -> function returning that metric for one run, or null when the run has no value for it.
+const METRIC_EXTRACTORS: Record<string, (r: Run) => number | null> = { // NOTE(review): analysis.ts is not visible here; keep the two extractor tables in sync.
+ score: (r) => r.eval_results?.score ?? null,
+ cost: (r) => r.claude_output?.total_cost_usd ?? null,
+ turns: (r) => r.claude_output?.num_turns ?? null,
+ wall_time: (r) => r.meta.wall_time_seconds ?? null,
+ gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null, // cast: presumably gameplay_bot is not declared on the eval_results type (TODO confirm); same for the other casts below
+ code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
+ structural: (r) => r.eval_results?.structural?.score ?? null,
+ quality: (r) => r.eval_results?.quality?.score ?? null,
+ transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
+ sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null,
+};
+
+const SKIP_KEYS = new Set([ // Run-meta keys that computeEffectCIs() does not treat as experiment axes.
+ "task", "cell_id", "run_id", "run_number", "runs_per_cell",
+ "max_budget_usd", "timeout_seconds", "base_tools", "started_at",
+ "completed_at", "wall_time_seconds", "exit_code", "short_id",
+ "short_cell_id", "claude_version", "sub_agents", "actual_model",
+]);
+
+interface EffectCI { // Confidence interval of one axis value's effect, as produced by computeEffectCIs().
+ ciLower: number; // lower CI bound of the group mean, minus the grand mean
+ ciUpper: number; // upper CI bound of the group mean, minus the grand mean
+ crossesZero: boolean; // true when ciLower <= 0 <= ciUpper; also set for groups with fewer than 2 cells
+}
+
+/**
+ * Computes, for every axis value, a confidence interval on its effect
+ * (group mean of per-cell averages minus the grand mean of all cells).
+ *
+ * The interval is the CI of the group mean shifted by the grand mean; the
+ * grand mean is treated as a constant, so its own sampling uncertainty is
+ * not included. Values are in the metric's raw units.
+ *
+ * Groups with fewer than two cells get a zero-width placeholder interval
+ * flagged as `crossesZero`.
+ *
+ * @param runs   Runs to analyse; grouped into cells with `groupIntoCells`.
+ * @param metric Key into `METRIC_EXTRACTORS`.
+ * @returns Map of axis name -> axis value -> effect CI. Empty when the metric
+ *          is unknown, there are no runs, or no cell has a value for it.
+ */
+function computeEffectCIs(
+  runs: Run[],
+  metric: string
+): Record<string, Record<string, EffectCI>> {
+  const extract = METRIC_EXTRACTORS[metric];
+  if (!extract || runs.length === 0) return {};
+
+  // One data point per cell: the mean of the metric over that cell's runs.
+  const cellData: Array<{ meta: Run["meta"]; avg: number }> = [];
+  for (const cell of groupIntoCells(runs)) {
+    const vals: number[] = [];
+    for (const run of cell.runs) {
+      const v = extract(run);
+      if (v !== null) vals.push(v);
+    }
+    if (vals.length === 0) continue;
+    cellData.push({ meta: cell.meta, avg: vals.reduce((a, b) => a + b, 0) / vals.length });
+  }
+
+  if (cellData.length === 0) return {};
+
+  const grandMean = cellData.reduce((s, c) => s + c.avg, 0) / cellData.length;
+
+  // Axes are taken from the first cell's meta keys, minus bookkeeping keys.
+  const axisKeys = Object.keys(cellData[0].meta).filter((k) => !SKIP_KEYS.has(k));
+  const result: Record<string, Record<string, EffectCI>> = {};
+
+  for (const axis of axisKeys) {
+    // A Map (not a plain object) so that data-derived keys such as
+    // "constructor" or "toString" cannot collide with inherited members.
+    const groups = new Map<string, number[]>();
+    for (const { meta, avg } of cellData) {
+      const key = String((meta as Record<string, unknown>)[axis] ?? "unknown");
+      const bucket = groups.get(key);
+      if (bucket) {
+        bucket.push(avg);
+      } else {
+        groups.set(key, [avg]);
+      }
+    }
+
+    const axisResult: Record<string, EffectCI> = {};
+    for (const [val, avgs] of groups) {
+      if (avgs.length < 2) {
+        // Not enough cells for an interval: placeholder, never "significant".
+        axisResult[val] = { ciLower: 0, ciUpper: 0, crossesZero: true };
+        continue;
+      }
+      // CI of the effect (group mean - grandMean)
+      const ci = confidenceInterval(avgs);
+      const effectLower = ci.lower - grandMean;
+      const effectUpper = ci.upper - grandMean;
+      axisResult[val] = {
+        ciLower: effectLower,
+        ciUpper: effectUpper,
+        crossesZero: effectLower <= 0 && effectUpper >= 0,
+      };
+    }
+    result[axis] = axisResult;
+  }
+
+  return result;
+}
+
+export default function TornadoChart({ effects, metric, totalRuns, totalCells, runs }: TornadoChartProps) {
if (effects.length === 0) {
return (
<div
@@ -50,11 +136,18 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }:
);
}
- // Scale must account for variance bands extending beyond effect bars
+ // Compute CIs if runs are provided
+ const effectCIs = runs ? computeEffectCIs(runs, metric) : {};
+
+ // Scale must account for variance bands and CI whiskers extending beyond effect bars
+ const ciExtents = Object.values(effectCIs).flatMap((axisCIs) =>
+ Object.values(axisCIs).map((ci) => Math.max(Math.abs(ci.ciLower), Math.abs(ci.ciUpper)))
+ );
const maxExtent = Math.max(
...effects.flatMap((e) =>
e.values.map((v) => Math.abs(v.effect) + v.variance)
- )
+ ),
+ ...ciExtents
);
const scale = maxExtent > 0 ? 200 / maxExtent : 1;
@@ -109,11 +202,22 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }:
}}
>
{effect.values.map((entry) => {
+ const ci = effectCIs[effect.axis]?.[entry.value];
+ const crossesZero = ci?.crossesZero ?? true;
const effectWidth = Math.abs(entry.effect) * scale;
const varianceBandWidth =
(Math.abs(entry.effect) + entry.variance) * scale;
const isPositive = entry.effect >= 0;
const isLowN = entry.n < 3;
+ // CI whisker positions (in px from left edge of bar area)
+ const ciLowerPx = ci ? Math.abs(ci.ciLower) * scale : 0;
+ const ciUpperPx = ci ? Math.abs(ci.ciUpper) * scale : 0;
+ // For the whisker, we show the full CI extent
+ const ciMaxPx = ci ? Math.max(ciLowerPx, ciUpperPx) : 0;
+ const ciMinPx = ci ? Math.min(ciLowerPx, ciUpperPx) : 0;
+ // Dim bars where CI crosses zero (effect not significant)
+ const notSignificant = ci && crossesZero && !isLowN;
+ const barContainerWidth = Math.max(varianceBandWidth, effectWidth, ciMaxPx, 2);
return (
<div
key={entry.value}
@@ -121,7 +225,7 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }:
display: "flex",
alignItems: "center",
gap: "8px",
- opacity: isLowN ? 0.4 : 1,
+ opacity: isLowN ? 0.4 : notSignificant ? 0.5 : 1,
}}
>
<div
@@ -140,7 +244,7 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }:
style={{
position: "relative",
height: "16px",
- width: `${Math.max(varianceBandWidth, effectWidth, 2)}px`,
+ width: `${barContainerWidth}px`,
}}
>
{/* Variance band (behind, wider, semi-transparent) */}
@@ -176,6 +280,44 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }:
...(isLowN ? { borderStyle: "dashed", borderWidth: "1px", borderColor: isPositive ? "var(--green)" : "var(--red)" } : {}),
}}
/>
+ {/* CI whisker */}
+ {ci && !isLowN && ciMaxPx > 0 && (
+ <>
+ {/* Whisker line */}
+ <div
+ style={{
+ position: "absolute",
+ top: "7px",
+ left: `${ciMinPx}px`,
+ width: `${Math.max(ciMaxPx - ciMinPx, 1)}px`,
+ height: "2px",
+ background: "var(--text-muted)",
+ }}
+ />
+ {/* Left cap */}
+ <div
+ style={{
+ position: "absolute",
+ top: "4px",
+ left: `${ciMinPx}px`,
+ width: "1px",
+ height: "8px",
+ background: "var(--text-muted)",
+ }}
+ />
+ {/* Right cap */}
+ <div
+ style={{
+ position: "absolute",
+ top: "4px",
+ left: `${ciMaxPx}px`,
+ width: "1px",
+ height: "8px",
+ background: "var(--text-muted)",
+ }}
+ />
+ </>
+ )}
</div>
<div
style={{
@@ -197,6 +339,18 @@ export default function TornadoChart({ effects, metric, totalRuns, totalCells }:
±{(entry.variance * 100).toFixed(1)}%
</span>
)}
+ {ci && !isLowN && (
+ <span
+ style={{
+ color: crossesZero ? "var(--yellow)" : "var(--text-muted)",
+ marginLeft: "4px",
+ fontSize: "0.6rem",
+ }}
+ >
+ CI [{(ci.ciLower * 100).toFixed(1)}, {(ci.ciUpper * 100).toFixed(1)}]
+ {crossesZero ? " n.s." : ""}
+ </span>
+ )}
</div>
<div
style={{
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -1,6 +1,44 @@
import type { Run, AxisName } from "./types";
import { AXIS_NAMES } from "./types";
+export interface ConfidenceIntervalResult { // Return shape of confidenceInterval().
+ mean: number; // arithmetic mean of the input values (0 for an empty input)
+ ci: number; // half-width of the interval: t * sampleStdDev / sqrt(n); 0 when n < 2
+ lower: number; // mean - ci
+ upper: number; // mean + ci
+ n: number; // number of input values
+}
+
+/**
+ * Two-sided 95% critical values of Student's t distribution, indexed by
+ * degrees of freedom minus one (entry 0 is df = 1, entry 29 is df = 30).
+ */
+const T_CRITICAL_95: readonly number[] = [
+  12.706, 4.303, 3.182, 2.776, 2.571, 2.447, 2.365, 2.306, 2.262, 2.228,
+  2.201, 2.179, 2.16, 2.145, 2.131, 2.12, 2.11, 2.101, 2.093, 2.086,
+  2.08, 2.074, 2.069, 2.064, 2.06, 2.056, 2.052, 2.048, 2.045, 2.042,
+];
+
+/** 97.5th percentile of the standard normal distribution. */
+const Z_975 = 1.959964;
+
+/**
+ * Two-sided 95% t critical value for `df` degrees of freedom (df >= 1).
+ * Uses the table up to df = 30 and, beyond that, the first two correction
+ * terms of the Cornish-Fisher expansion around the normal quantile, which
+ * tends to 1.96 as df grows.
+ */
+function tCritical95(df: number): number {
+  const tabulated = T_CRITICAL_95[df - 1];
+  if (tabulated !== undefined) return tabulated;
+  const z = Z_975;
+  const z3 = z ** 3;
+  const z5 = z ** 5;
+  return z + (z3 + z) / (4 * df) + (5 * z5 + 16 * z3 + 3 * z) / (96 * df * df);
+}
+
+/**
+ * 95% confidence interval of the mean of `values`, using Student's t
+ * distribution with n - 1 degrees of freedom and the sample (n - 1)
+ * standard deviation.
+ *
+ * With fewer than two values no spread can be estimated: the result is a
+ * zero-width interval at the single value (or at 0 for an empty input).
+ *
+ * @param values     Sample values.
+ * @param confidence Confidence level. NOTE: only 95% critical values are
+ *                   available; this argument is accepted for API
+ *                   compatibility but does not currently change the result.
+ * @returns Mean, CI half-width (`ci`), bounds and sample size.
+ */
+export function confidenceInterval(
+  values: number[],
+  confidence = 0.95
+): ConfidenceIntervalResult {
+  const n = values.length;
+  if (n < 2) {
+    const mean = n === 1 ? values[0] : 0;
+    return { mean, ci: 0, lower: mean, upper: mean, n };
+  }
+  const mean = values.reduce((a, b) => a + b, 0) / n;
+  const stdDev = Math.sqrt(
+    values.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1)
+  );
+  // Critical value for df = n - 1. A flat 2.0 for mid-sized samples would
+  // understate the interval (e.g. the true value for n = 11 is 2.228).
+  const t = tCritical95(n - 1);
+  const ci = t * stdDev / Math.sqrt(n);
+  return { mean, ci, lower: mean - ci, upper: mean + ci, n };
+}
+
export interface Cell {
cell_id: string;
runs: Run[];
diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro
@@ -5,6 +5,7 @@ import type { Run } from "../lib/types";
import Grid from "../components/Grid";
import Charts from "../components/Charts";
import TopBottomConfigs from "../components/TopBottomConfigs";
+import StatisticalPowerCard from "../components/StatisticalPowerCard";
const runs = loadAllRuns();
const axisValues = getAxisValues(runs);
@@ -103,6 +104,8 @@ const totalCells = new Set(runs.map(r => r.meta.cell_id)).size;
</div>
))}
+ <StatisticalPowerCard client:load runs={runs} />
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
<Charts client:load runs={runs} />
<TopBottomConfigs client:load runs={runs} />