commit 82f6a5b0b78b3ac64bfad4963937ddcd7687a317
parent 5e358b275032b8351588c74c53ef7c5853c1b8b4
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 08:03:28 +0200
Add variability analysis to insights page
Three new visualizations on /insights:
1. Score Distribution by Model (box plots)
- Horizontal box plots showing min/Q1/median/Q3/max per model
- Individual cell dots overlaid
- Shows which model is most consistent
2. Reliability Ranking by Variable
- For each variable value, shows average within-cell range
- Sorted most reliable first (smallest range)
- Green/yellow/red bars for consistency level
3. Variance Contribution (ANOVA decomposition)
- Stacked bar showing: % from config choices vs % from randomness
- Answers "does the loop config actually matter?"
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 724 insertions(+), 1 deletion(-)
diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx
@@ -0,0 +1,718 @@
+import { useMemo } from "react";
+import type { Run, AxisName } from "../lib/types";
+import { AXIS_NAMES } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
+import type { Cell } from "../lib/analysis";
+
+/** Props accepted by the default-exported `Variability` component. */
+interface VariabilityProps {
+ /** Runs to analyze; the component groups them into cells with `groupIntoCells`. */
+ runs: Run[];
+}
+
+/**
+ * Human-readable labels for axis names, shown in the reliability table.
+ * An axis with no entry here is displayed using its raw axis name.
+ */
+const AXIS_LABELS: Record<string, string> = {
+ model: "Model",
+ effort: "Effort",
+ prompt_style: "Prompt Style",
+ language: "Language",
+ human_language: "Human Language",
+ tool_read: "Read Tool",
+ tool_write: "Write Tool",
+ tool_edit: "Edit Tool",
+ tool_glob: "Glob Tool",
+ tool_grep: "Grep Tool",
+ linter: "Linter",
+ playwright: "Playwright",
+ context_file: "Context File",
+ sub_agents: "Sub-agents",
+ web_search: "Web Search",
+ max_budget: "Budget",
+};
+
+/* ---------- helpers ---------- */
+
+/**
+ * Linearly interpolated quantile of an already-sorted sample.
+ *
+ * @param sorted - Values in ascending order; the array is not re-sorted here.
+ * @param q - Requested quantile. Values outside [0, 1] are clamped to that
+ *   interval so the lookup can never index past either end of the array.
+ * @returns The interpolated value, `0` for an empty sample, or the sole
+ *   element for a one-element sample.
+ */
+function quantile(sorted: number[], q: number): number {
+  const lastIndex = sorted.length - 1;
+  if (lastIndex < 0) return 0;
+  if (lastIndex === 0) return sorted[0];
+
+  // Clamp first: an unclamped q > 1 (or < 0) would read beyond the array
+  // and silently yield undefined/NaN.
+  const pos = Math.min(Math.max(q, 0), 1) * lastIndex;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sorted[lo];
+
+  // Interpolate between the two neighbouring order statistics.
+  return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]);
+}
+
+/**
+ * Population variance (divides by N, not N - 1) of a list of numbers.
+ *
+ * @param values - Sample to measure.
+ * @returns The mean squared deviation from the mean, or `0` when fewer than
+ *   two values are supplied.
+ */
+function variance(values: number[]): number {
+  const count = values.length;
+  if (count < 2) return 0;
+
+  let total = 0;
+  for (const v of values) total += v;
+  const mean = total / count;
+
+  let squaredDeviations = 0;
+  for (const v of values) squaredDeviations += (v - mean) ** 2;
+
+  return squaredDeviations / count;
+}
+
+/* ---------- Section 1: Box Plots ---------- */
+
+/** Five-number summary of one model's cell scores, as built by `computeBoxPlots`. */
+interface BoxPlotStats {
+ /** Model identifier taken from `cell.meta.model`. */
+ model: string;
+ /** Smallest retained cell average score. */
+ min: number;
+ /** First quartile (0.25 quantile) of the retained scores. */
+ q1: number;
+ /** Median (0.5 quantile) of the retained scores. */
+ median: number;
+ /** Third quartile (0.75 quantile) of the retained scores. */
+ q3: number;
+ /** Largest retained cell average score. */
+ max: number;
+ /** Retained cell average scores in ascending order; rendered as individual dots. */
+ points: number[];
+ /** Number of retained cells (equal to `points.length`). */
+ cellCount: number;
+}
+
+/**
+ * Summarises cell average scores per model for the box-plot view.
+ *
+ * Only cells whose `score.avg` is greater than zero contribute; a model with
+ * no such cell produces no entry. Results are ordered by model name using the
+ * default array sort.
+ *
+ * @param cells - Cells to summarise.
+ * @returns One `BoxPlotStats` per model that has at least one positive score.
+ */
+function computeBoxPlots(cells: Cell[]): BoxPlotStats[] {
+  // Bucket the positive cell averages by model in a single pass.
+  const scoresByModel = new Map<string, number[]>();
+  for (const cell of cells) {
+    const modelName = cell.meta.model;
+    let bucket = scoresByModel.get(modelName);
+    if (bucket === undefined) {
+      bucket = [];
+      scoresByModel.set(modelName, bucket);
+    }
+    const average = cell.score.avg;
+    if (average > 0) bucket.push(average);
+  }
+
+  const summaries: BoxPlotStats[] = [];
+  const orderedModels = Array.from(scoresByModel.keys()).sort();
+
+  for (const modelName of orderedModels) {
+    const ascending = (scoresByModel.get(modelName) ?? []).sort(
+      (left, right) => left - right
+    );
+    const count = ascending.length;
+    if (count === 0) continue;
+
+    summaries.push({
+      model: modelName,
+      min: ascending[0],
+      q1: quantile(ascending, 0.25),
+      median: quantile(ascending, 0.5),
+      q3: quantile(ascending, 0.75),
+      max: ascending[count - 1],
+      points: ascending,
+      cellCount: count,
+    });
+  }
+
+  return summaries;
+}
+
+/** Width in px of the per-model label column to the left of each plot. */
+const BOX_LABEL_WIDTH_PX = 120;
+/** Flex gap in px between the label column and the plot track. */
+const BOX_ROW_GAP_PX = 12;
+/** Right inset in px of the plot track; the axis-label row mirrors it. */
+const BOX_TRACK_INSET_PX = 12;
+
+/**
+ * Renders one horizontal box plot per model on a shared score scale.
+ *
+ * Each row shows the min-max whisker, the Q1-Q3 box, the median line and one
+ * dot per cell. All rows share the same horizontal scale (global min to
+ * global max across models), so positions are comparable between models.
+ *
+ * The axis-label row is inset by exactly the label width plus the row gap so
+ * that its min/max labels line up with the ends of the plot track.
+ *
+ * @param cells - Cells to summarise via `computeBoxPlots`.
+ */
+function BoxPlotSection({ cells }: { cells: Cell[] }) {
+  const stats = useMemo(() => computeBoxPlots(cells), [cells]);
+
+  if (stats.length === 0) {
+    return (
+      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
+        No scored cells available.
+      </div>
+    );
+  }
+
+  // Global scale across all models
+  const globalMin = Math.min(...stats.map((s) => s.min));
+  const globalMax = Math.max(...stats.map((s) => s.max));
+  // When every score is identical the span is 0; fall back to a small
+  // positive span so toPercent never divides by zero.
+  const range = globalMax - globalMin || 0.01;
+
+  const toPercent = (v: number) => ((v - globalMin) / range) * 100;
+
+  // Short vertical tick marking one end of the whisker.
+  const whiskerCapStyle = (v: number): React.CSSProperties => ({
+    position: "absolute",
+    top: "50%",
+    left: `${toPercent(v)}%`,
+    width: "1px",
+    height: "12px",
+    background: "var(--accent)",
+    opacity: 0.5,
+    transform: "translate(-50%, -50%)",
+  });
+
+  return (
+    <div>
+      {/* Axis labels, inset to match the plot track below */}
+      <div
+        style={{
+          display: "flex",
+          justifyContent: "space-between",
+          marginBottom: "4px",
+          paddingLeft: `${BOX_LABEL_WIDTH_PX + BOX_ROW_GAP_PX}px`,
+          paddingRight: `${BOX_TRACK_INSET_PX}px`,
+        }}
+      >
+        <span style={axisLabelStyle}>{(globalMin * 100).toFixed(0)}%</span>
+        <span style={axisLabelStyle}>{(globalMax * 100).toFixed(0)}%</span>
+      </div>
+
+      {stats.map((s) => (
+        <div
+          key={s.model}
+          style={{
+            display: "flex",
+            alignItems: "center",
+            marginBottom: "16px",
+            gap: `${BOX_ROW_GAP_PX}px`,
+          }}
+        >
+          {/* Label */}
+          <div
+            style={{
+              width: `${BOX_LABEL_WIDTH_PX}px`,
+              textAlign: "right",
+              flexShrink: 0,
+            }}
+          >
+            <div style={labelPrimaryStyle}>{s.model}</div>
+            <div style={labelSecondaryStyle}>
+              median {(s.median * 100).toFixed(1)}% / {s.cellCount} cells
+            </div>
+          </div>
+
+          {/* Box plot */}
+          <div
+            style={{
+              flex: 1,
+              position: "relative",
+              height: "32px",
+              marginRight: `${BOX_TRACK_INSET_PX}px`,
+            }}
+          >
+            {/* Background track */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: 0,
+                right: 0,
+                height: "1px",
+                background: "hsl(var(--border))",
+                transform: "translateY(-50%)",
+              }}
+            />
+
+            {/* Whisker line: min to max */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.min)}%`,
+                width: `${toPercent(s.max) - toPercent(s.min)}%`,
+                height: "2px",
+                background: "var(--accent)",
+                opacity: 0.5,
+                transform: "translateY(-50%)",
+              }}
+            />
+
+            {/* Min whisker cap */}
+            <div style={whiskerCapStyle(s.min)} />
+
+            {/* Max whisker cap */}
+            <div style={whiskerCapStyle(s.max)} />
+
+            {/* IQR box: Q1 to Q3 (kept at least 0.5% wide so it stays visible) */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.q1)}%`,
+                width: `${Math.max(toPercent(s.q3) - toPercent(s.q1), 0.5)}%`,
+                height: "18px",
+                background: "var(--accent)",
+                opacity: 0.2,
+                border: "1px solid var(--accent)",
+                transform: "translateY(-50%)",
+              }}
+            />
+
+            {/* Median line */}
+            <div
+              style={{
+                position: "absolute",
+                top: "50%",
+                left: `${toPercent(s.median)}%`,
+                width: "2px",
+                height: "22px",
+                background: "var(--accent)",
+                transform: "translate(-50%, -50%)",
+              }}
+            />
+
+            {/* Individual cell dots */}
+            {s.points.map((p, i) => (
+              <div
+                key={i}
+                style={{
+                  position: "absolute",
+                  top: "50%",
+                  left: `${toPercent(p)}%`,
+                  width: "5px",
+                  height: "5px",
+                  borderRadius: "50%",
+                  background: "var(--accent)",
+                  opacity: 0.6,
+                  transform: "translate(-50%, -50%)",
+                  zIndex: 1,
+                }}
+              />
+            ))}
+          </div>
+        </div>
+      ))}
+    </div>
+  );
+}
+
+/* ---------- Section 2: Reliability Ranking ---------- */
+
+/** One row of the reliability table: a single value of a single axis. */
+interface ReliabilityRow {
+ /** Axis name, one of the entries of `AXIS_NAMES`. */
+ axis: string;
+ /** Stringified axis value shared by the grouped cells ("unknown" when the cell has none). */
+ value: string;
+ /** Mean of `cell.score.avg` over the contributing cells. */
+ avgScore: number;
+ /** Mean of `cell.score.range` over the contributing cells. */
+ avgRange: number;
+ /** Number of contributing cells. */
+ n: number;
+}
+
+/**
+ * Builds the reliability ranking: for every value of every axis, the mean
+ * cell score and the mean within-cell score range.
+ *
+ * Only cells with a positive `score.avg` contribute, and a value needs at
+ * least two contributing cells to get a row. Rows are returned sorted by
+ * ascending `avgRange` (most reliable first).
+ *
+ * Groups are kept in a `Map` rather than a plain object so that axis values
+ * such as "constructor" or "toString" cannot collide with inherited
+ * `Object.prototype` members, and so that numeric-looking values keep their
+ * first-seen order.
+ *
+ * NOTE(review): a cell with a single run presumably has `score.range` of 0,
+ * which would pull `avgRange` toward "reliable" -- confirm whether such
+ * cells should be excluded.
+ *
+ * @param cells - Cells to aggregate.
+ * @returns Reliability rows across all axes, most reliable first.
+ */
+function computeReliability(cells: Cell[]): ReliabilityRow[] {
+  const mean = (xs: number[]) => xs.reduce((a, b) => a + b, 0) / xs.length;
+  const rows: ReliabilityRow[] = [];
+
+  for (const axis of AXIS_NAMES) {
+    const groups = new Map<string, { scores: number[]; ranges: number[] }>();
+
+    for (const cell of cells) {
+      const val = String(
+        (cell.meta as Record<string, unknown>)[axis] ?? "unknown"
+      );
+      let group = groups.get(val);
+      if (group === undefined) {
+        group = { scores: [], ranges: [] };
+        groups.set(val, group);
+      }
+      if (cell.score.avg > 0) {
+        group.scores.push(cell.score.avg);
+        group.ranges.push(cell.score.range);
+      }
+    }
+
+    for (const [value, { scores, ranges }] of groups) {
+      // A single cell gives no basis for comparing within an axis value.
+      if (scores.length < 2) continue;
+      rows.push({
+        axis,
+        value,
+        avgScore: mean(scores),
+        avgRange: mean(ranges),
+        n: scores.length,
+      });
+    }
+  }
+
+  return rows.sort((a, b) => a.avgRange - b.avgRange);
+}
+
+/**
+ * Maps an average within-cell range to a traffic-light CSS colour variable.
+ *
+ * @param avgRange - Average score range (as a fraction, e.g. 0.05 = 5 points).
+ * @returns Green up to 0.05 inclusive, yellow up to 0.12 inclusive, red otherwise.
+ */
+function reliabilityColor(avgRange: number): string {
+  const tiers: Array<[number, string]> = [
+    [0.05, "var(--green)"],
+    [0.12, "var(--yellow)"],
+  ];
+  for (const [upperBound, cssColor] of tiers) {
+    if (avgRange <= upperBound) return cssColor;
+  }
+  return "var(--red)";
+}
+
+/**
+ * Table of reliability rows with a coloured bar per row.
+ *
+ * Bar widths are relative to the largest `avgRange` in the table (with a
+ * floor of 0.01), and every bar is at least 1% wide so it remains visible.
+ *
+ * @param cells - Cells to aggregate via `computeReliability`.
+ */
+function ReliabilitySection({ cells }: { cells: Cell[] }) {
+  const rows = useMemo(() => computeReliability(cells), [cells]);
+
+  if (rows.length === 0) {
+    return (
+      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
+        Not enough multi-run cells to compute reliability.
+      </div>
+    );
+  }
+
+  // Widest range in the table, never below 0.01, used as the 100% bar width.
+  const maxRange = rows.reduce(
+    (widest, r) => Math.max(widest, r.avgRange),
+    0.01
+  );
+
+  const columnHeadings = [
+    "VARIABLE",
+    "VALUE",
+    "AVG SCORE",
+    "AVG RANGE",
+    "RELIABILITY",
+  ];
+
+  // Cell style variants layered on top of the shared tdStyle.
+  const monoCell: React.CSSProperties = {
+    ...tdStyle,
+    fontFamily: "var(--font-mono)",
+  };
+  const numericCell: React.CSSProperties = {
+    ...monoCell,
+    textAlign: "right",
+  };
+
+  // Alternating row shading.
+  const stripe = (index: number) =>
+    index % 2 === 0 ? "transparent" : "hsl(var(--border) / 0.1)";
+
+  return (
+    <div style={{ overflowX: "auto" }}>
+      <table style={{ borderCollapse: "collapse", width: "100%" }}>
+        <thead>
+          <tr>
+            {columnHeadings.map((heading) => (
+              <th key={heading} style={thStyle}>
+                {heading}
+              </th>
+            ))}
+          </tr>
+        </thead>
+        <tbody>
+          {rows.map((row, index) => {
+            const tone = reliabilityColor(row.avgRange);
+            const barPercent = Math.max((row.avgRange / maxRange) * 100, 1);
+
+            return (
+              <tr
+                key={`${row.axis}-${row.value}`}
+                style={{
+                  borderBottom: "1px solid hsl(var(--border))",
+                  background: stripe(index),
+                }}
+              >
+                <td style={tdStyle}>{AXIS_LABELS[row.axis] || row.axis}</td>
+                <td style={monoCell}>{row.value}</td>
+                <td style={numericCell}>
+                  {(row.avgScore * 100).toFixed(1)}%
+                </td>
+                <td style={{ ...numericCell, color: tone }}>
+                  {(row.avgRange * 100).toFixed(1)}%
+                </td>
+                <td style={{ ...tdStyle, width: "200px" }}>
+                  <div
+                    style={{
+                      position: "relative",
+                      height: "12px",
+                      background: "hsl(var(--border) / 0.2)",
+                    }}
+                  >
+                    <div
+                      style={{
+                        position: "absolute",
+                        top: 0,
+                        left: 0,
+                        height: "100%",
+                        width: `${barPercent}%`,
+                        background: tone,
+                        opacity: 0.7,
+                      }}
+                    />
+                  </div>
+                </td>
+              </tr>
+            );
+          })}
+        </tbody>
+      </table>
+    </div>
+  );
+}
+
+/* ---------- Section 3: Variance Contribution ---------- */
+
+/** Split of run-score variance into between-cell and within-cell parts. */
+interface VarianceDecomp {
+ /** Population variance of all positive run scores. */
+ totalVariance: number;
+ /** Variance attributed to differences between runs inside the same cell. */
+ withinVariance: number;
+ /** `totalVariance` minus `withinVariance`, never below 0. */
+ betweenVariance: number;
+ /** `betweenVariance` as a fraction of `totalVariance`. */
+ betweenPct: number;
+ /** `withinVariance` as a fraction of `totalVariance`. */
+ withinPct: number;
+}
+
+/** Sum of squared deviations from the mean; `0` for fewer than two values. */
+function sumSquaredDeviations(values: number[]): number {
+  if (values.length < 2) return 0;
+  const mean = values.reduce((a, b) => a + b, 0) / values.length;
+  return values.reduce((s, v) => s + (v - mean) ** 2, 0);
+}
+
+/**
+ * Splits the variance of run scores into a between-cell part (config
+ * choices) and a within-cell part (run-to-run randomness).
+ *
+ * Uses the sum-of-squares partition: the within part is the sum of every
+ * cell's squared deviations from its own mean, divided by the total number
+ * of scored runs. Cells are therefore weighted by their run count and
+ * single-run cells contribute zero, which keeps the within part no larger
+ * than the total so the two percentages always add up to 1. (Averaging
+ * per-cell variances without weights can exceed the total variance and
+ * produce shares above 100%.)
+ *
+ * Only scores that are non-null and greater than zero are considered.
+ *
+ * @param runs - All runs; defines the total variance.
+ * @param cells - The same runs grouped into cells.
+ * @returns The decomposition, or `null` when fewer than two scored runs
+ *   exist or the total variance is zero.
+ */
+function computeVarianceDecomp(
+  runs: Run[],
+  cells: Cell[]
+): VarianceDecomp | null {
+  // All individual run scores
+  const allScores = runs
+    .map((r) => r.eval_results?.score ?? null)
+    .filter((s): s is number => s !== null && s > 0);
+
+  if (allScores.length < 2) return null;
+
+  const totalVar = sumSquaredDeviations(allScores) / allScores.length;
+  if (totalVar === 0) return null;
+
+  // Within-cell sum of squares, accumulated over every cell.
+  let withinSumSquares = 0;
+  for (const cell of cells) {
+    const scores = cell.runs
+      .map((r) => r.eval_results?.score ?? null)
+      .filter((s): s is number => s !== null && s > 0);
+    withinSumSquares += sumSquaredDeviations(scores);
+  }
+
+  // Clamp against rounding error (or cells that do not exactly partition
+  // `runs`) so the shares stay within [0, 1].
+  const withinVar = Math.min(withinSumSquares / allScores.length, totalVar);
+  const betweenVar = totalVar - withinVar;
+
+  return {
+    totalVariance: totalVar,
+    withinVariance: withinVar,
+    betweenVariance: betweenVar,
+    betweenPct: betweenVar / totalVar,
+    withinPct: withinVar / totalVar,
+  };
+}
+
+/**
+ * Stacked bar, legend and one-sentence reading of the variance split.
+ *
+ * The accent segment is the between-cell share (config choices) and the
+ * yellow segment is the within-cell share (randomness). A segment shows its
+ * percentage inside the bar only when its share is above 8%.
+ *
+ * @param runs - All runs, forwarded to `computeVarianceDecomp`.
+ * @param cells - Runs grouped into cells, forwarded to `computeVarianceDecomp`.
+ */
+function VarianceSection({ runs, cells }: { runs: Run[]; cells: Cell[] }) {
+  const decomp = useMemo(
+    () => computeVarianceDecomp(runs, cells),
+    [runs, cells]
+  );
+
+  if (!decomp) {
+    return (
+      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
+        Not enough data to decompose variance.
+      </div>
+    );
+  }
+
+  const { betweenPct, withinPct } = decomp;
+
+  // Fraction -> whole-number percent digits (without the % sign).
+  const wholePercent = (fraction: number) => (fraction * 100).toFixed(0);
+
+  // One coloured slice of the stacked bar.
+  const segmentStyle = (
+    fraction: number,
+    background: string,
+    opacity: number
+  ): React.CSSProperties => ({
+    width: `${fraction * 100}%`,
+    background,
+    opacity,
+    display: "flex",
+    alignItems: "center",
+    justifyContent: "center",
+    fontSize: "11px",
+    fontFamily: "var(--font-mono)",
+    color: "var(--text)",
+    fontWeight: 600,
+    minWidth: fraction > 0.08 ? undefined : "0px",
+    overflow: "hidden",
+    whiteSpace: "nowrap",
+  });
+
+  // Small colour square shown next to each legend entry.
+  const swatchStyle = (
+    background: string,
+    opacity: number
+  ): React.CSSProperties => ({
+    width: "12px",
+    height: "12px",
+    background,
+    opacity,
+  });
+
+  const legendItemStyle: React.CSSProperties = {
+    display: "flex",
+    alignItems: "center",
+    gap: "8px",
+  };
+
+  // Plain-language reading of the split.
+  let verdict: string;
+  if (betweenPct >= 0.5) {
+    verdict = `Configuration choices explain ${wholePercent(betweenPct)}% of score variance. The config matters more than run-to-run randomness.`;
+  } else if (betweenPct >= 0.3) {
+    verdict = `Configuration and randomness contribute roughly equally. Scores are moderately sensitive to config choices.`;
+  } else {
+    verdict = `Run-to-run randomness dominates (${wholePercent(withinPct)}%). Config choices have limited impact on scores -- results are noisy.`;
+  }
+
+  return (
+    <div>
+      <div
+        style={{
+          display: "flex",
+          height: "32px",
+          marginBottom: "12px",
+          border: "1px solid hsl(var(--border))",
+        }}
+      >
+        {/* Between-cell (config choices) */}
+        <div style={segmentStyle(betweenPct, "var(--accent)", 0.7)}>
+          {betweenPct > 0.08 && `${wholePercent(betweenPct)}%`}
+        </div>
+        {/* Within-cell (randomness) */}
+        <div style={segmentStyle(withinPct, "var(--yellow)", 0.5)}>
+          {withinPct > 0.08 && `${wholePercent(withinPct)}%`}
+        </div>
+      </div>
+
+      <div style={{ display: "flex", gap: "24px", flexWrap: "wrap" }}>
+        <div style={legendItemStyle}>
+          <div style={swatchStyle("var(--accent)", 0.7)} />
+          <span style={legendStyle}>
+            CONFIG CHOICES: {wholePercent(betweenPct)}%
+          </span>
+        </div>
+        <div style={legendItemStyle}>
+          <div style={swatchStyle("var(--yellow)", 0.5)} />
+          <span style={legendStyle}>
+            RANDOMNESS: {wholePercent(withinPct)}%
+          </span>
+        </div>
+      </div>
+
+      <p
+        style={{
+          marginTop: "12px",
+          fontSize: "12px",
+          color: "var(--text-muted)",
+          lineHeight: "1.5",
+        }}
+      >
+        {verdict}
+      </p>
+    </div>
+  );
+}
+
+/* ---------- shared styles ---------- */
+
+/** Small uppercase eyebrow line shown above each card title. */
+const sectionHeaderStyle: React.CSSProperties = {
+ fontSize: "11px",
+ fontFamily: "var(--font-mono)",
+ textTransform: "uppercase",
+ letterSpacing: "0.08em",
+ color: "var(--text-muted)",
+ marginBottom: "4px",
+};
+
+/** Card title. */
+const sectionTitleStyle: React.CSSProperties = {
+ fontSize: "16px",
+ fontWeight: 600,
+ marginBottom: "4px",
+};
+
+/** Explanatory paragraph under a card title. */
+const sectionDescStyle: React.CSSProperties = {
+ fontSize: "12px",
+ color: "var(--text-muted)",
+ marginBottom: "16px",
+ lineHeight: "1.4",
+};
+
+/** Bordered container wrapping each section (and the empty state). */
+const cardStyle: React.CSSProperties = {
+ border: "1px solid hsl(var(--border))",
+ padding: "20px",
+ marginBottom: "16px",
+ background: "var(--surface-1)",
+};
+
+/** Min/max scale labels above the box plots. */
+const axisLabelStyle: React.CSSProperties = {
+ fontSize: "10px",
+ fontFamily: "var(--font-mono)",
+ color: "var(--text-muted)",
+ textTransform: "uppercase",
+ letterSpacing: "0.06em",
+};
+
+/** Model name in a box-plot row label. */
+const labelPrimaryStyle: React.CSSProperties = {
+ fontSize: "13px",
+ fontFamily: "var(--font-mono)",
+ fontWeight: 600,
+ color: "var(--text)",
+};
+
+/** Median / cell-count line under the model name. */
+const labelSecondaryStyle: React.CSSProperties = {
+ fontSize: "10px",
+ fontFamily: "var(--font-mono)",
+ color: "var(--text-muted)",
+ marginTop: "2px",
+};
+
+/** Header cells of the reliability table. */
+const thStyle: React.CSSProperties = {
+ fontSize: "11px",
+ fontFamily: "var(--font-mono)",
+ textTransform: "uppercase",
+ letterSpacing: "0.06em",
+ color: "var(--text-muted)",
+ padding: "8px 12px",
+ textAlign: "left",
+ borderBottom: "1px solid hsl(var(--border))",
+ fontWeight: 500,
+};
+
+/** Base style for body cells of the reliability table; cells spread and extend it. */
+const tdStyle: React.CSSProperties = {
+ fontSize: "12px",
+ padding: "6px 12px",
+ color: "var(--text)",
+};
+
+/** Legend text under the variance bar. */
+const legendStyle: React.CSSProperties = {
+ fontSize: "11px",
+ fontFamily: "var(--font-mono)",
+ textTransform: "uppercase",
+ letterSpacing: "0.06em",
+ color: "var(--text-muted)",
+};
+
+/* ---------- Main Component ---------- */
+
+/**
+ * Variability analysis panel: three cards covering score distribution per
+ * model, reliability per variable value, and the variance split.
+ *
+ * Runs are grouped into cells once and the result is shared by all three
+ * sections. With no runs, a single placeholder card is rendered instead.
+ *
+ * @param runs - Runs to analyze.
+ */
+export default function Variability({ runs }: VariabilityProps) {
+  // Hook must run on every render, so it stays ahead of the early return.
+  const cells = useMemo(() => groupIntoCells(runs), [runs]);
+
+  if (runs.length === 0) {
+    const emptyCardStyle: React.CSSProperties = {
+      ...cardStyle,
+      textAlign: "center",
+      padding: "40px",
+      color: "var(--text-muted)",
+    };
+    return (
+      <div style={emptyCardStyle}>
+        No runs available for variability analysis.
+      </div>
+    );
+  }
+
+  // Card definitions, rendered top to bottom in this order.
+  const sections = [
+    {
+      header: "CONSISTENCY",
+      title: "Score Distribution by Model",
+      description:
+        "Each dot is a cell (unique config). The box spans Q1-Q3; the line " +
+        "marks the median. Tighter boxes mean more consistent results across " +
+        "configs.",
+      body: <BoxPlotSection cells={cells} />,
+    },
+    {
+      header: "RELIABILITY",
+      title: "Reliability Ranking by Variable",
+      description:
+        "How much do repeat runs of the same config vary? Sorted by average " +
+        "range (smallest = most reliable). Green means scores are consistent " +
+        "across re-runs; red means volatile.",
+      body: <ReliabilitySection cells={cells} />,
+    },
+    {
+      header: "VARIANCE",
+      title: "Variance Contribution",
+      description:
+        "ANOVA-style decomposition: how much of the total score variance comes " +
+        "from config choices (between cells) vs run-to-run randomness (within " +
+        "cells)?",
+      body: <VarianceSection runs={runs} cells={cells} />,
+    },
+  ];
+
+  return (
+    <div>
+      {sections.map((section) => (
+        <div key={section.title} style={cardStyle}>
+          <div style={sectionHeaderStyle}>{section.header}</div>
+          <div style={sectionTitleStyle}>{section.title}</div>
+          <p style={sectionDescStyle}>{section.description}</p>
+          {section.body}
+        </div>
+      ))}
+    </div>
+  );
+}
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -4,6 +4,7 @@ import { loadAllRuns } from "../lib/data";
import Insights from "../components/Insights";
import ScatterPlot from "../components/ScatterPlot";
import Surprises from "../components/Surprises";
+import Variability from "../components/Variability";
const runs = loadAllRuns();
---
@@ -11,11 +12,15 @@ const runs = loadAllRuns();
<Base title="Insights">
<h1 style="margin-bottom: 8px;">Insights</h1>
<p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
- Which variables move the needle? Where do weaker configs win?
+ Which variables move the needle? Where do weaker configs win? How consistent are the results?
</p>
<Surprises client:load runs={runs} />
+ <div style="margin-top: 32px;">
+ <Variability client:load runs={runs} />
+ </div>
+
<div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
<ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
<ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />