Add scatter plots and surprise detector to insights page - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 43d776d66e46c1b135bd5b3b01aac17ded5ee2e5
parent f944fc2552eac32747f26c24e00b9d8f8cc7829f
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 09:04:27 +0200

Add scatter plots and surprise detector to insights page

Scatter plots:
- Cost vs Score: shows efficiency frontier, colored by model
- Turns vs Score: shows iteration efficiency

Surprise detector:
- Finds cases where weaker models beat stronger ones
- Finds cases where simple prompts beat detailed ones
- Cards with yellow left border, sorted by magnitude
- Shows both sides with score comparison

All 67 runs now have full eval results (code analysis, transcript
analysis, gameplay bot).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/ScatterPlot.tsx  | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A dashboard/src/components/Surprises.tsx  | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/insights.astro  | 18 ++++++++++++++----

3 files changed, 304 insertions(+), 4 deletions(-)
diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -0,0 +1,122 @@
+import {
+  ScatterChart,
+  Scatter,
+  XAxis,
+  YAxis,
+  CartesianGrid,
+  Tooltip,
+  ResponsiveContainer,
+  Legend,
+} from "recharts";
+import type { Run } from "../lib/data";
+
+interface ScatterPlotProps { // Props accepted by the ScatterPlot component.
+  runs: Run[]; // Runs to plot; a run for which either metric extracts to null is left out.
+  xMetric: string; // METRIC_CONFIG key for the horizontal axis; an unknown key makes the component render nothing.
+  yMetric: string; // METRIC_CONFIG key for the vertical axis; an unknown key makes the component render nothing.
+}
+
+/** How a single plottable metric is labelled, read from a run, and formatted for display. */
+type MetricSpec = {
+  label: string;
+  extract: (r: Run) => number | null;
+  format: (v: number) => string;
+};
+
+// Metrics selectable through the `xMetric` / `yMetric` props, keyed by metric id.
+// Every `extract` yields null when the run does not carry that value.
+const METRIC_CONFIG: Record<string, MetricSpec> = {
+  cost: {
+    label: "Cost ($)",
+    extract: (run) => {
+      return run.claude_output?.total_cost_usd ?? null;
+    },
+    format: (usd) => {
+      return "$" + usd.toFixed(2);
+    },
+  },
+  score: {
+    label: "Score (%)",
+    // The stored score is scaled by 100 so it reads as a percentage.
+    extract: (run) => {
+      const fraction = run.eval_results?.score;
+      return fraction != null ? fraction * 100 : null;
+    },
+    format: (percent) => {
+      return percent.toFixed(0) + "%";
+    },
+  },
+  turns: {
+    label: "Turns",
+    extract: (run) => {
+      return run.claude_output?.num_turns ?? null;
+    },
+    format: (count) => {
+      return String(count);
+    },
+  },
+  wall_time: {
+    label: "Time (s)",
+    extract: (run) => {
+      return run.meta.wall_time_seconds ?? null;
+    },
+    format: (seconds) => {
+      return String(seconds) + "s";
+    },
+  },
+};
+
+const MODEL_COLORS: Record<string, string> = { // Series fill colour per model name; ScatterPlot uses a fallback colour for any model not listed here.
+  haiku: "hsl(193 44% 67%)",   // frost cyan
+  sonnet: "hsl(40 71% 73%)",   // aurora yellow
+  opus: "hsl(311 24% 63%)",    // aurora purple
+};
+
+/**
+ * Scatter chart of one run metric against another, drawn as one series per model.
+ *
+ * `xMetric` and `yMetric` are looked up in METRIC_CONFIG; when either key is
+ * unknown the component renders nothing. Runs for which either metric
+ * extracts to null are left out of the plot.
+ */
+export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps) {
+  const xConf = METRIC_CONFIG[xMetric];
+  const yConf = METRIC_CONFIG[yMetric];
+  if (!xConf || !yConf) return null;
+
+  // Group the plottable points by model so each model becomes its own series.
+  const byModel: Record<string, Array<{ x: number; y: number; run_id: string; prompt: string }>> = {};
+
+  for (const run of runs) {
+    const x = xConf.extract(run);
+    const y = yConf.extract(run);
+    if (x === null || y === null) continue;
+
+    const model = run.meta.model;
+    if (!byModel[model]) byModel[model] = [];
+    byModel[model].push({
+      x,
+      y,
+      run_id: run.meta.run_id,
+      prompt: run.meta.prompt_style,
+    });
+  }
+
+  // Sorted so the series (and legend entries) come out in a stable order.
+  const models = Object.keys(byModel).sort();
+
+  return (
+    <div className="card">
+      <h3 style={{ marginBottom: "16px" }}>
+        {xConf.label} vs {yConf.label}
+      </h3>
+      <ResponsiveContainer width="100%" height={350}>
+        <ScatterChart margin={{ top: 10, right: 20, bottom: 10, left: 10 }}>
+          <CartesianGrid strokeDasharray="3 3" stroke="hsl(217 17% 28%)" />
+          {/* XAxis defaults to type="category", which would space the points
+              evenly in data order instead of positioning them by value. */}
+          <XAxis
+            type="number"
+            dataKey="x"
+            name={xConf.label}
+            stroke="hsl(213 14% 65%)"
+            fontSize={11}
+            tickFormatter={(v) => xConf.format(v)}
+          />
+          <YAxis
+            type="number"
+            dataKey="y"
+            name={yConf.label}
+            stroke="hsl(213 14% 65%)"
+            fontSize={11}
+            tickFormatter={(v) => yConf.format(v)}
+          />
+          <Tooltip
+            contentStyle={{
+              background: "hsl(217 16% 15.5%)",
+              border: "1px solid hsl(217 17% 28%)",
+              borderRadius: "2px",
+              fontFamily: "'JetBrains Mono', monospace",
+              fontSize: "11px",
+            }}
+            formatter={(value: number, name: string) => {
+              // Entries are identified by the axis `name`, so the axis label
+              // decides which metric formatter applies.
+              if (name === xConf.label) return [xConf.format(value), name];
+              if (name === yConf.label) return [yConf.format(value), name];
+              return [value, name];
+            }}
+          />
+          <Legend />
+          {models.map((model) => (
+            <Scatter
+              key={model}
+              name={model}
+              data={byModel[model]}
+              fill={MODEL_COLORS[model] ?? "hsl(213 14% 65%)"}
+            />
+          ))}
+        </ScatterChart>
+      </ResponsiveContainer>
+    </div>
+  );
+}
diff --git a/dashboard/src/components/Surprises.tsx b/dashboard/src/components/Surprises.tsx
@@ -0,0 +1,168 @@
+import type { Run } from "../lib/data";
+
+interface SurprisesProps { // Props accepted by the Surprises component.
+  runs: Run[]; // All runs to scan; runs without an eval score are ignored by findSurprises.
+}
+
+interface Surprise { // One upset found by findSurprises: the side expected to lose had the higher average score.
+  title: string; // Short headline, e.g. which side beat which.
+  detail: string; // One-line summary with both average scores as rounded percentages.
+  weaker: { model: string; config: string; score: number; cost: number }; // The side that won despite being expected to lose; score is an average (multiplied by 100 for display), cost an average total_cost_usd or 0.
+  stronger: { model: string; config: string; score: number; cost: number }; // The side that was expected to win; same field meanings as `weaker`.
+  magnitude: number; // Winner's average score minus the loser's; results are sorted by this, largest first.
+}
+
+const MODEL_RANK: Record<string, number> = { // Relative model strength used by findSurprises; a lower number is treated as the weaker model.
+  haiku: 1,
+  sonnet: 2,
+  opus: 3,
+};
+
+/** Average-score lead a lower-ranked model needs over a higher-ranked one to count as an upset. */
+const MODEL_UPSET_MARGIN = 0.02;
+/** Average-score lead the "simple" prompt needs over "detailed" to count as an upset. */
+const PROMPT_UPSET_MARGIN = 0.05;
+
+/** Scores and costs collected for one label (a model or a prompt style) within a group. */
+interface ScoreTally {
+  scores: number[];
+  costs: number[];
+}
+
+/** Arithmetic mean of `values`; 0 for an empty list. */
+function mean(values: number[]): number {
+  return values.length > 0 ? values.reduce((sum, v) => sum + v, 0) / values.length : 0;
+}
+
+/** Grouping key that holds every configuration variable fixed except the one named by `vary`. */
+function controlKey(run: Run, vary: "model" | "prompt_style"): string {
+  const m = run.meta;
+  return [
+    vary === "model" ? "" : m.model,
+    vary === "prompt_style" ? "" : m.prompt_style,
+    m.language, m.effort,
+    m.linter, m.playwright, m.context_file,
+    m.sub_agents, m.web_search, m.max_budget,
+  ].join("|");
+}
+
+/** Buckets the runs that have an eval score by `controlKey`, keeping first-seen order. */
+function groupScoredRuns(runs: Run[], vary: "model" | "prompt_style"): Run[][] {
+  const groups = new Map<string, Run[]>();
+  for (const run of runs) {
+    if (run.eval_results?.score == null) continue;
+    const key = controlKey(run, vary);
+    const bucket = groups.get(key);
+    if (bucket) bucket.push(run);
+    else groups.set(key, [run]);
+  }
+  return [...groups.values()];
+}
+
+/** Collects scores (and costs, where recorded) of the scored runs in `group`, per label. */
+function tallyBy(group: Run[], labelOf: (run: Run) => string): Map<string, ScoreTally> {
+  const tallies = new Map<string, ScoreTally>();
+  for (const run of group) {
+    const score = run.eval_results?.score;
+    if (score == null) continue;
+    const label = labelOf(run);
+    let tally = tallies.get(label);
+    if (!tally) {
+      tally = { scores: [], costs: [] };
+      tallies.set(label, tally);
+    }
+    tally.scores.push(score);
+    const cost = run.claude_output?.total_cost_usd;
+    if (cost != null) tally.costs.push(cost);
+  }
+  return tallies;
+}
+
+/**
+ * Finds upsets among the scored runs.
+ *
+ * Two kinds are reported:
+ * - a lower-ranked model (per MODEL_RANK) whose average score exceeds a
+ *   higher-ranked model's by more than MODEL_UPSET_MARGIN, with every other
+ *   configuration variable held equal;
+ * - the "simple" prompt style averaging more than PROMPT_UPSET_MARGIN above
+ *   "detailed", with the model and every other configuration variable held equal.
+ *
+ * Runs without an eval score are ignored. Models that have no MODEL_RANK
+ * entry are never compared, because their relative strength is unknown.
+ *
+ * @param runs All runs to scan.
+ * @returns The surprises found, sorted by score gap, largest first.
+ */
+function findSurprises(runs: Run[]): Surprise[] {
+  const surprises: Surprise[] = [];
+  const pct = (score: number): string => `${(score * 100).toFixed(0)}%`;
+
+  // Pass 1: same configuration, different models.
+  for (const group of groupScoredRuns(runs, "model")) {
+    const config = group[0].meta.prompt_style;
+    const entries = [...tallyBy(group, (run) => run.meta.model)];
+
+    for (let i = 0; i < entries.length; i++) {
+      for (let j = i + 1; j < entries.length; j++) {
+        const rankI = MODEL_RANK[entries[i][0]];
+        const rankJ = MODEL_RANK[entries[j][0]];
+        // An unranked model cannot be called weaker or stronger; equal ranks give no upset.
+        if (typeof rankI !== "number" || typeof rankJ !== "number" || rankI === rankJ) continue;
+
+        const [weakModel, weakTally] = rankI < rankJ ? entries[i] : entries[j];
+        const [strongModel, strongTally] = rankI < rankJ ? entries[j] : entries[i];
+        const weakScore = mean(weakTally.scores);
+        const strongScore = mean(strongTally.scores);
+
+        if (weakScore > strongScore + MODEL_UPSET_MARGIN) {
+          surprises.push({
+            title: `${weakModel} beat ${strongModel}`,
+            detail: `${weakModel} scored ${pct(weakScore)} vs ${strongModel} at ${pct(strongScore)}`,
+            weaker: { model: weakModel, config, score: weakScore, cost: mean(weakTally.costs) },
+            stronger: { model: strongModel, config, score: strongScore, cost: mean(strongTally.costs) },
+            magnitude: weakScore - strongScore,
+          });
+        }
+      }
+    }
+  }
+
+  // Pass 2: same model and configuration, simple vs detailed prompt.
+  for (const group of groupScoredRuns(runs, "prompt_style")) {
+    const byPrompt = tallyBy(group, (run) => run.meta.prompt_style);
+    const simple = byPrompt.get("simple");
+    const detailed = byPrompt.get("detailed");
+    if (!simple || !detailed) continue;
+
+    const avgSimple = mean(simple.scores);
+    const avgDetailed = mean(detailed.scores);
+    if (avgSimple > avgDetailed + PROMPT_UPSET_MARGIN) {
+      const model = group[0].meta.model;
+      surprises.push({
+        title: "Simple prompt beat detailed",
+        detail: `${model}: simple scored ${pct(avgSimple)} vs detailed at ${pct(avgDetailed)}`,
+        weaker: { model, config: "simple", score: avgSimple, cost: mean(simple.costs) },
+        stronger: { model, config: "detailed", score: avgDetailed, cost: mean(detailed.costs) },
+        magnitude: avgSimple - avgDetailed,
+      });
+    }
+  }
+
+  return surprises.sort((a, b) => b.magnitude - a.magnitude);
+}
+
+/** One side of a surprise card: a coloured label followed by its average score as a rounded percentage. */
+function SurpriseSide({ label, score, color }: { label: string; score: number; color: string }) {
+  return (
+    <div>
+      <span style={{ color }}>{label}</span>
+      <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+        {(score * 100).toFixed(0)}%
+      </span>
+    </div>
+  );
+}
+
+/**
+ * Renders the upsets found by findSurprises as a grid of cards, or a
+ * placeholder message when there are none.
+ *
+ * Each card shows the title, the one-line detail, and both sides with their
+ * scores. The sides are labelled by model; when both sides share the same
+ * model (a prompt-style comparison) they are labelled by config instead, so
+ * the two labels are never identical.
+ */
+export default function Surprises({ runs }: SurprisesProps) {
+  const surprises = findSurprises(runs);
+
+  if (surprises.length === 0) {
+    return (
+      <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
+        No surprises yet. Run more experiments with different models to find upsets.
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      <h3 style={{ marginBottom: "12px" }}>Surprises</h3>
+      <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+        Where weaker configs outperformed stronger ones
+      </p>
+      <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
+        {surprises.map((s, i) => {
+          // Same model on both sides means the config is what differs.
+          const sameModel = s.weaker.model === s.stronger.model;
+          const weakerLabel = sameModel ? s.weaker.config : s.weaker.model;
+          const strongerLabel = sameModel ? s.stronger.config : s.stronger.model;
+          return (
+            <div key={i} className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)" }}>
+              <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+                {s.title}
+              </div>
+              <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
+                {s.detail}
+              </div>
+              <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
+                <SurpriseSide label={weakerLabel} score={s.weaker.score} color="var(--green)" />
+                <div style={{ color: "var(--text-muted)" }}>vs</div>
+                <SurpriseSide label={strongerLabel} score={s.stronger.score} color="var(--red)" />
+              </div>
+            </div>
+          );
+        })}
+      </div>
+    </div>
+  );
+}
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -2,16 +2,26 @@
 import Base from "../layouts/Base.astro";
 import { loadAllRuns } from "../lib/data";
 import Insights from "../components/Insights";
+import ScatterPlot from "../components/ScatterPlot";
+import Surprises from "../components/Surprises";
 
 const runs = loadAllRuns();
 ---
 
 <Base title="Insights">
   <h1 style="margin-bottom: 8px;">Insights</h1>
-  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
-    Which variables actually move the needle? Tornado charts show main effects,
-    heatmaps reveal interactions.
+  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
+    Which variables move the needle? Where do weaker configs win?
   </p>
 
-  <Insights client:load runs={runs} />
+  <Surprises client:load runs={runs} />
+
+  <div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+    <ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
+    <ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />
+  </div>
+
+  <div style="margin-top: 32px;">
+    <Insights client:load runs={runs} />
+  </div>
 </Base>

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

A	dashboard/src/components/ScatterPlot.tsx	\|	122	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	dashboard/src/components/Surprises.tsx	\|	168	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	dashboard/src/pages/insights.astro	\|	18	++++++++++++++----