commit 43d776d66e46c1b135bd5b3b01aac17ded5ee2e5
parent f944fc2552eac32747f26c24e00b9d8f8cc7829f
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 09:04:27 +0200
Add scatter plots and surprise detector to insights page
Scatter plots:
- Cost vs Score: shows efficiency frontier, colored by model
- Turns vs Score: shows iteration efficiency
Surprise detector:
- Finds cases where weaker models beat stronger ones
- Finds cases where simple prompts beat detailed
- Cards with yellow left border, sorted by magnitude
- Shows both sides with score comparison
All 67 runs now have full eval results (code analysis, transcript
analysis, gameplay bot).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
3 files changed, 304 insertions(+), 4 deletions(-)
diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -0,0 +1,122 @@
+import {
+ ScatterChart,
+ Scatter,
+ XAxis,
+ YAxis,
+ CartesianGrid,
+ Tooltip,
+ ResponsiveContainer,
+ Legend,
+} from "recharts";
+import type { Run } from "../lib/data";
+
+/** Props accepted by the ScatterPlot component. */
+interface ScatterPlotProps {
+ /** Runs to plot; a run is left out when either metric extracts to null for it. */
+ runs: Run[];
+ /** Key into METRIC_CONFIG selecting the x-axis metric. */
+ xMetric: string;
+ /** Key into METRIC_CONFIG selecting the y-axis metric. */
+ yMetric: string;
+}
+
+/** How one plottable metric is labelled, read from a run, and rendered as text. */
+type MetricSpec = {
+  label: string;
+  extract: (r: Run) => number | null;
+  format: (v: number) => string;
+};
+
+/** Plottable metrics, keyed by the identifiers passed as `xMetric` / `yMetric`. */
+const METRIC_CONFIG: Record<string, MetricSpec> = {
+  cost: {
+    label: "Cost ($)",
+    extract(run) {
+      return run.claude_output?.total_cost_usd ?? null;
+    },
+    format(value) {
+      return `$${value.toFixed(2)}`;
+    },
+  },
+  score: {
+    label: "Score (%)",
+    // The stored score is multiplied by 100 so it can be shown as a percentage.
+    extract(run) {
+      const raw = run.eval_results?.score;
+      if (raw == null) return null;
+      return raw * 100;
+    },
+    format(value) {
+      return `${value.toFixed(0)}%`;
+    },
+  },
+  turns: {
+    label: "Turns",
+    extract(run) {
+      return run.claude_output?.num_turns ?? null;
+    },
+    format(value) {
+      return `${value}`;
+    },
+  },
+  wall_time: {
+    label: "Time (s)",
+    extract(run) {
+      return run.meta.wall_time_seconds ?? null;
+    },
+    format(value) {
+      return `${value}s`;
+    },
+  },
+};
+
+/** Scatter fill color per model name; ScatterPlot falls back to a default color for models not listed here. */
+const MODEL_COLORS: Record<string, string> = {
+ haiku: "hsl(193 44% 67%)", // frost cyan
+ sonnet: "hsl(40 71% 73%)", // aurora yellow
+ opus: "hsl(311 24% 63%)", // aurora purple
+};
+
+/**
+ * Scatter chart of one run metric against another, drawn as one series per model.
+ *
+ * Renders nothing when either metric key is missing from METRIC_CONFIG.
+ * Runs for which either metric extracts to null are left out of the plot.
+ */
+export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps) {
+  const xConf = METRIC_CONFIG[xMetric];
+  const yConf = METRIC_CONFIG[yMetric];
+  if (!xConf || !yConf) return null;
+
+  // One point list per model, so each model becomes its own colored series in the legend.
+  const byModel: Record<string, Array<{ x: number; y: number; run_id: string; prompt: string }>> = {};
+
+  for (const run of runs) {
+    const x = xConf.extract(run);
+    const y = yConf.extract(run);
+    if (x === null || y === null) continue;
+
+    const model = run.meta.model;
+    if (!byModel[model]) byModel[model] = [];
+    byModel[model].push({
+      x,
+      y,
+      run_id: run.meta.run_id,
+      prompt: run.meta.prompt_style,
+    });
+  }
+
+  // Sorted so series order (and therefore legend order) is stable across renders.
+  const models = Object.keys(byModel).sort();
+
+  // Both axes are declared type="number": Recharts' XAxis defaults to a
+  // category axis, which would space points evenly in data order instead of
+  // placing them on a numeric scale.
+  return (
+    <div className="card">
+      <h3 style={{ marginBottom: "16px" }}>
+        {xConf.label} vs {yConf.label}
+      </h3>
+      <ResponsiveContainer width="100%" height={350}>
+        <ScatterChart margin={{ top: 10, right: 20, bottom: 10, left: 10 }}>
+          <CartesianGrid strokeDasharray="3 3" stroke="hsl(217 17% 28%)" />
+          <XAxis
+            type="number"
+            dataKey="x"
+            name={xConf.label}
+            stroke="hsl(213 14% 65%)"
+            fontSize={11}
+            tickFormatter={(v) => xConf.format(v)}
+          />
+          <YAxis
+            type="number"
+            dataKey="y"
+            name={yConf.label}
+            stroke="hsl(213 14% 65%)"
+            fontSize={11}
+            tickFormatter={(v) => yConf.format(v)}
+          />
+          <Tooltip
+            contentStyle={{
+              background: "hsl(217 16% 15.5%)",
+              border: "1px solid hsl(217 17% 28%)",
+              borderRadius: "2px",
+              fontFamily: "'JetBrains Mono', monospace",
+              fontSize: "11px",
+            }}
+            formatter={(value: number, name: string) => {
+              // Tooltip rows are named after the axis they belong to, so the
+              // axis label picks the matching value formatter.
+              if (name === xConf.label) return [xConf.format(value), name];
+              if (name === yConf.label) return [yConf.format(value), name];
+              return [value, name];
+            }}
+          />
+          <Legend />
+          {models.map((model) => (
+            <Scatter
+              key={model}
+              name={model}
+              data={byModel[model]}
+              fill={MODEL_COLORS[model] || "hsl(213 14% 65%)"}
+            />
+          ))}
+        </ScatterChart>
+      </ResponsiveContainer>
+    </div>
+  );
+}
diff --git a/dashboard/src/components/Surprises.tsx b/dashboard/src/components/Surprises.tsx
@@ -0,0 +1,168 @@
+import type { Run } from "../lib/data";
+
+/** Props accepted by the Surprises component. */
+interface SurprisesProps {
+ /** Runs to scan for upsets; runs without an eval score are ignored by findSurprises. */
+ runs: Run[];
+}
+
+/** One detected upset: a nominally weaker configuration that outscored a stronger one. */
+interface Surprise {
+ /** Short headline shown at the top of the card. */
+ title: string;
+ /** One-line description that includes both average scores as percentages. */
+ detail: string;
+ /** The side that scored higher: the lower-ranked model, or the "simple" prompt. */
+ weaker: { model: string; config: string; score: number; cost: number };
+ /** The side that scored lower: the higher-ranked model, or the "detailed" prompt. */
+ stronger: { model: string; config: string; score: number; cost: number };
+ /** Score gap (weaker minus stronger); findSurprises sorts its results by this, largest first. */
+ magnitude: number;
+}
+
+/** Relative model strength; a lower number marks the model findSurprises treats as the weaker one of a pair. */
+const MODEL_RANK: Record<string, number> = {
+ haiku: 1,
+ sonnet: 2,
+ opus: 3,
+};
+
+/** Arithmetic mean of `values`; an empty list yields 0. */
+function mean(values: number[]): number {
+  if (values.length === 0) return 0;
+  return values.reduce((sum, v) => sum + v, 0) / values.length;
+}
+
+/** Scores and costs collected for one variant (a model or a prompt style) within a group. */
+interface VariantSamples {
+  scores: number[];
+  costs: number[];
+}
+
+/** Runs that share a settings key, split by the single variable being compared. */
+interface ComparisonGroup {
+  /** First run seen for the group; supplies the settings every member shares. */
+  first: Run;
+  variants: Map<string, VariantSamples>;
+}
+
+/**
+ * Buckets scored runs by `groupKey`, then by `variantKey` inside each group.
+ * Runs without an eval score are skipped. Groups and variants keep the order
+ * in which they were first seen.
+ */
+function groupScoredRuns(
+  runs: Run[],
+  groupKey: (run: Run) => string,
+  variantKey: (run: Run) => string,
+): ComparisonGroup[] {
+  const groups = new Map<string, ComparisonGroup>();
+  for (const run of runs) {
+    const score = run.eval_results?.score;
+    if (score == null) continue;
+
+    const key = groupKey(run);
+    let group = groups.get(key);
+    if (!group) {
+      group = { first: run, variants: new Map<string, VariantSamples>() };
+      groups.set(key, group);
+    }
+
+    const variant = variantKey(run);
+    let samples = group.variants.get(variant);
+    if (!samples) {
+      samples = { scores: [], costs: [] };
+      group.variants.set(variant, samples);
+    }
+
+    samples.scores.push(score);
+    // Cost is optional; a variant with no cost data averages to 0.
+    const cost = run.claude_output?.total_cost_usd;
+    if (cost != null) samples.costs.push(cost);
+  }
+  return Array.from(groups.values());
+}
+
+/**
+ * Finds upsets among the scored runs, largest score gap first.
+ *
+ * Two kinds are reported:
+ * - a lower-ranked model (per MODEL_RANK) whose average score beats a
+ *   higher-ranked one by more than 0.02 under otherwise identical settings;
+ * - the "simple" prompt style whose average score beats "detailed" by more
+ *   than 0.05 for the same model under otherwise identical settings.
+ *
+ * Models missing from MODEL_RANK are never compared: without a rank there is
+ * no basis for calling either side the weaker one.
+ *
+ * @param runs Runs to analyse; those without an eval score are ignored.
+ * @returns Surprises sorted by descending `magnitude`.
+ */
+function findSurprises(runs: Run[]): Surprise[] {
+  const MODEL_MARGIN = 0.02;
+  const PROMPT_MARGIN = 0.05;
+  const pct = (score: number) => (score * 100).toFixed(0);
+
+  // Every setting other than model and prompt style. Both comparisons hold all
+  // of these fixed so the only thing varying inside a group is the variable
+  // under test.
+  const otherSettings = (run: Run) => {
+    const m = run.meta;
+    return [
+      m.language, m.effort,
+      m.linter, m.playwright, m.context_file,
+      m.sub_agents, m.web_search, m.max_budget,
+    ];
+  };
+
+  const surprises: Surprise[] = [];
+
+  // Model comparison: same prompt style and settings, different model.
+  const modelGroups = groupScoredRuns(
+    runs,
+    (run) => [run.meta.prompt_style, ...otherSettings(run)].join("|"),
+    (run) => run.meta.model,
+  );
+  for (const { first, variants } of modelGroups) {
+    const ranked: Array<{ model: string; rank: number; samples: VariantSamples }> = [];
+    for (const [model, samples] of variants) {
+      const rank: unknown = MODEL_RANK[model];
+      if (typeof rank === "number") ranked.push({ model, rank, samples });
+    }
+
+    ranked.forEach((a, i) => {
+      for (const b of ranked.slice(i + 1)) {
+        if (a.rank === b.rank) continue;
+        const aIsWeaker = a.rank < b.rank;
+        const weaker = aIsWeaker ? a : b;
+        const stronger = aIsWeaker ? b : a;
+
+        const weakerScore = mean(weaker.samples.scores);
+        const strongerScore = mean(stronger.samples.scores);
+        if (weakerScore > strongerScore + MODEL_MARGIN) {
+          // prompt_style is part of the group key, so `first` speaks for the whole group.
+          const config = first.meta.prompt_style;
+          surprises.push({
+            title: `${weaker.model} beat ${stronger.model}`,
+            detail: `${weaker.model} scored ${pct(weakerScore)}% vs ${stronger.model} at ${pct(strongerScore)}%`,
+            weaker: { model: weaker.model, config, score: weakerScore, cost: mean(weaker.samples.costs) },
+            stronger: { model: stronger.model, config, score: strongerScore, cost: mean(stronger.samples.costs) },
+            magnitude: weakerScore - strongerScore,
+          });
+        }
+      }
+    });
+  }
+
+  // Prompt comparison: same model and settings, "simple" vs "detailed" prompt.
+  const promptGroups = groupScoredRuns(
+    runs,
+    (run) => [run.meta.model, ...otherSettings(run)].join("|"),
+    (run) => run.meta.prompt_style,
+  );
+  for (const { first, variants } of promptGroups) {
+    const simple = variants.get("simple");
+    const detailed = variants.get("detailed");
+    if (!simple || !detailed) continue;
+
+    const simpleScore = mean(simple.scores);
+    const detailedScore = mean(detailed.scores);
+    if (simpleScore > detailedScore + PROMPT_MARGIN) {
+      // model is part of the group key, so `first` speaks for the whole group.
+      const model = first.meta.model;
+      surprises.push({
+        title: "Simple prompt beat detailed",
+        detail: `${model}: simple scored ${pct(simpleScore)}% vs detailed at ${pct(detailedScore)}%`,
+        weaker: { model, config: "simple", score: simpleScore, cost: mean(simple.costs) },
+        stronger: { model, config: "detailed", score: detailedScore, cost: mean(detailed.costs) },
+        magnitude: simpleScore - detailedScore,
+      });
+    }
+  }
+
+  return surprises.sort((a, b) => b.magnitude - a.magnitude);
+}
+
+/**
+ * Grid of "surprise" cards: cases found by findSurprises where a weaker
+ * configuration outscored a stronger one. Shows a placeholder card when
+ * there are none.
+ */
+export default function Surprises({ runs }: SurprisesProps) {
+  const surprises = findSurprises(runs);
+
+  if (surprises.length === 0) {
+    return (
+      <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
+        No surprises yet. Run more experiments with different models to find upsets.
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      <h3 style={{ marginBottom: "12px" }}>Surprises</h3>
+      <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+        Where weaker configs outperformed stronger ones
+      </p>
+      <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
+        {surprises.map((s, i) => {
+          // Prompt-style surprises pit a model against itself, so the model
+          // name cannot tell the two sides apart; label them by config instead.
+          const sameModel = s.weaker.model === s.stronger.model;
+          const winnerLabel = sameModel ? s.weaker.config : s.weaker.model;
+          const loserLabel = sameModel ? s.stronger.config : s.stronger.model;
+
+          return (
+            <div key={i} className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)" }}>
+              <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+                {s.title}
+              </div>
+              <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
+                {s.detail}
+              </div>
+              <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
+                <div>
+                  <span style={{ color: "var(--green)" }}>{winnerLabel}</span>
+                  <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+                    {(s.weaker.score * 100).toFixed(0)}%
+                  </span>
+                </div>
+                <div style={{ color: "var(--text-muted)" }}>vs</div>
+                <div>
+                  <span style={{ color: "var(--red)" }}>{loserLabel}</span>
+                  <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+                    {(s.stronger.score * 100).toFixed(0)}%
+                  </span>
+                </div>
+              </div>
+            </div>
+          );
+        })}
+      </div>
+    </div>
+  );
+}
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -2,16 +2,26 @@
import Base from "../layouts/Base.astro";
import { loadAllRuns } from "../lib/data";
import Insights from "../components/Insights";
+import ScatterPlot from "../components/ScatterPlot";
+import Surprises from "../components/Surprises";
const runs = loadAllRuns();
---
<Base title="Insights">
<h1 style="margin-bottom: 8px;">Insights</h1>
- <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
- Which variables actually move the needle? Tornado charts show main effects,
- heatmaps reveal interactions.
+ <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
+ Which variables move the needle? Where do weaker configs win?
</p>
- <Insights client:load runs={runs} />
+ <Surprises client:load runs={runs} />
+
+ <div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+ <ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
+ <ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />
+ </div>
+
+ <div style="margin-top: 32px;">
+ <Insights client:load runs={runs} />
+ </div>
</Base>