commit 85f722fa4936da91e87193ac60a407948670d8c6
parent 1964a38e54bd15acd879301fb499d1ccea864778
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 07:32:10 +0200
Surprise cards now clickable with run details and outlier detection
Click a surprise card to see:
- Which config dimensions differed
- Every run involved with model, score, cost, and link to run page
New outlier detection:
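- Flags individual runs of models ranked above haiku (e.g. sonnet/opus) that scored more than 15 points below the haiku average
- Each flagged run lists its config values that differ from the assumed defaults, hinting at which settings may have caused the poor performance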
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 198 insertions(+), 63 deletions(-)
diff --git a/dashboard/src/components/Surprises.tsx b/dashboard/src/components/Surprises.tsx
@@ -1,15 +1,26 @@
+import { useState } from "react";
import type { Run } from "../lib/types";
interface SurprisesProps {
runs: Run[];
}
+interface RunRef { // Flattened per-run record attached to a Surprise and rendered as one row of the expanded card
+ run_id: string; // copied from run.meta.run_id; used as the React key and in the /run/<id> link
+ model: string; // model name the run is listed under
+ score: number; // eval_results.score; displayed as score * 100 percent
+ cost: number; // claude_output.total_cost_usd, or 0 when that value is absent
+ config: Record<string, string>; // one entry per CONFIG_KEYS item, each value passed through String()
+}
+
interface Surprise {
title: string;
detail: string;
weaker: { model: string; config: string; score: number; cost: number };
stronger: { model: string; config: string; score: number; cost: number };
magnitude: number;
+ runs: RunRef[]; // runs behind this surprise; listed row by row when the card is expanded
+ configDiffs: string[]; // preformatted lines shown under "Config differences"; the section is hidden when empty
}
const MODEL_RANK: Record<string, number> = {
@@ -18,6 +29,34 @@ const MODEL_RANK: Record<string, number> = {
opus: 3,
};
+const CONFIG_KEYS = [ // run.meta fields treated as configuration: they drive config-group keys, per-run config maps, and config diffs
+ "prompt_style", "language", "effort", "human_language",
+ "linter", "playwright", "context_file", "sub_agents",
+ "web_search", "max_budget", "tool_read", "tool_write",
+ "tool_edit", "tool_glob", "tool_grep",
+];
+
+function getConfigKey(meta: Run["meta"]): string { // Builds a "key=value|key=value" signature over CONFIG_KEYS. NOTE(review): no call site is visible in this diff — confirm it is used
+ return CONFIG_KEYS.map(k => `${k}=${(meta as Record<string, unknown>)[k]}`).join("|"); // a key absent from meta is interpolated as the text "undefined"
+}
+
+function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] { // Returns one "key: a vs b" line per CONFIG_KEYS entry whose value differs between the two groups
+ // Find which config values differ between the two groups (only the first run of each group is inspected)
+ const diffs: string[] = [];
+ const metaA = runsA[0]?.meta; // NOTE(review): assumes every run in a group shares one config — confirm with callers
+ const metaB = runsB[0]?.meta;
+ if (!metaA || !metaB) return diffs; // an empty group (or a first run without meta) yields no diffs
+
+ for (const key of CONFIG_KEYS) {
+ const va = String((metaA as Record<string, unknown>)[key]); // values are compared by their String() form, so a missing key reads as "undefined"
+ const vb = String((metaB as Record<string, unknown>)[key]);
+ if (va !== vb) {
+ diffs.push(`${key}: ${va} vs ${vb}`);
+ }
+ }
+ return diffs; // NOTE(review): the visible caller passes two model groups taken from one bucket keyed on these same CONFIG_KEYS, so this likely returns [] there — verify
+}
+
function findSurprises(runs: Run[]): Surprise[] {
const surprises: Surprise[] = [];
@@ -25,28 +64,16 @@ function findSurprises(runs: Run[]): Surprise[] {
const configGroups: Record<string, Run[]> = {};
for (const run of runs) {
if (run.eval_results?.score == null) continue;
- // Build config key without model
const m = run.meta;
- const key = [
- m.prompt_style, m.language, m.effort,
- m.linter, m.playwright, m.context_file,
- m.sub_agents, m.web_search, m.max_budget,
- ].join("|");
+ const key = CONFIG_KEYS.map(k => (m as Record<string, unknown>)[k]).join("|");
(configGroups[key] ??= []).push(run);
}
// Within each config group, compare models
for (const [, group] of Object.entries(configGroups)) {
- const byModel: Record<string, { scores: number[]; costs: number[] }> = {};
+ const byModel: Record<string, Run[]> = {};
for (const run of group) {
- const model = run.meta.model;
- if (!byModel[model]) byModel[model] = { scores: [], costs: [] };
- if (run.eval_results?.score != null) {
- byModel[model].scores.push(run.eval_results.score);
- }
- if (run.claude_output?.total_cost_usd != null) {
- byModel[model].costs.push(run.claude_output.total_cost_usd);
- }
+ (byModel[run.meta.model] ??= []).push(run);
}
const models = Object.keys(byModel);
@@ -57,60 +84,127 @@ function findSurprises(runs: Run[]): Surprise[] {
const rankA = MODEL_RANK[a] || 0;
const rankB = MODEL_RANK[b] || 0;
- const avgScoreA = byModel[a].scores.reduce((s, v) => s + v, 0) / byModel[a].scores.length;
- const avgScoreB = byModel[b].scores.reduce((s, v) => s + v, 0) / byModel[b].scores.length;
- const avgCostA = byModel[a].costs.length > 0 ? byModel[a].costs.reduce((s, v) => s + v, 0) / byModel[a].costs.length : 0;
- const avgCostB = byModel[b].costs.length > 0 ? byModel[b].costs.reduce((s, v) => s + v, 0) / byModel[b].costs.length : 0;
+ const runsA = byModel[a];
+ const runsB = byModel[b];
+ const scoresA = runsA.map(r => r.eval_results!.score!);
+ const scoresB = runsB.map(r => r.eval_results!.score!);
+ const avgA = scoresA.reduce((s, v) => s + v, 0) / scoresA.length;
+ const avgB = scoresB.reduce((s, v) => s + v, 0) / scoresB.length;
+
+ const costsA = runsA.map(r => r.claude_output?.total_cost_usd ?? 0);
+ const costsB = runsB.map(r => r.claude_output?.total_cost_usd ?? 0);
+ const avgCostA = costsA.reduce((s, v) => s + v, 0) / costsA.length;
+ const avgCostB = costsB.reduce((s, v) => s + v, 0) / costsB.length;
- // Surprise: weaker model scores higher
- if (rankA < rankB && avgScoreA > avgScoreB + 0.02) {
+ const allRuns = [
+ ...runsA.map(r => ({
+ run_id: r.meta.run_id, model: a,
+ score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+ config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+ })),
+ ...runsB.map(r => ({
+ run_id: r.meta.run_id, model: b,
+ score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+ config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+ })),
+ ];
+
+ if (rankA < rankB && avgA > avgB + 0.02) {
surprises.push({
title: `${a} beat ${b}`,
- detail: `${a} scored ${(avgScoreA * 100).toFixed(0)}% vs ${b} at ${(avgScoreB * 100).toFixed(0)}%`,
- weaker: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA },
- stronger: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB },
- magnitude: avgScoreA - avgScoreB,
+ detail: `${a} scored ${(avgA * 100).toFixed(0)}% vs ${b} at ${(avgB * 100).toFixed(0)}%`,
+ weaker: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA },
+ stronger: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB },
+ magnitude: avgA - avgB,
+ runs: allRuns,
+ configDiffs: getConfigDiffs(runsA, runsB),
});
- } else if (rankB < rankA && avgScoreB > avgScoreA + 0.02) {
+ } else if (rankB < rankA && avgB > avgA + 0.02) {
surprises.push({
title: `${b} beat ${a}`,
- detail: `${b} scored ${(avgScoreB * 100).toFixed(0)}% vs ${a} at ${(avgScoreA * 100).toFixed(0)}%`,
- weaker: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB },
- stronger: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA },
- magnitude: avgScoreB - avgScoreA,
+ detail: `${b} scored ${(avgB * 100).toFixed(0)}% vs ${a} at ${(avgA * 100).toFixed(0)}%`,
+ weaker: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB },
+ stronger: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA },
+ magnitude: avgB - avgA,
+ runs: allRuns,
+ configDiffs: getConfigDiffs(runsB, runsA),
});
}
}
}
}
- // Also find cases where simple prompt beats detailed
+ // Find individual outlier runs where a higher-ranked model (rank > 1) scored far below the haiku average
+ const haikuScores = runs.filter(r => r.meta.model === "haiku" && r.eval_results?.score != null).map(r => r.eval_results!.score!);
+ const haikuMean = haikuScores.length > 0 ? haikuScores.reduce((a, b) => a + b, 0) / haikuScores.length : 0;
+
+ for (const run of runs) {
+ if (run.eval_results?.score == null) continue;
+ const model = run.meta.model;
+ const score = run.eval_results.score;
+ const rank = MODEL_RANK[model] || 0;
+
+ // Flag if a "stronger" model scored significantly below haiku average
+ if (rank > 1 && score < haikuMean - 0.15) {
+ surprises.push({
+ title: `${model} run scored far below haiku avg`,
+ detail: `This ${model} run scored ${(score * 100).toFixed(0)}% vs haiku average of ${(haikuMean * 100).toFixed(0)}%`,
+ weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 },
+ stronger: { model, config: run.meta.prompt_style, score, cost: run.claude_output?.total_cost_usd ?? 0 },
+ magnitude: haikuMean - score,
+ runs: [{
+ run_id: run.meta.run_id, model,
+ score, cost: run.claude_output?.total_cost_usd ?? 0,
+ config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((run.meta as Record<string, unknown>)[k])])),
+ }],
+ configDiffs: CONFIG_KEYS.filter(k => {
+ const v = String((run.meta as Record<string, unknown>)[k]);
+ return v !== "on" && v !== "typescript" && v !== "en" && v !== "high" && v !== "simple";
+ }).map(k => `${k}: ${(run.meta as Record<string, unknown>)[k]}`),
+ });
+ }
+ }
+
+ // Simple prompt beats detailed
const promptGroups: Record<string, Run[]> = {};
for (const run of runs) {
if (run.eval_results?.score == null) continue;
const m = run.meta;
- const key = [
- m.model, m.language, m.effort,
- m.linter, m.playwright, m.context_file,
- ].join("|");
+ const key = [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|");
(promptGroups[key] ??= []).push(run);
}
for (const [, group] of Object.entries(promptGroups)) {
- const byPrompt: Record<string, number[]> = {};
+ const byPrompt: Record<string, Run[]> = {};
for (const run of group) {
- (byPrompt[run.meta.prompt_style] ??= []).push(run.eval_results!.score!);
+ (byPrompt[run.meta.prompt_style] ??= []).push(run);
}
if (byPrompt.simple && byPrompt.detailed) {
- const avgSimple = byPrompt.simple.reduce((a, b) => a + b, 0) / byPrompt.simple.length;
- const avgDetailed = byPrompt.detailed.reduce((a, b) => a + b, 0) / byPrompt.detailed.length;
+ const simpleRuns = byPrompt.simple;
+ const detailedRuns = byPrompt.detailed;
+ const avgSimple = simpleRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / simpleRuns.length;
+ const avgDetailed = detailedRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / detailedRuns.length;
if (avgSimple > avgDetailed + 0.05) {
+ const allRuns = [
+ ...simpleRuns.map(r => ({
+ run_id: r.meta.run_id, model: r.meta.model,
+ score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+ config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+ })),
+ ...detailedRuns.map(r => ({
+ run_id: r.meta.run_id, model: r.meta.model,
+ score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+ config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+ })),
+ ];
surprises.push({
title: "Simple prompt beat detailed",
detail: `${group[0].meta.model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`,
weaker: { model: group[0].meta.model, config: "simple", score: avgSimple, cost: 0 },
stronger: { model: group[0].meta.model, config: "detailed", score: avgDetailed, cost: 0 },
magnitude: avgSimple - avgDetailed,
+ runs: allRuns,
+ configDiffs: ["prompt_style: simple vs detailed"],
});
}
}
@@ -119,6 +213,69 @@ function findSurprises(runs: Run[]): Surprise[] {
return surprises.sort((a, b) => b.magnitude - a.magnitude);
}
+/** Clickable card for one surprise; a click toggles a detail panel listing config differences and per-run rows. */
+function SurpriseCard({ surprise }: { surprise: Surprise }) {
+  const [expanded, setExpanded] = useState(false);
+  // Prompt-style surprises carry the same model on both sides, so the model name cannot pick a side; compare prompt_style instead.
+  const sameModel = surprise.weaker.model === surprise.stronger.model;
+  const isWeakerSide = (r: RunRef) => sameModel ? r.config.prompt_style === surprise.weaker.config : r.model === surprise.weaker.model;
+  return (
+    <div className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)", cursor: "pointer" }} onClick={() => setExpanded(prev => !prev)}>
+      <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+        {surprise.title}
+      </div>
+      <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
+        {surprise.detail}
+      </div>
+      <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
+        <div>
+          <span style={{ color: "var(--green)" }}>{surprise.weaker.model}</span>
+          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+            {(surprise.weaker.score * 100).toFixed(0)}%
+          </span>
+        </div>
+        <div style={{ color: "var(--text-muted)" }}>vs</div>
+        <div>
+          <span style={{ color: "var(--red)" }}>{surprise.stronger.model}</span>
+          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+            {(surprise.stronger.score * 100).toFixed(0)}%
+          </span>
+        </div>
+      </div>
+      {expanded && (
+        <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
+          {surprise.configDiffs.length > 0 && (
+            <div style={{ marginBottom: "8px" }}>
+              <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>Config differences</div>
+              {surprise.configDiffs.map((diff, i) => (
+                <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div>
+              ))}
+            </div>
+          )}
+          <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>
+            Runs ({surprise.runs.length})
+          </div>
+          {surprise.runs.map((r) => (
+            <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
+              <span style={{ color: isWeakerSide(r) ? "var(--green)" : "var(--red)", width: "50px" }}>
+                {r.model}
+              </span>
+              <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
+                {(r.score * 100).toFixed(0)}%
+              </span>
+              <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
+                ${r.cost.toFixed(2)}
+              </span>
+              {/* Keep a click on the link from bubbling to the card and toggling it. */}
+              <a href={`/run/${r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }} onClick={(e) => e.stopPropagation()}>view</a>
+            </div>
+          ))}
+        </div>
+      )}
+    </div>
+  );
+}
+
export default function Surprises({ runs }: SurprisesProps) {
const surprises = findSurprises(runs);
@@ -134,33 +291,11 @@ export default function Surprises({ runs }: SurprisesProps) {
<div>
<h3 style={{ marginBottom: "12px" }}>Surprises</h3>
<p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
- Where weaker configs outperformed stronger ones
+ Click to expand. Where weaker configs outperformed stronger ones.
</p>
<div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
{surprises.map((s, i) => (
- <div key={i} className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)" }}>
- <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
- {s.title}
- </div>
- <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
- {s.detail}
- </div>
- <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
- <div>
- <span style={{ color: "var(--green)" }}>{s.weaker.model}</span>
- <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
- {(s.weaker.score * 100).toFixed(0)}%
- </span>
- </div>
- <div style={{ color: "var(--text-muted)" }}>vs</div>
- <div>
- <span style={{ color: "var(--red)" }}>{s.stronger.model}</span>
- <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
- {(s.stronger.score * 100).toFixed(0)}%
- </span>
- </div>
- </div>
- </div>
+ <SurpriseCard key={i} surprise={s} />
))}
</div>
</div>