Surprise cards now clickable with run details and outlier detection - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 85f722fa4936da91e87193ac60a407948670d8c6
parent 1964a38e54bd15acd879301fb499d1ccea864778
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 07:32:10 +0200

Surprise cards now clickable with run details and outlier detection

Click a surprise card to see:
- Which config dimensions differed
- Every run involved with model, score, cost, and link to run page

New outlier detection:
- Flags individual sonnet/opus runs that scored far below the haiku average
- Each flagged run shows the config it ran with, to help identify what caused the poor performance

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Surprises.tsx  | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------

1 file changed, 198 insertions(+), 63 deletions(-)
diff --git a/dashboard/src/components/Surprises.tsx b/dashboard/src/components/Surprises.tsx
@@ -1,15 +1,26 @@
+import { useState } from "react";
 import type { Run } from "../lib/types";
 
 interface SurprisesProps {
   runs: Run[];
 }
 
+interface RunRef {
+  run_id: string;
+  model: string;
+  score: number;
+  cost: number;
+  config: Record<string, string>;
+}
+
 interface Surprise {
   title: string;
   detail: string;
   weaker: { model: string; config: string; score: number; cost: number };
   stronger: { model: string; config: string; score: number; cost: number };
   magnitude: number;
+  runs: RunRef[];
+  configDiffs: string[];
 }
 
 const MODEL_RANK: Record<string, number> = {
@@ -18,6 +29,34 @@ const MODEL_RANK: Record<string, number> = {
   opus: 3,
 };
 
+const CONFIG_KEYS = [
+  "prompt_style", "language", "effort", "human_language",
+  "linter", "playwright", "context_file", "sub_agents",
+  "web_search", "max_budget", "tool_read", "tool_write",
+  "tool_edit", "tool_glob", "tool_grep",
+];
+
+function getConfigKey(meta: Run["meta"]): string {
+  return CONFIG_KEYS.map(k => `${k}=${(meta as Record<string, unknown>)[k]}`).join("|");
+}
+
+function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] {
+  // Find which config values differ between the two groups
+  const diffs: string[] = [];
+  const metaA = runsA[0]?.meta;
+  const metaB = runsB[0]?.meta;
+  if (!metaA || !metaB) return diffs;
+
+  for (const key of CONFIG_KEYS) {
+    const va = String((metaA as Record<string, unknown>)[key]);
+    const vb = String((metaB as Record<string, unknown>)[key]);
+    if (va !== vb) {
+      diffs.push(`${key}: ${va} vs ${vb}`);
+    }
+  }
+  return diffs;
+}
+
 function findSurprises(runs: Run[]): Surprise[] {
   const surprises: Surprise[] = [];
 
@@ -25,28 +64,16 @@ function findSurprises(runs: Run[]): Surprise[] {
   const configGroups: Record<string, Run[]> = {};
   for (const run of runs) {
     if (run.eval_results?.score == null) continue;
-    // Build config key without model
     const m = run.meta;
-    const key = [
-      m.prompt_style, m.language, m.effort,
-      m.linter, m.playwright, m.context_file,
-      m.sub_agents, m.web_search, m.max_budget,
-    ].join("|");
+    const key = CONFIG_KEYS.map(k => (m as Record<string, unknown>)[k]).join("|");
     (configGroups[key] ??= []).push(run);
   }
 
   // Within each config group, compare models
   for (const [, group] of Object.entries(configGroups)) {
-    const byModel: Record<string, { scores: number[]; costs: number[] }> = {};
+    const byModel: Record<string, Run[]> = {};
     for (const run of group) {
-      const model = run.meta.model;
-      if (!byModel[model]) byModel[model] = { scores: [], costs: [] };
-      if (run.eval_results?.score != null) {
-        byModel[model].scores.push(run.eval_results.score);
-      }
-      if (run.claude_output?.total_cost_usd != null) {
-        byModel[model].costs.push(run.claude_output.total_cost_usd);
-      }
+      (byModel[run.meta.model] ??= []).push(run);
     }
 
     const models = Object.keys(byModel);
@@ -57,60 +84,127 @@ function findSurprises(runs: Run[]): Surprise[] {
         const rankA = MODEL_RANK[a] || 0;
         const rankB = MODEL_RANK[b] || 0;
 
-        const avgScoreA = byModel[a].scores.reduce((s, v) => s + v, 0) / byModel[a].scores.length;
-        const avgScoreB = byModel[b].scores.reduce((s, v) => s + v, 0) / byModel[b].scores.length;
-        const avgCostA = byModel[a].costs.length > 0 ? byModel[a].costs.reduce((s, v) => s + v, 0) / byModel[a].costs.length : 0;
-        const avgCostB = byModel[b].costs.length > 0 ? byModel[b].costs.reduce((s, v) => s + v, 0) / byModel[b].costs.length : 0;
+        const runsA = byModel[a];
+        const runsB = byModel[b];
+        const scoresA = runsA.map(r => r.eval_results!.score!);
+        const scoresB = runsB.map(r => r.eval_results!.score!);
+        const avgA = scoresA.reduce((s, v) => s + v, 0) / scoresA.length;
+        const avgB = scoresB.reduce((s, v) => s + v, 0) / scoresB.length;
+
+        const costsA = runsA.map(r => r.claude_output?.total_cost_usd ?? 0);
+        const costsB = runsB.map(r => r.claude_output?.total_cost_usd ?? 0);
+        const avgCostA = costsA.reduce((s, v) => s + v, 0) / costsA.length;
+        const avgCostB = costsB.reduce((s, v) => s + v, 0) / costsB.length;
 
-        // Surprise: weaker model scores higher
-        if (rankA < rankB && avgScoreA > avgScoreB + 0.02) {
+        const allRuns = [
+          ...runsA.map(r => ({
+            run_id: r.meta.run_id, model: a,
+            score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+            config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+          })),
+          ...runsB.map(r => ({
+            run_id: r.meta.run_id, model: b,
+            score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+            config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+          })),
+        ];
+
+        if (rankA < rankB && avgA > avgB + 0.02) {
           surprises.push({
             title: `${a} beat ${b}`,
-            detail: `${a} scored ${(avgScoreA * 100).toFixed(0)}% vs ${b} at ${(avgScoreB * 100).toFixed(0)}%`,
-            weaker: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA },
-            stronger: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB },
-            magnitude: avgScoreA - avgScoreB,
+            detail: `${a} scored ${(avgA * 100).toFixed(0)}% vs ${b} at ${(avgB * 100).toFixed(0)}%`,
+            weaker: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA },
+            stronger: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB },
+            magnitude: avgA - avgB,
+            runs: allRuns,
+            configDiffs: getConfigDiffs(runsA, runsB),
           });
-        } else if (rankB < rankA && avgScoreB > avgScoreA + 0.02) {
+        } else if (rankB < rankA && avgB > avgA + 0.02) {
           surprises.push({
             title: `${b} beat ${a}`,
-            detail: `${b} scored ${(avgScoreB * 100).toFixed(0)}% vs ${a} at ${(avgScoreA * 100).toFixed(0)}%`,
-            weaker: { model: b, config: group[0].meta.prompt_style, score: avgScoreB, cost: avgCostB },
-            stronger: { model: a, config: group[0].meta.prompt_style, score: avgScoreA, cost: avgCostA },
-            magnitude: avgScoreB - avgScoreA,
+            detail: `${b} scored ${(avgB * 100).toFixed(0)}% vs ${a} at ${(avgA * 100).toFixed(0)}%`,
+            weaker: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB },
+            stronger: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA },
+            magnitude: avgB - avgA,
+            runs: allRuns,
+            configDiffs: getConfigDiffs(runsB, runsA),
           });
         }
       }
     }
   }
 
-  // Also find cases where simple prompt beats detailed
+  // Find individual outlier runs where a higher-ranked model (sonnet/opus) scored far below the haiku average
+  const haikuScores = runs.filter(r => r.meta.model === "haiku" && r.eval_results?.score != null).map(r => r.eval_results!.score!);
+  const haikuMean = haikuScores.length > 0 ? haikuScores.reduce((a, b) => a + b, 0) / haikuScores.length : 0;
+
+  for (const run of runs) {
+    if (run.eval_results?.score == null) continue;
+    const model = run.meta.model;
+    const score = run.eval_results.score;
+    const rank = MODEL_RANK[model] || 0;
+
+    // Flag if a "stronger" model scored significantly below haiku average
+    if (rank > 1 && score < haikuMean - 0.15) {
+      surprises.push({
+        title: `${model} run scored far below haiku avg`,
+        detail: `This ${model} run scored ${(score * 100).toFixed(0)}% vs haiku average of ${(haikuMean * 100).toFixed(0)}%`,
+        weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 },
+        stronger: { model, config: run.meta.prompt_style, score, cost: run.claude_output?.total_cost_usd ?? 0 },
+        magnitude: haikuMean - score,
+        runs: [{
+          run_id: run.meta.run_id, model,
+          score, cost: run.claude_output?.total_cost_usd ?? 0,
+          config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((run.meta as Record<string, unknown>)[k])])),
+        }],
+        configDiffs: CONFIG_KEYS.filter(k => {
+          const v = String((run.meta as Record<string, unknown>)[k]);
+          return v !== "on" && v !== "typescript" && v !== "en" && v !== "high" && v !== "simple";
+        }).map(k => `${k}: ${(run.meta as Record<string, unknown>)[k]}`),
+      });
+    }
+  }
+
+  // Simple prompt beats detailed
   const promptGroups: Record<string, Run[]> = {};
   for (const run of runs) {
     if (run.eval_results?.score == null) continue;
     const m = run.meta;
-    const key = [
-      m.model, m.language, m.effort,
-      m.linter, m.playwright, m.context_file,
-    ].join("|");
+    const key = [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|");
     (promptGroups[key] ??= []).push(run);
   }
 
   for (const [, group] of Object.entries(promptGroups)) {
-    const byPrompt: Record<string, number[]> = {};
+    const byPrompt: Record<string, Run[]> = {};
     for (const run of group) {
-      (byPrompt[run.meta.prompt_style] ??= []).push(run.eval_results!.score!);
+      (byPrompt[run.meta.prompt_style] ??= []).push(run);
     }
     if (byPrompt.simple && byPrompt.detailed) {
-      const avgSimple = byPrompt.simple.reduce((a, b) => a + b, 0) / byPrompt.simple.length;
-      const avgDetailed = byPrompt.detailed.reduce((a, b) => a + b, 0) / byPrompt.detailed.length;
+      const simpleRuns = byPrompt.simple;
+      const detailedRuns = byPrompt.detailed;
+      const avgSimple = simpleRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / simpleRuns.length;
+      const avgDetailed = detailedRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / detailedRuns.length;
       if (avgSimple > avgDetailed + 0.05) {
+        const allRuns = [
+          ...simpleRuns.map(r => ({
+            run_id: r.meta.run_id, model: r.meta.model,
+            score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+            config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+          })),
+          ...detailedRuns.map(r => ({
+            run_id: r.meta.run_id, model: r.meta.model,
+            score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
+            config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
+          })),
+        ];
         surprises.push({
           title: "Simple prompt beat detailed",
           detail: `${group[0].meta.model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`,
           weaker: { model: group[0].meta.model, config: "simple", score: avgSimple, cost: 0 },
           stronger: { model: group[0].meta.model, config: "detailed", score: avgDetailed, cost: 0 },
           magnitude: avgSimple - avgDetailed,
+          runs: allRuns,
+          configDiffs: ["prompt_style: simple vs detailed"],
         });
       }
     }
@@ -119,6 +213,69 @@ function findSurprises(runs: Run[]): Surprise[] {
   return surprises.sort((a, b) => b.magnitude - a.magnitude);
 }
 
+function SurpriseCard({ surprise }: { surprise: Surprise }) {
+  const [expanded, setExpanded] = useState(false);
+
+  return (
+    <div className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)", cursor: "pointer" }} onClick={() => setExpanded(!expanded)}>
+      <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+        {surprise.title}
+      </div>
+      <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
+        {surprise.detail}
+      </div>
+      <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
+        <div>
+          <span style={{ color: "var(--green)" }}>{surprise.weaker.model}</span>
+          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+            {(surprise.weaker.score * 100).toFixed(0)}%
+          </span>
+        </div>
+        <div style={{ color: "var(--text-muted)" }}>vs</div>
+        <div>
+          <span style={{ color: "var(--red)" }}>{surprise.stronger.model}</span>
+          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+            {(surprise.stronger.score * 100).toFixed(0)}%
+          </span>
+        </div>
+      </div>
+
+      {expanded && (
+        <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
+          {surprise.configDiffs.length > 0 && (
+            <div style={{ marginBottom: "8px" }}>
+              <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>Config differences</div>
+              {surprise.configDiffs.map((diff, i) => (
+                <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div>
+              ))}
+            </div>
+          )}
+
+          <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>
+            Runs ({surprise.runs.length})
+          </div>
+          {surprise.runs.map((r) => (
+            <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
+              <span style={{ color: r.model === surprise.weaker.model ? "var(--green)" : "var(--red)", width: "50px" }}>
+                {r.model}
+              </span>
+              <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
+                {(r.score * 100).toFixed(0)}%
+              </span>
+              <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
+                ${r.cost.toFixed(2)}
+              </span>
+              <a href={`/run/${r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }}>
+                view
+              </a>
+            </div>
+          ))}
+        </div>
+      )}
+    </div>
+  );
+}
+
 export default function Surprises({ runs }: SurprisesProps) {
   const surprises = findSurprises(runs);
 
@@ -134,33 +291,11 @@ export default function Surprises({ runs }: SurprisesProps) {
     <div>
       <h3 style={{ marginBottom: "12px" }}>Surprises</h3>
       <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
-        Where weaker configs outperformed stronger ones
+        Click to expand. Where weaker configs outperformed stronger ones.
       </p>
       <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
         {surprises.map((s, i) => (
-          <div key={i} className="card" style={{ padding: "14px", borderLeft: "3px solid var(--yellow)" }}>
-            <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
-              {s.title}
-            </div>
-            <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
-              {s.detail}
-            </div>
-            <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
-              <div>
-                <span style={{ color: "var(--green)" }}>{s.weaker.model}</span>
-                <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
-                  {(s.weaker.score * 100).toFixed(0)}%
-                </span>
-              </div>
-              <div style={{ color: "var(--text-muted)" }}>vs</div>
-              <div>
-                <span style={{ color: "var(--red)" }}>{s.stronger.model}</span>
-                <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
-                  {(s.stronger.score * 100).toFixed(0)}%
-                </span>
-              </div>
-            </div>
-          </div>
+          <SurpriseCard key={i} surprise={s} />
         ))}
       </div>
     </div>

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README