SurprisesPage.tsx - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

SurprisesPage.tsx (19596B)
      1 import { useState, useMemo } from "react";
      2 import type { Run } from "../lib/types";
      3 
      4 interface SurprisesPageProps {
      5   runs: Run[];
      6 }
      7 
      8 interface RunRef {
      9   run_id: string;
     10   short_id?: string;
     11   model: string;
     12   score: number;
     13   cost: number;
     14   config: Record<string, string>;
     15 }
     16 
     17 interface Surprise {
     18   title: string;
     19   detail: string;
     20   category: "model_upset" | "prompt_upset" | "individual_outlier";
     21   weaker: { model: string; config: string; score: number; cost: number };
     22   stronger: { model: string; config: string; score: number; cost: number };
     23   magnitude: number;
     24   runs: RunRef[];
     25   configDiffs: string[];
     26   /** Which config axis is the primary differentiator */
     27   primaryAxis: string;
     28 }
     29 
     30 const MODEL_RANK: Record<string, number> = {
     31   haiku: 1,
     32   sonnet: 2,
     33   opus: 3,
     34 };
     35 
     36 const CONFIG_KEYS = [
     37   "prompt_style", "language", "effort", "human_language",
     38   "linter", "playwright", "context_file",
     39   "web_search", "max_budget", "tool_read", "tool_write",
     40   "tool_edit", "tool_glob", "tool_grep",
     41   "tests_provided", "strategy", "design_guidance", "architecture",
     42   "error_checking", "context_noise", "renderer",
     43 ];
     44 
     45 function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] {
     46   const diffs: string[] = [];
     47   const metaA = runsA[0]?.meta;
     48   const metaB = runsB[0]?.meta;
     49   if (!metaA || !metaB) return diffs;
     50 
     51   for (const key of CONFIG_KEYS) {
     52     const va = String((metaA as Record<string, unknown>)[key]);
     53     const vb = String((metaB as Record<string, unknown>)[key]);
     54     if (va !== vb) {
     55       diffs.push(`${key}: ${va} vs ${vb}`);
     56     }
     57   }
     58   return diffs;
     59 }
     60 
     61 function findSurprises(runs: Run[]): Surprise[] {
     62   const surprises: Surprise[] = [];
     63 
     64   // Group runs by config (everything except model and run number)
     65   const configGroups: Record<string, Run[]> = {};
     66   for (const run of runs) {
     67     if (run.eval_results?.score == null) continue;
     68     const m = run.meta;
     69     const key = CONFIG_KEYS.map(k => (m as Record<string, unknown>)[k]).join("|");
     70     (configGroups[key] ??= []).push(run);
     71   }
     72 
     73   // Within each config group, compare models
     74   for (const [, group] of Object.entries(configGroups)) {
     75     const byModel: Record<string, Run[]> = {};
     76     for (const run of group) {
     77       (byModel[run.meta.model] ??= []).push(run);
     78     }
     79 
     80     const models = Object.keys(byModel);
     81     for (let i = 0; i < models.length; i++) {
     82       for (let j = i + 1; j < models.length; j++) {
     83         const a = models[i];
     84         const b = models[j];
     85         const rankA = MODEL_RANK[a] || 0;
     86         const rankB = MODEL_RANK[b] || 0;
     87 
     88         const runsA = byModel[a];
     89         const runsB = byModel[b];
     90         const scoresA = runsA.map(r => r.eval_results!.score!);
     91         const scoresB = runsB.map(r => r.eval_results!.score!);
     92         const avgA = scoresA.reduce((s, v) => s + v, 0) / scoresA.length;
     93         const avgB = scoresB.reduce((s, v) => s + v, 0) / scoresB.length;
     94 
     95         const costsA = runsA.map(r => r.claude_output?.total_cost_usd ?? 0);
     96         const costsB = runsB.map(r => r.claude_output?.total_cost_usd ?? 0);
     97         const avgCostA = costsA.reduce((s, v) => s + v, 0) / costsA.length;
     98         const avgCostB = costsB.reduce((s, v) => s + v, 0) / costsB.length;
     99 
    100         const allRuns = [
    101           ...runsA.map(r => ({
    102             run_id: r.meta.run_id, short_id: r.meta.short_id, model: a,
    103             score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
    104             config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
    105           })),
    106           ...runsB.map(r => ({
    107             run_id: r.meta.run_id, short_id: r.meta.short_id, model: b,
    108             score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
    109             config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
    110           })),
    111         ];
    112 
    113         if (rankA < rankB && avgA > avgB + 0.02) {
    114           surprises.push({
    115             title: `${a} beat ${b}`,
    116             detail: `${a} scored ${(avgA * 100).toFixed(0)}% vs ${b} at ${(avgB * 100).toFixed(0)}%`,
    117             category: "model_upset",
    118             weaker: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA },
    119             stronger: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB },
    120             magnitude: avgA - avgB,
    121             runs: allRuns,
    122             configDiffs: getConfigDiffs(runsA, runsB),
    123             primaryAxis: "model",
    124           });
    125         } else if (rankB < rankA && avgB > avgA + 0.02) {
    126           surprises.push({
    127             title: `${b} beat ${a}`,
    128             detail: `${b} scored ${(avgB * 100).toFixed(0)}% vs ${a} at ${(avgA * 100).toFixed(0)}%`,
    129             category: "model_upset",
    130             weaker: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB },
    131             stronger: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA },
    132             magnitude: avgB - avgA,
    133             runs: allRuns,
    134             configDiffs: getConfigDiffs(runsB, runsA),
    135             primaryAxis: "model",
    136           });
    137         }
    138       }
    139     }
    140   }
    141 
    142   // Find individual outlier runs where a stronger model scored far below haiku
    143   const haikuScores = runs.filter(r => r.meta.model === "haiku" && r.eval_results?.score != null).map(r => r.eval_results!.score!);
    144   const haikuMean = haikuScores.length > 0 ? haikuScores.reduce((a, b) => a + b, 0) / haikuScores.length : 0;
    145 
    146   for (const run of runs) {
    147     if (run.eval_results?.score == null) continue;
    148     const model = run.meta.model;
    149     const score = run.eval_results.score;
    150     const rank = MODEL_RANK[model] || 0;
    151 
    152     if (rank > 1 && score < haikuMean - 0.15) {
    153       surprises.push({
    154         title: `${model} run scored far below haiku avg`,
    155         detail: `This ${model} run scored ${(score * 100).toFixed(0)}% vs haiku average of ${(haikuMean * 100).toFixed(0)}%`,
    156         category: "individual_outlier",
    157         weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 },
    158         stronger: { model, config: run.meta.prompt_style, score, cost: run.claude_output?.total_cost_usd ?? 0 },
    159         magnitude: haikuMean - score,
    160         runs: [{
    161           run_id: run.meta.run_id, short_id: run.meta.short_id, model,
    162           score, cost: run.claude_output?.total_cost_usd ?? 0,
    163           config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((run.meta as Record<string, unknown>)[k])])),
    164         }],
    165         configDiffs: CONFIG_KEYS.filter(k => {
    166           const v = String((run.meta as Record<string, unknown>)[k]);
    167           return v !== "on" && v !== "typescript" && v !== "en" && v !== "high" && v !== "simple";
    168         }).map(k => `${k}: ${(run.meta as Record<string, unknown>)[k]}`),
    169         primaryAxis: "model",
    170       });
    171     }
    172   }
    173 
    174   // Simple prompt beats detailed
    175   const promptGroups: Record<string, Run[]> = {};
    176   for (const run of runs) {
    177     if (run.eval_results?.score == null) continue;
    178     const m = run.meta;
    179     const key = [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|");
    180     (promptGroups[key] ??= []).push(run);
    181   }
    182 
    183   for (const [, group] of Object.entries(promptGroups)) {
    184     const byPrompt: Record<string, Run[]> = {};
    185     for (const run of group) {
    186       (byPrompt[run.meta.prompt_style] ??= []).push(run);
    187     }
    188     if (byPrompt.simple && byPrompt.detailed) {
    189       const simpleRuns = byPrompt.simple;
    190       const detailedRuns = byPrompt.detailed;
    191       const avgSimple = simpleRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / simpleRuns.length;
    192       const avgDetailed = detailedRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / detailedRuns.length;
    193       if (avgSimple > avgDetailed + 0.05) {
    194         const allRuns = [
    195           ...simpleRuns.map(r => ({
    196             run_id: r.meta.run_id, short_id: r.meta.short_id, model: r.meta.model,
    197             score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
    198             config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
    199           })),
    200           ...detailedRuns.map(r => ({
    201             run_id: r.meta.run_id, short_id: r.meta.short_id, model: r.meta.model,
    202             score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0,
    203             config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])),
    204           })),
    205         ];
    206         surprises.push({
    207           title: "Simple prompt beat detailed",
    208           detail: `${group[0].meta.model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`,
    209           category: "prompt_upset",
    210           weaker: { model: group[0].meta.model, config: "simple", score: avgSimple, cost: 0 },
    211           stronger: { model: group[0].meta.model, config: "detailed", score: avgDetailed, cost: 0 },
    212           magnitude: avgSimple - avgDetailed,
    213           runs: allRuns,
    214           configDiffs: ["prompt_style: simple vs detailed"],
    215           primaryAxis: "prompt_style",
    216         });
    217       }
    218     }
    219   }
    220 
    221   return surprises.sort((a, b) => b.magnitude - a.magnitude);
    222 }
    223 
    224 const CATEGORY_LABELS: Record<string, string> = {
    225   model_upset: "Model upsets",
    226   prompt_upset: "Prompt upsets",
    227   individual_outlier: "Individual outliers",
    228 };
    229 
    230 const CATEGORY_DESCRIPTIONS: Record<string, string> = {
    231   model_upset: "A cheaper/weaker model outperformed a more capable one under the same configuration.",
    232   prompt_upset: "A simpler prompt style beat a more detailed one, suggesting diminishing returns from verbosity.",
    233   individual_outlier: "A single run from a stronger model scored far below the weaker model's average.",
    234 };
    235 
    236 const CATEGORY_COLORS: Record<string, string> = {
    237   model_upset: "var(--yellow)",
    238   prompt_upset: "var(--accent)",
    239   individual_outlier: "var(--red)",
    240 };
    241 
    242 function SurpriseCard({ surprise }: { surprise: Surprise }) {
    243   const [expanded, setExpanded] = useState(false);
    244 
    245   return (
    246     <div
    247       className="card"
    248       style={{
    249         padding: "14px",
    250         borderLeft: `3px solid ${CATEGORY_COLORS[surprise.category] || "var(--yellow)"}`,
    251         cursor: "pointer",
    252       }}
    253       onClick={() => setExpanded(!expanded)}
    254     >
    255       <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
    256         {surprise.title}
    257       </div>
    258       <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
    259         {surprise.detail}
    260       </div>
    261       <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
    262         <div>
    263           <span style={{ color: "var(--green)" }}>{surprise.weaker.model}</span>
    264           <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
    265             {(surprise.weaker.score * 100).toFixed(0)}%
    266           </span>
    267         </div>
    268         <div style={{ color: "var(--text-muted)" }}>vs</div>
    269         <div>
    270           <span style={{ color: "var(--red)" }}>{surprise.stronger.model}</span>
    271           <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
    272             {(surprise.stronger.score * 100).toFixed(0)}%
    273           </span>
    274         </div>
    275       </div>
    276 
    277       <div style={{ display: "flex", gap: "8px", marginTop: "8px", flexWrap: "wrap" }}>
    278         <span style={{
    279           fontSize: "10px",
    280           padding: "2px 6px",
    281           borderRadius: "3px",
    282           background: "hsl(var(--muted))",
    283           color: "hsl(var(--muted-foreground))",
    284           fontFamily: "var(--font-mono)",
    285         }}>
    286           +{(surprise.magnitude * 100).toFixed(0)}pp
    287         </span>
    288         <span style={{
    289           fontSize: "10px",
    290           padding: "2px 6px",
    291           borderRadius: "3px",
    292           background: "hsl(var(--muted))",
    293           color: "hsl(var(--muted-foreground))",
    294           fontFamily: "var(--font-mono)",
    295         }}>
    296           {surprise.runs.length} run{surprise.runs.length !== 1 ? "s" : ""}
    297         </span>
    298       </div>
    299 
    300       {expanded && (
    301         <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
    302           {surprise.configDiffs.length > 0 && (
    303             <div style={{ marginBottom: "8px" }}>
    304               <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>Config differences</div>
    305               {surprise.configDiffs.map((diff, i) => (
    306                 <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div>
    307               ))}
    308             </div>
    309           )}
    310 
    311           <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>
    312             Runs ({surprise.runs.length})
    313           </div>
    314           {surprise.runs.map((r) => (
    315             <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
    316               <span style={{ color: r.model === surprise.weaker.model ? "var(--green)" : "var(--red)", width: "50px" }}>
    317                 {r.model}
    318               </span>
    319               <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
    320                 {(r.score * 100).toFixed(0)}%
    321               </span>
    322               <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
    323                 ${r.cost.toFixed(2)}
    324               </span>
    325               <a href={`/r/${r.short_id || r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }} onClick={e => e.stopPropagation()}>
    326                 view
    327               </a>
    328             </div>
    329           ))}
    330         </div>
    331       )}
    332     </div>
    333   );
    334 }
    335 
    336 export default function SurprisesPage({ runs }: SurprisesPageProps) {
    337   const surprises = useMemo(() => findSurprises(runs), [runs]);
    338 
    339   // Aggregate stats
    340   const byCategory = useMemo(() => {
    341     const groups: Record<string, Surprise[]> = {};
    342     for (const s of surprises) {
    343       (groups[s.category] ??= []).push(s);
    344     }
    345     return groups;
    346   }, [surprises]);
    347 
    348   const axisCounts = useMemo(() => {
    349     const counts: Record<string, number> = {};
    350     for (const s of surprises) {
    351       counts[s.primaryAxis] = (counts[s.primaryAxis] || 0) + 1;
    352     }
    353     return Object.entries(counts).sort((a, b) => b[1] - a[1]);
    354   }, [surprises]);
    355 
    356   const avgMagnitude = useMemo(() => {
    357     if (surprises.length === 0) return 0;
    358     return surprises.reduce((sum, s) => sum + s.magnitude, 0) / surprises.length;
    359   }, [surprises]);
    360 
    361   const maxMagnitude = useMemo(() => {
    362     if (surprises.length === 0) return 0;
    363     return Math.max(...surprises.map(s => s.magnitude));
    364   }, [surprises]);
    365 
    366   // Category order for display
    367   const categoryOrder = ["model_upset", "prompt_upset", "individual_outlier"];
    368   const orderedCategories = categoryOrder.filter(c => byCategory[c]?.length);
    369 
    370   if (surprises.length === 0) {
    371     return (
    372       <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
    373         No surprises yet. Run more experiments with different models to find upsets.
    374       </div>
    375     );
    376   }
    377 
    378   return (
    379     <div>
    380       {/* Explanation */}
    381       <div className="card" style={{ padding: "16px", marginBottom: "24px" }}>
    382         <p style={{ fontSize: "12px", color: "var(--text-muted)", margin: 0, lineHeight: "1.6" }}>
    383           A "surprise" is a result that defies expectations: a weaker or cheaper model outperforming a stronger one,
    384           or a simpler configuration beating a more elaborate one. These findings highlight where conventional assumptions
    385           about model capability and configuration complexity break down. Click any card to see the runs involved.
    386         </p>
    387       </div>
    388 
    389       {/* Summary stats */}
    390       <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(180px, 1fr))", gap: "12px", marginBottom: "24px" }}>
    391         <div className="stat-card">
    392           <div className="stat-value">{surprises.length}</div>
    393           <div className="stat-label">Total surprises</div>
    394         </div>
    395         <div className="stat-card">
    396           <div className="stat-value">{(avgMagnitude * 100).toFixed(0)}pp</div>
    397           <div className="stat-label">Avg magnitude</div>
    398         </div>
    399         <div className="stat-card">
    400           <div className="stat-value">{(maxMagnitude * 100).toFixed(0)}pp</div>
    401           <div className="stat-label">Largest upset</div>
    402         </div>
    403         <div className="stat-card">
    404           <div className="stat-value">{axisCounts[0]?.[0] || "--"}</div>
    405           <div className="stat-label">Most surprising axis</div>
    406         </div>
    407       </div>
    408 
    409       {/* Breakdown by type */}
    410       <div className="card" style={{ padding: "16px", marginBottom: "24px" }}>
    411         <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "12px" }}>
    412           Breakdown by type
    413         </div>
    414         <div style={{ display: "flex", gap: "24px", flexWrap: "wrap" }}>
    415           {orderedCategories.map(cat => (
    416             <div key={cat} style={{ display: "flex", alignItems: "baseline", gap: "8px" }}>
    417               <span style={{
    418                 width: "8px",
    419                 height: "8px",
    420                 borderRadius: "2px",
    421                 background: CATEGORY_COLORS[cat],
    422                 display: "inline-block",
    423                 flexShrink: 0,
    424                 position: "relative",
    425                 top: "-1px",
    426               }} />
    427               <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "14px" }}>
    428                 {byCategory[cat]?.length || 0}
    429               </span>
    430               <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>
    431                 {CATEGORY_LABELS[cat]}
    432               </span>
    433             </div>
    434           ))}
    435         </div>
    436         {axisCounts.length > 1 && (
    437           <div style={{ marginTop: "12px", paddingTop: "12px", borderTop: "1px solid var(--border)" }}>
    438             <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "8px" }}>
    439               Surprises by axis
    440             </div>
    441             <div style={{ display: "flex", gap: "16px", flexWrap: "wrap" }}>
    442               {axisCounts.map(([axis, count]) => (
    443                 <div key={axis} style={{ display: "flex", alignItems: "baseline", gap: "6px" }}>
    444                   <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "13px" }}>
    445                     {count}
    446                   </span>
    447                   <span style={{ fontSize: "11px", color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
    448                     {axis}
    449                   </span>
    450                 </div>
    451               ))}
    452             </div>
    453           </div>
    454         )}
    455       </div>
    456 
    457       {/* Grouped surprise cards */}
    458       {orderedCategories.map(cat => (
    459         <div key={cat} style={{ marginBottom: "32px" }}>
    460           <h3 style={{ marginBottom: "4px" }}>{CATEGORY_LABELS[cat]}</h3>
    461           <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
    462             {CATEGORY_DESCRIPTIONS[cat]}
    463           </p>
    464           <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
    465             {byCategory[cat]!.map((s, i) => (
    466               <SurpriseCard key={i} surprise={s} />
    467             ))}
    468           </div>
    469         </div>
    470       ))}
    471     </div>
    472   );
    473 }
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README