Surprises.tsx - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

Surprises.tsx (12876B)
      1 import { useState } from "react";
      2 import type { Run } from "../lib/types";
      3 
      /** Props accepted by the `Surprises` panel. */
      interface SurprisesProps {
        /** Runs that are scanned for upsets by `findSurprises`. */
        runs: Run[];
      }
      7 
      /** One run as listed in the expanded part of a surprise card. */
      interface RunRef {
        /** Full run identifier; used as the React key and as the link fallback. */
        run_id: string;
        /** Short identifier, preferred over `run_id` when building the `/r/…` link. */
        short_id?: string;
        /** Model name the run was executed with. */
        model: string;
        /** Evaluation score; rendered as a percentage by multiplying by 100. */
        score: number;
        /** Cost of the run; rendered with a `$` prefix and two decimals. */
        cost: number;
        /** Stringified value of every tracked config key for this run. */
        config: Record<string, string>;
      }

      /** Aggregate figures for one side of a comparison. */
      interface SurpriseSide {
        model: string;
        /** Label for the side's configuration (a prompt style, or "average"). */
        config: string;
        score: number;
        cost: number;
      }

      /** A single unexpected result surfaced to the user. */
      interface Surprise {
        /** Short headline, e.g. "haiku beat sonnet". */
        title: string;
        /** One-sentence explanation containing both scores. */
        detail: string;
        /** The side that was expected to lose but scored higher (shown in green). */
        weaker: SurpriseSide;
        /** The side that was expected to win but scored lower (shown in red). */
        stronger: SurpriseSide;
        /** Score gap between the two sides; surprises are sorted by it, largest first. */
        magnitude: number;
        /** Individual runs backing the comparison. */
        runs: RunRef[];
        /** Human-readable `key: value` lines describing relevant config settings. */
        configDiffs: string[];
      }
     26 
     /**
      * Expected strength ordering of the known models (higher = stronger).
      *
      * Lookups for any other model name yield `undefined`, which the type now
      * states explicitly; callers fall back to 0 for unranked models.
      */
     const MODEL_RANK: Readonly<Record<string, number | undefined>> = {
       haiku: 1,
       sonnet: 2,
       opus: 3,
     };
     32 
     /**
      * Fields of a run's `meta` that describe its experiment configuration.
      * Model is deliberately not part of this list; it is compared separately.
      *
      * Declared `readonly` because the list is shared by every grouping and
      * diffing routine in this module and must never be mutated in place.
      */
     const CONFIG_KEYS: readonly string[] = [
       "prompt_style",
       "language",
       "effort",
       "human_language",
       "linter",
       "playwright",
       "context_file",
       "web_search",
       "max_budget",
       "tool_read",
       "tool_write",
       "tool_edit",
       "tool_glob",
       "tool_grep",
       "tests_provided",
       "strategy",
       "design_guidance",
       "architecture",
       "error_checking",
       "context_noise",
       "renderer",
     ];
     41 
     /**
      * Serialises the tracked config fields of a run's meta into a single
      * `key=value|key=value|…` string, in `CONFIG_KEYS` order.
      *
      * Missing fields are rendered as the text `undefined`.
      */
     function getConfigKey(meta: Run["meta"]): string {
       const fields = meta as Record<string, unknown>;
       const parts: string[] = [];
       for (const name of CONFIG_KEYS) {
         parts.push(`${name}=${fields[name]}`);
       }
       return parts.join("|");
     }
     45 
     /**
      * Lists the tracked config fields whose values differ between two groups.
      *
      * Only the first run of each group is inspected. Values are compared after
      * `String()` conversion, and each difference is reported as
      * `key: <A value> vs <B value>`.
      *
      * @returns The differences in `CONFIG_KEYS` order, or an empty array when
      *          either group has no first run with meta.
      */
     function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] {
       const firstA = runsA[0]?.meta;
       const firstB = runsB[0]?.meta;
       if (!firstA || !firstB) {
         return [];
       }

       const fieldsA = firstA as Record<string, unknown>;
       const fieldsB = firstB as Record<string, unknown>;

       return CONFIG_KEYS
         .map(name => ({ name, left: String(fieldsA[name]), right: String(fieldsB[name]) }))
         .filter(entry => entry.left !== entry.right)
         .map(entry => `${entry.name}: ${entry.left} vs ${entry.right}`);
     }
     62 
     /** Mean of `values`, or 0 for an empty list. */
     function meanOf(values: number[]): number {
       if (values.length === 0) return 0;
       return values.reduce((sum, value) => sum + value, 0) / values.length;
     }

     /** Formats a 0..1 fraction as a whole-number percentage without the `%` sign. */
     function percent(fraction: number): string {
       return (fraction * 100).toFixed(0);
     }

     /** Reads one config field from a run's meta without assuming its static type. */
     function configValue(run: Run, key: string): unknown {
       return (run.meta as Record<string, unknown>)[key];
     }

     /**
      * Score of an evaluated run. Callers only pass runs whose score is present,
      * so the 0 fallback exists purely to satisfy the type checker.
      */
     function runScore(run: Run): number {
       return run.eval_results?.score ?? 0;
     }

     /** Reported cost of a run, treating a missing figure as 0. */
     function runCost(run: Run): number {
       return run.claude_output?.total_cost_usd ?? 0;
     }

     /** Flattens a run into the summary row listed in an expanded card. */
     function toRunRef(run: Run): RunRef {
       return {
         run_id: run.meta.run_id,
         short_id: run.meta.short_id,
         model: run.meta.model,
         score: runScore(run),
         cost: runCost(run),
         config: Object.fromEntries(
           CONFIG_KEYS.map((key): [string, string] => [key, String(configValue(run, key))]),
         ),
       };
     }

     /**
      * Buckets `runs` by the string returned from `keyOf`, keeping first-seen key
      * order. A `Map` is used rather than a plain object so that keys such as
      * "constructor" cannot collide with inherited properties.
      */
     function groupRuns(runs: Run[], keyOf: (run: Run) => string): Map<string, Run[]> {
       const groups = new Map<string, Run[]>();
       for (const run of runs) {
         const key = keyOf(run);
         const bucket = groups.get(key);
         if (bucket) {
           bucket.push(run);
         } else {
           groups.set(key, [run]);
         }
       }
       return groups;
     }

     /**
      * Scans the runs for results that contradict expectations.
      *
      * Runs without a score are ignored. Three detectors are applied:
      *  1. Model upsets: within runs sharing every `CONFIG_KEYS` value, a
      *     lower-ranked model averages more than 0.02 above a higher-ranked one.
      *  2. Outlier runs: a single run of a model ranked above haiku scores more
      *     than 0.15 below the mean of all haiku runs.
      *  3. Prompt upsets: with the model and every other config field held
      *     constant, "simple" prompts average more than 0.05 above "detailed".
      *
      * @param runs All runs to analyse.
      * @returns The surprises found, sorted by `magnitude`, largest first.
      */
     function findSurprises(runs: Run[]): Surprise[] {
       const MODEL_UPSET_MARGIN = 0.02;
       const OUTLIER_MARGIN = 0.15;
       const PROMPT_UPSET_MARGIN = 0.05;

       const surprises: Surprise[] = [];
       const scored = runs.filter(run => run.eval_results?.score != null);

       // --- 1. Model upsets -------------------------------------------------
       // Group by config (everything except model and run number).
       const byConfig = groupRuns(scored, run =>
         CONFIG_KEYS.map(key => configValue(run, key)).join("|"),
       );

       for (const group of byConfig.values()) {
         const promptStyle = group[0].meta.prompt_style;
         const sides = [...groupRuns(group, run => run.meta.model).entries()].map(
           ([model, modelRuns]) => ({
             model,
             runs: modelRuns,
             // Models missing from MODEL_RANK are treated as rank 0.
             rank: MODEL_RANK[model] ?? 0,
             score: meanOf(modelRuns.map(runScore)),
             cost: meanOf(modelRuns.map(runCost)),
           }),
         );

         for (let i = 0; i < sides.length; i++) {
           for (let j = i + 1; j < sides.length; j++) {
             const first = sides[i];
             const second = sides[j];
             if (first.rank === second.rank) continue;

             // Orient the pair so `underdog` is the lower-ranked model.
             const [underdog, favourite] =
               first.rank < second.rank ? [first, second] : [second, first];
             if (!(underdog.score > favourite.score + MODEL_UPSET_MARGIN)) continue;

             surprises.push({
               title: `${underdog.model} beat ${favourite.model}`,
               detail: `${underdog.model} scored ${percent(underdog.score)}% vs ${favourite.model} at ${percent(favourite.score)}%`,
               weaker: { model: underdog.model, config: promptStyle, score: underdog.score, cost: underdog.cost },
               stronger: { model: favourite.model, config: promptStyle, score: favourite.score, cost: favourite.cost },
               magnitude: underdog.score - favourite.score,
               runs: [...first.runs, ...second.runs].map(toRunRef),
               configDiffs: getConfigDiffs(underdog.runs, favourite.runs),
             });
           }
         }
       }

       // --- 2. Individual runs far below the haiku average --------------------
       const haikuMean = meanOf(scored.filter(run => run.meta.model === "haiku").map(runScore));
       // Values left out of the per-run config listing; presumably the
       // baseline settings — NOTE(review): confirm against the experiment defaults.
       const unremarkable = new Set(["on", "typescript", "en", "high", "simple"]);

       for (const run of scored) {
         const model = run.meta.model;
         const score = runScore(run);
         const rank = MODEL_RANK[model] ?? 0;
         if (!(rank > 1 && score < haikuMean - OUTLIER_MARGIN)) continue;

         surprises.push({
           title: `${model} run scored far below haiku avg`,
           detail: `This ${model} run scored ${percent(score)}% vs haiku average of ${percent(haikuMean)}%`,
           weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 },
           stronger: { model, config: run.meta.prompt_style, score, cost: runCost(run) },
           magnitude: haikuMean - score,
           runs: [toRunRef(run)],
           configDiffs: CONFIG_KEYS
             .filter(key => !unremarkable.has(String(configValue(run, key))))
             .map(key => `${key}: ${String(configValue(run, key))}`),
         });
       }

       // --- 3. Simple prompt beats detailed -----------------------------------
       // Hold the model and EVERY tracked config field other than prompt_style
       // constant. Grouping on a hand-picked subset would average together runs
       // that also differ in tools, web search, strategy, etc., so a gap could
       // not be attributed to the prompt alone.
       const controlledKeys = CONFIG_KEYS.filter(key => key !== "prompt_style");
       const byControl = groupRuns(scored, run =>
         [run.meta.model, ...controlledKeys.map(key => configValue(run, key))].join("|"),
       );

       for (const group of byControl.values()) {
         const byPrompt = groupRuns(group, run => run.meta.prompt_style);
         const simpleRuns = byPrompt.get("simple");
         const detailedRuns = byPrompt.get("detailed");
         if (!simpleRuns || !detailedRuns) continue;

         const avgSimple = meanOf(simpleRuns.map(runScore));
         const avgDetailed = meanOf(detailedRuns.map(runScore));
         if (!(avgSimple > avgDetailed + PROMPT_UPSET_MARGIN)) continue;

         const model = group[0].meta.model;
         surprises.push({
           title: "Simple prompt beat detailed",
           detail: `${model}: simple scored ${percent(avgSimple)}% vs detailed at ${percent(avgDetailed)}%`,
           weaker: { model, config: "simple", score: avgSimple, cost: 0 },
           stronger: { model, config: "detailed", score: avgDetailed, cost: 0 },
           magnitude: avgSimple - avgDetailed,
           runs: [...simpleRuns, ...detailedRuns].map(toRunRef),
           configDiffs: ["prompt_style: simple vs detailed"],
         });
       }

       return surprises.sort((a, b) => b.magnitude - a.magnitude);
     }
    218 
    /**
     * Card for a single surprise. Shows the headline and both sides' scores;
     * clicking the card toggles a detail section with config notes and the
     * individual runs.
     */
    function SurpriseCard({ surprise }: { surprise: Surprise }) {
      const [expanded, setExpanded] = useState(false);
      // Updater form so the toggle never acts on a stale `expanded` value.
      const toggle = () => setExpanded((open) => !open);

      const { weaker, stronger } = surprise;

      // When both sides share a model (prompt-style comparisons), the model name
      // cannot tell the sides apart, so the run's prompt style decides instead.
      const sidesShareModel = weaker.model === stronger.model;
      const runColor = (r: RunRef): string => {
        const onWeakerSide = sidesShareModel
          ? r.config.prompt_style === weaker.config
          : r.model === weaker.model;
        return onWeakerSide ? "var(--green)" : "var(--red)";
      };

      const sectionLabelStyle = {
        fontSize: "10px",
        color: "var(--text-muted)",
        textTransform: "uppercase",
        letterSpacing: "0.5px",
        marginBottom: "4px",
      } as const;

      return (
        <div
          className="card"
          style={{ padding: "14px", borderLeft: "3px solid var(--yellow)", cursor: "pointer" }}
          onClick={toggle}
        >
          <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
            {surprise.title}
          </div>
          <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
            {surprise.detail}
          </div>
          <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
            <div>
              <span style={{ color: "var(--green)" }}>{weaker.model}</span>
              <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
                {(weaker.score * 100).toFixed(0)}%
              </span>
            </div>
            <div style={{ color: "var(--text-muted)" }}>vs</div>
            <div>
              <span style={{ color: "var(--red)" }}>{stronger.model}</span>
              <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
                {(stronger.score * 100).toFixed(0)}%
              </span>
            </div>
          </div>

          {expanded && (
            <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
              {surprise.configDiffs.length > 0 && (
                <div style={{ marginBottom: "8px" }}>
                  <div style={sectionLabelStyle}>Config differences</div>
                  {surprise.configDiffs.map((diff, i) => (
                    <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div>
                  ))}
                </div>
              )}

              <div style={sectionLabelStyle}>
                Runs ({surprise.runs.length})
              </div>
              {surprise.runs.map((r) => (
                <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
                  <span style={{ color: runColor(r), width: "50px" }}>
                    {r.model}
                  </span>
                  <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
                    {(r.score * 100).toFixed(0)}%
                  </span>
                  <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
                    ${r.cost.toFixed(2)}
                  </span>
                  {/* Following the link must not also collapse the card. */}
                  <a
                    href={`/r/${r.short_id || r.run_id}`}
                    style={{ fontSize: "10px", color: "var(--accent)" }}
                    onClick={(e) => e.stopPropagation()}
                  >
                    view
                  </a>
                </div>
              ))}
            </div>
          )}
        </div>
      );
    }
    281 
    /**
     * Identity of a surprise that survives re-sorting: its headline plus the ids
     * of the runs behind it.
     */
    function surpriseKey(surprise: Surprise): string {
      return `${surprise.title}|${surprise.runs.map((r) => r.run_id).join(",")}`;
    }

    /**
     * Panel listing every surprise found in `runs` as a grid of expandable
     * cards, or a placeholder message when there are none.
     */
    export default function Surprises({ runs }: SurprisesProps) {
      const surprises = findSurprises(runs);

      if (surprises.length === 0) {
        return (
          <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
            No surprises yet. Run more experiments with different models to find upsets.
          </div>
        );
      }

      return (
        <div>
          <h3 style={{ marginBottom: "12px" }}>Surprises</h3>
          <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
            Click to expand. Where weaker configs outperformed stronger ones.
          </p>
          <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
            {/* Cards keep their own expanded state, so they are keyed by content
                rather than by position in the magnitude-sorted list. */}
            {surprises.map((s) => (
              <SurpriseCard key={surpriseKey(s)} surprise={s} />
            ))}
          </div>
        </div>
      );
    }
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README