Surprises.tsx (12876B)
import { useMemo, useState } from "react";
import type { Run } from "../lib/types";

interface SurprisesProps {
  runs: Run[];
}

/** Which side of a comparison a run is listed under; drives the green/red row colour. */
type Side = "weaker" | "stronger";

interface RunRef {
  run_id: string;
  short_id?: string;
  model: string;
  score: number;
  cost: number;
  config: Record<string, string>;
  /**
   * Explicit side tag. Colouring rows by model name alone cannot work when both
   * sides of a surprise use the same model (simple vs detailed prompt).
   */
  side: Side;
}

interface Surprise {
  title: string;
  detail: string;
  /** The side expected to lose (shown in green in the card). */
  weaker: { model: string; config: string; score: number; cost: number };
  /** The side expected to win (shown in red in the card). */
  stronger: { model: string; config: string; score: number; cost: number };
  /** Score gap; surprises are sorted by this, largest first. */
  magnitude: number;
  runs: RunRef[];
  configDiffs: string[];
}

/** A run with a non-null score, with score and cost extracted once up front. */
interface ScoredRun {
  run: Run;
  score: number;
  cost: number;
}

/** Relative model strength. Models missing from this table are treated as rank 0. */
const MODEL_RANK: Record<string, number> = {
  haiku: 1,
  sonnet: 2,
  opus: 3,
};

const CONFIG_KEYS = [
  "prompt_style", "language", "effort", "human_language",
  "linter", "playwright", "context_file",
  "web_search", "max_budget", "tool_read", "tool_write",
  "tool_edit", "tool_glob", "tool_grep",
  "tests_provided", "strategy", "design_guidance", "architecture",
  "error_checking", "context_noise", "renderer",
];

/** A lower-ranked model must beat a higher-ranked one by more than this average-score gap. */
const MODEL_UPSET_MARGIN = 0.02;
/** A higher-ranked model's single run must fall more than this far below the haiku mean. */
const OUTLIER_MARGIN = 0.15;
/** The simple prompt must beat the detailed prompt by more than this average-score gap. */
const PROMPT_UPSET_MARGIN = 0.05;

/**
 * Values hidden from an outlier run's config listing. The check is key-agnostic:
 * any config key holding one of these strings is omitted.
 */
const DEFAULT_CONFIG_VALUES = new Set(["on", "typescript", "en", "high", "simple"]);

/** Reads one config field from a run's meta by its string key. */
function metaValue(meta: Run["meta"], key: string): unknown {
  return (meta as Record<string, unknown>)[key];
}

/**
 * Builds the grouping key for a run's configuration (every CONFIG_KEYS field;
 * the model is not part of it).
 *
 * Null and undefined values both render as an empty string so that a missing
 * field and an explicit null land in the same group. The `key=` prefix keeps
 * two different configurations from colliding when a value contains `|`.
 */
function getConfigKey(meta: Run["meta"]): string {
  return CONFIG_KEYS.map((k) => `${k}=${metaValue(meta, k) ?? ""}`).join("|");
}

/**
 * Lists the config fields whose stringified values differ between the first run
 * of each group, formatted as `key: a vs b`.
 *
 * @returns An empty array when either group is empty.
 */
function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] {
  const metaA = runsA[0]?.meta;
  const metaB = runsB[0]?.meta;
  if (!metaA || !metaB) return [];

  return CONFIG_KEYS.flatMap((key) => {
    const va = String(metaValue(metaA, key));
    const vb = String(metaValue(metaB, key));
    return va === vb ? [] : [`${key}: ${va} vs ${vb}`];
  });
}

/** Arithmetic mean; 0 for an empty list. */
function mean(values: readonly number[]): number {
  if (values.length === 0) return 0;
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}

/** Formats a score as a whole-number percentage, e.g. 0.826 -> "83%". */
function pct(score: number): string {
  return `${(score * 100).toFixed(0)}%`;
}

/**
 * Groups items by a string key, preserving first-seen order of the keys.
 * A Map is used instead of a plain object so keys such as "constructor"
 * cannot collide with Object.prototype members.
 */
function groupBy<T>(items: readonly T[], keyOf: (item: T) => string): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const item of items) {
    const key = keyOf(item);
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      groups.set(key, [item]);
    }
  }
  return groups;
}

/** Keeps only runs that have a score; a missing cost is treated as 0. */
function toScoredRuns(runs: Run[]): ScoredRun[] {
  const scored: ScoredRun[] = [];
  for (const run of runs) {
    const score = run.eval_results?.score;
    if (score == null) continue;
    scored.push({ run, score, cost: run.claude_output?.total_cost_usd ?? 0 });
  }
  return scored;
}

/** Flattens a scored run into the row shape rendered inside an expanded card. */
function toRunRef({ run, score, cost }: ScoredRun, side: Side): RunRef {
  return {
    run_id: run.meta.run_id,
    short_id: run.meta.short_id,
    model: run.meta.model,
    score,
    cost,
    config: Object.fromEntries(
      CONFIG_KEYS.map((k): [string, string] => [k, String(metaValue(run.meta, k))]),
    ),
    side,
  };
}

/**
 * Within each group of runs sharing a configuration, reports every pair of
 * models where the lower-ranked model's average score exceeds the
 * higher-ranked model's by more than MODEL_UPSET_MARGIN.
 */
function findModelUpsets(scored: ScoredRun[]): Surprise[] {
  const upsets: Surprise[] = [];

  for (const group of groupBy(scored, (s) => getConfigKey(s.run.meta)).values()) {
    // prompt_style is part of the grouping key, so the first run speaks for the group.
    const promptStyle = group[0].run.meta.prompt_style;

    const perModel = [...groupBy(group, (s) => s.run.meta.model)].map(([model, members]) => ({
      model,
      members,
      rank: MODEL_RANK[model] ?? 0,
      avgScore: mean(members.map((s) => s.score)),
      avgCost: mean(members.map((s) => s.cost)),
    }));

    perModel.forEach((first, i) => {
      for (const second of perModel.slice(i + 1)) {
        if (first.rank === second.rank) continue;
        const [low, high] = first.rank < second.rank ? [first, second] : [second, first];
        if (low.avgScore <= high.avgScore + MODEL_UPSET_MARGIN) continue;

        upsets.push({
          title: `${low.model} beat ${high.model}`,
          detail: `${low.model} scored ${pct(low.avgScore)} vs ${high.model} at ${pct(high.avgScore)}`,
          weaker: { model: low.model, config: promptStyle, score: low.avgScore, cost: low.avgCost },
          stronger: { model: high.model, config: promptStyle, score: high.avgScore, cost: high.avgCost },
          magnitude: low.avgScore - high.avgScore,
          // Rows keep first-seen model order; the side tag carries the colouring.
          runs: [first, second].flatMap((entry) =>
            entry.members.map((s) => toRunRef(s, entry === low ? "weaker" : "stronger")),
          ),
          // Both sides come from one config group, so this is normally empty.
          configDiffs: getConfigDiffs(
            low.members.map((s) => s.run),
            high.members.map((s) => s.run),
          ),
        });
      }
    });
  }

  return upsets;
}

/**
 * Reports individual runs of a model ranked above haiku that scored more than
 * OUTLIER_MARGIN below the mean of all haiku scores.
 */
function findOutlierRuns(scored: ScoredRun[]): Surprise[] {
  const haikuRuns = scored.filter((s) => s.run.meta.model === "haiku");
  // Without haiku runs there is no baseline to compare against.
  if (haikuRuns.length === 0) return [];

  const haikuMean = mean(haikuRuns.map((s) => s.score));
  const haikuCost = mean(haikuRuns.map((s) => s.cost));
  const outliers: Surprise[] = [];

  for (const entry of scored) {
    const { run, score, cost } = entry;
    const model = run.meta.model;
    const rank = MODEL_RANK[model] ?? 0;
    if (rank <= 1 || score >= haikuMean - OUTLIER_MARGIN) continue;

    outliers.push({
      title: `${model} run scored far below haiku avg`,
      detail: `This ${model} run scored ${pct(score)} vs haiku average of ${pct(haikuMean)}`,
      weaker: { model: "haiku", config: "average", score: haikuMean, cost: haikuCost },
      stronger: { model, config: run.meta.prompt_style, score, cost },
      magnitude: haikuMean - score,
      runs: [toRunRef(entry, "stronger")],
      // List only the settings that stand out: skip absent fields and default-looking values.
      configDiffs: CONFIG_KEYS.flatMap((k) => {
        const value = metaValue(run.meta, k);
        if (value == null || DEFAULT_CONFIG_VALUES.has(String(value))) return [];
        return [`${k}: ${String(value)}`];
      }),
    });
  }

  return outliers;
}

/**
 * Among runs sharing model, language, effort, linter, playwright and
 * context_file, reports groups where the "simple" prompt style averaged more
 * than PROMPT_UPSET_MARGIN above the "detailed" one.
 */
function findPromptUpsets(scored: ScoredRun[]): Surprise[] {
  const upsets: Surprise[] = [];

  const groups = groupBy(scored, ({ run: { meta: m } }) =>
    [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|"),
  );

  for (const group of groups.values()) {
    const byPrompt = groupBy(group, (s) => s.run.meta.prompt_style);
    const simpleRuns = byPrompt.get("simple");
    const detailedRuns = byPrompt.get("detailed");
    if (!simpleRuns || !detailedRuns) continue;

    const avgSimple = mean(simpleRuns.map((s) => s.score));
    const avgDetailed = mean(detailedRuns.map((s) => s.score));
    if (avgSimple <= avgDetailed + PROMPT_UPSET_MARGIN) continue;

    // The model is part of the grouping key, so the first run speaks for the group.
    const model = group[0].run.meta.model;

    upsets.push({
      title: "Simple prompt beat detailed",
      detail: `${model}: simple scored ${pct(avgSimple)} vs detailed at ${pct(avgDetailed)}`,
      weaker: { model, config: "simple", score: avgSimple, cost: mean(simpleRuns.map((s) => s.cost)) },
      stronger: { model, config: "detailed", score: avgDetailed, cost: mean(detailedRuns.map((s) => s.cost)) },
      magnitude: avgSimple - avgDetailed,
      runs: [
        ...simpleRuns.map((s) => toRunRef(s, "weaker")),
        ...detailedRuns.map((s) => toRunRef(s, "stronger")),
      ],
      configDiffs: ["prompt_style: simple vs detailed"],
    });
  }

  return upsets;
}

/**
 * Scans scored runs for results that go against expectation and returns them
 * sorted by magnitude, largest first. Runs without a score are ignored.
 *
 * Three detectors run in order: model upsets, outlier runs, prompt upsets.
 * The sort is stable, so ties keep that order.
 */
function findSurprises(runs: Run[]): Surprise[] {
  const scored = toScoredRuns(runs);
  const surprises = [
    ...findModelUpsets(scored),
    ...findOutlierRuns(scored),
    ...findPromptUpsets(scored),
  ];
  return surprises.sort((a, b) => b.magnitude - a.magnitude);
}

/** React key derived from a surprise's content rather than its list position. */
function surpriseKey(surprise: Surprise): string {
  return `${surprise.title}|${surprise.runs.map((r) => r.run_id).join(",")}`;
}

/** One surprise as a card; clicking the card toggles the per-run breakdown. */
function SurpriseCard({ surprise }: { surprise: Surprise }) {
  const [expanded, setExpanded] = useState(false);

  return (
    <div
      className="card"
      style={{ padding: "14px", borderLeft: "3px solid var(--yellow)", cursor: "pointer" }}
      onClick={() => setExpanded((prev) => !prev)}
    >
      <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
        {surprise.title}
      </div>
      <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
        {surprise.detail}
      </div>
      <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
        <div>
          <span style={{ color: "var(--green)" }}>{surprise.weaker.model}</span>
          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
            {pct(surprise.weaker.score)}
          </span>
        </div>
        <div style={{ color: "var(--text-muted)" }}>vs</div>
        <div>
          <span style={{ color: "var(--red)" }}>{surprise.stronger.model}</span>
          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
            {pct(surprise.stronger.score)}
          </span>
        </div>
      </div>

      {expanded && (
        <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
          {surprise.configDiffs.length > 0 && (
            <div style={{ marginBottom: "8px" }}>
              <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>
                Config differences
              </div>
              {surprise.configDiffs.map((diff, i) => (
                <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>
                  {diff}
                </div>
              ))}
            </div>
          )}

          <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>
            Runs ({surprise.runs.length})
          </div>
          {surprise.runs.map((r) => (
            <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
              <span style={{ color: r.side === "weaker" ? "var(--green)" : "var(--red)", width: "50px" }}>
                {r.model}
              </span>
              <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
                {pct(r.score)}
              </span>
              <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
                ${r.cost.toFixed(2)}
              </span>
              <a
                href={`/r/${r.short_id || r.run_id}`}
                style={{ fontSize: "10px", color: "var(--accent)" }}
                // Following the link must not also collapse the card.
                onClick={(e) => e.stopPropagation()}
              >
                view
              </a>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}

/** Grid of surprise cards for the given runs, or an empty-state card when none are found. */
export default function Surprises({ runs }: SurprisesProps) {
  // Detection walks every run several times; redo it only when `runs` changes.
  const surprises = useMemo(() => findSurprises(runs), [runs]);

  if (surprises.length === 0) {
    return (
      <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
        No surprises yet. Run more experiments with different models to find upsets.
      </div>
    );
  }

  return (
    <div>
      <h3 style={{ marginBottom: "12px" }}>Surprises</h3>
      <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
        Click to expand. Where weaker configs outperformed stronger ones.
      </p>
      <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
        {surprises.map((s) => (
          <SurpriseCard key={surpriseKey(s)} surprise={s} />
        ))}
      </div>
    </div>
  );
}