SurprisesPage.tsx (19596B)
import { useState, useMemo } from "react";
import type { Run } from "../lib/types";

interface SurprisesPageProps {
  runs: Run[];
}

/** Flattened view of a single run as shown inside an expanded surprise card. */
interface RunRef {
  run_id: string;
  short_id?: string;
  model: string;
  score: number;
  cost: number;
  /** Stringified value of every CONFIG_KEYS entry read from the run's meta. */
  config: Record<string, string>;
}

type SurpriseCategory = "model_upset" | "prompt_upset" | "individual_outlier";

/** One side of a comparison (the unexpected winner or the expected winner). */
interface SurpriseSide {
  model: string;
  config: string;
  score: number;
  cost: number;
}

interface Surprise {
  title: string;
  detail: string;
  category: SurpriseCategory;
  /** The side that was expected to lose (lower rank / simpler prompt / haiku average). */
  weaker: SurpriseSide;
  /** The side that was expected to win. */
  stronger: SurpriseSide;
  /** Score gap between the two sides, as a fraction (0.1 is rendered as 10pp). */
  magnitude: number;
  runs: RunRef[];
  configDiffs: string[];
  /** Which config axis is the primary differentiator */
  primaryAxis: string;
}

/** Expected capability ordering; a lower rank beating a higher rank counts as an upset. */
const MODEL_RANK: Record<string, number> = {
  haiku: 1,
  sonnet: 2,
  opus: 3,
};

/** Meta fields that are read when grouping runs and when snapshotting a run's config. */
const CONFIG_KEYS = [
  "prompt_style", "language", "effort", "human_language",
  "linter", "playwright", "context_file",
  "web_search", "max_budget", "tool_read", "tool_write",
  "tool_edit", "tool_glob", "tool_grep",
  "tests_provided", "strategy", "design_guidance", "architecture",
  "error_checking", "context_noise", "renderer",
];

/** Minimum average-score lead for a lower-ranked model to count as a model upset. */
const MODEL_UPSET_MARGIN = 0.02;
/** How far below the haiku average a higher-ranked run must score to be an outlier. */
const OUTLIER_MARGIN = 0.15;
/** Minimum average-score lead for the simple prompt to count as a prompt upset. */
const PROMPT_UPSET_MARGIN = 0.05;

/**
 * Values that are omitted from an individual outlier's config listing.
 * NOTE(review): presumably these are the baseline settings - confirm.
 */
const UNLISTED_CONFIG_VALUES = new Set(["on", "typescript", "en", "high", "simple"]);

/** Arithmetic mean; returns 0 for an empty list. */
function mean(values: readonly number[]): number {
  if (values.length === 0) return 0;
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}

/** Formats a 0..1 fraction as a whole-number percentage without the sign. */
function pct(fraction: number): string {
  return (fraction * 100).toFixed(0);
}

/** True when the run has an evaluation score. */
function hasScore(run: Run): boolean {
  return run.eval_results?.score != null;
}

/** Score of a run; callers are expected to have filtered with hasScore first. */
function scoreOf(run: Run): number {
  return run.eval_results?.score ?? 0;
}

/** Reported cost of a run in USD, or 0 when the run carries no cost. */
function costOf(run: Run): number {
  return run.claude_output?.total_cost_usd ?? 0;
}

/** Reads an arbitrary meta field by name. */
function metaValue(run: Run, key: string): unknown {
  return (run.meta as Record<string, unknown>)[key];
}

/** Builds the display snapshot of a run used by the card's run list. */
function toRunRef(run: Run): RunRef {
  return {
    run_id: run.meta.run_id,
    short_id: run.meta.short_id,
    model: run.meta.model,
    score: scoreOf(run),
    cost: costOf(run),
    config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String(metaValue(run, k))])),
  };
}

/**
 * Lists the CONFIG_KEYS whose stringified value differs between the first run
 * of each list, formatted as "key: a vs b". Returns [] if either list is empty.
 */
function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] {
  const diffs: string[] = [];
  const firstA = runsA[0];
  const firstB = runsB[0];
  if (!firstA?.meta || !firstB?.meta) return diffs;

  for (const key of CONFIG_KEYS) {
    const va = String(metaValue(firstA, key));
    const vb = String(metaValue(firstB, key));
    if (va !== vb) {
      diffs.push(`${key}: ${va} vs ${vb}`);
    }
  }
  return diffs;
}

/** Per-model aggregate inside one config group. */
interface ModelSummary {
  model: string;
  runs: Run[];
  rank: number;
  avgScore: number;
  avgCost: number;
}

function summarizeModel(model: string, runs: Run[]): ModelSummary {
  return {
    model,
    runs,
    // NOTE(review): models missing from MODEL_RANK get rank 0, i.e. they are
    // treated as weaker than every ranked model - confirm that is intended.
    rank: MODEL_RANK[model] ?? 0,
    avgScore: mean(runs.map(scoreOf)),
    avgCost: mean(runs.map(costOf)),
  };
}

/** Groups runs by a string key, preserving first-seen order of keys. */
function groupBy(runs: Run[], keyOf: (run: Run) => string): Record<string, Run[]> {
  const groups: Record<string, Run[]> = {};
  for (const run of runs) {
    (groups[keyOf(run)] ??= []).push(run);
  }
  return groups;
}

/** Lower-ranked model out-scoring a higher-ranked one under the same CONFIG_KEYS values. */
function findModelUpsets(scored: Run[]): Surprise[] {
  const found: Surprise[] = [];
  const configGroups = groupBy(scored, run => CONFIG_KEYS.map(k => metaValue(run, k)).join("|"));

  for (const group of Object.values(configGroups)) {
    const promptStyle = group[0].meta.prompt_style;
    const perModel = Object.entries(groupBy(group, run => run.meta.model))
      .map(([model, runs]) => summarizeModel(model, runs));

    perModel.forEach((first, i) => {
      for (const second of perModel.slice(i + 1)) {
        if (first.rank === second.rank) continue;
        const underdog = first.rank < second.rank ? first : second;
        const favourite = underdog === first ? second : first;
        if (!(underdog.avgScore > favourite.avgScore + MODEL_UPSET_MARGIN)) continue;

        found.push({
          title: `${underdog.model} beat ${favourite.model}`,
          detail: `${underdog.model} scored ${pct(underdog.avgScore)}% vs ${favourite.model} at ${pct(favourite.avgScore)}%`,
          category: "model_upset",
          weaker: { model: underdog.model, config: promptStyle, score: underdog.avgScore, cost: underdog.avgCost },
          stronger: { model: favourite.model, config: promptStyle, score: favourite.avgScore, cost: favourite.avgCost },
          magnitude: underdog.avgScore - favourite.avgScore,
          // Runs are listed in model iteration order, not winner-first.
          runs: [...first.runs, ...second.runs].map(toRunRef),
          // NOTE(review): both run lists come from the same config group, so this
          // is expected to be empty in most cases - confirm whether it is needed.
          configDiffs: getConfigDiffs(underdog.runs, favourite.runs),
          primaryAxis: "model",
        });
      }
    });
  }
  return found;
}

/** Single runs of a model ranked above haiku that score far below the haiku average. */
function findIndividualOutliers(scored: Run[]): Surprise[] {
  const found: Surprise[] = [];
  const haikuRuns = scored.filter(r => r.meta.model === "haiku");
  const haikuMean = mean(haikuRuns.map(scoreOf));
  const haikuMeanCost = mean(haikuRuns.map(costOf));

  for (const run of scored) {
    const model = run.meta.model;
    const score = scoreOf(run);
    const rank = MODEL_RANK[model] ?? 0;
    if (!(rank > 1 && score < haikuMean - OUTLIER_MARGIN)) continue;

    found.push({
      title: `${model} run scored far below haiku avg`,
      detail: `This ${model} run scored ${pct(score)}% vs haiku average of ${pct(haikuMean)}%`,
      category: "individual_outlier",
      weaker: { model: "haiku", config: "average", score: haikuMean, cost: haikuMeanCost },
      stronger: { model, config: run.meta.prompt_style, score, cost: costOf(run) },
      magnitude: haikuMean - score,
      runs: [toRunRef(run)],
      configDiffs: CONFIG_KEYS
        .filter(k => !UNLISTED_CONFIG_VALUES.has(String(metaValue(run, k))))
        .map(k => `${k}: ${String(metaValue(run, k))}`),
      primaryAxis: "model",
    });
  }
  return found;
}

/** "simple" prompt style out-scoring "detailed" for the same model and core settings. */
function findPromptUpsets(scored: Run[]): Surprise[] {
  const found: Surprise[] = [];
  // NOTE(review): this key only covers six meta fields, not every CONFIG_KEYS
  // entry, so the two prompt styles may differ on other axes too - confirm.
  const promptGroups = groupBy(scored, run => {
    const m = run.meta;
    return [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|");
  });

  for (const group of Object.values(promptGroups)) {
    const byPrompt = groupBy(group, run => run.meta.prompt_style);
    const simpleRuns = byPrompt["simple"];
    const detailedRuns = byPrompt["detailed"];
    if (!simpleRuns || !detailedRuns) continue;

    const avgSimple = mean(simpleRuns.map(scoreOf));
    const avgDetailed = mean(detailedRuns.map(scoreOf));
    if (!(avgSimple > avgDetailed + PROMPT_UPSET_MARGIN)) continue;

    const model = group[0].meta.model;
    found.push({
      title: "Simple prompt beat detailed",
      detail: `${model}: simple scored ${pct(avgSimple)}% vs detailed at ${pct(avgDetailed)}%`,
      category: "prompt_upset",
      weaker: { model, config: "simple", score: avgSimple, cost: mean(simpleRuns.map(costOf)) },
      stronger: { model, config: "detailed", score: avgDetailed, cost: mean(detailedRuns.map(costOf)) },
      magnitude: avgSimple - avgDetailed,
      runs: [...simpleRuns, ...detailedRuns].map(toRunRef),
      configDiffs: ["prompt_style: simple vs detailed"],
      primaryAxis: "prompt_style",
    });
  }
  return found;
}

/**
 * Scans scored runs for results that defy expectations and returns them
 * ordered by magnitude, largest first. Runs without a score are ignored.
 */
function findSurprises(runs: Run[]): Surprise[] {
  const scored = runs.filter(hasScore);
  const surprises = [
    ...findModelUpsets(scored),
    ...findIndividualOutliers(scored),
    ...findPromptUpsets(scored),
  ];
  return surprises.sort((a, b) => b.magnitude - a.magnitude);
}

/** Display order of the category sections. */
const CATEGORY_ORDER: readonly SurpriseCategory[] = ["model_upset", "prompt_upset", "individual_outlier"];

const CATEGORY_LABELS: Record<SurpriseCategory, string> = {
  model_upset: "Model upsets",
  prompt_upset: "Prompt upsets",
  individual_outlier: "Individual outliers",
};

const CATEGORY_DESCRIPTIONS: Record<SurpriseCategory, string> = {
  model_upset: "A cheaper/weaker model outperformed a more capable one under the same configuration.",
  prompt_upset: "A simpler prompt style beat a more detailed one, suggesting diminishing returns from verbosity.",
  individual_outlier: "A single run from a stronger model scored far below the weaker model's average.",
};

const CATEGORY_COLORS: Record<SurpriseCategory, string> = {
  model_upset: "var(--yellow)",
  prompt_upset: "var(--accent)",
  individual_outlier: "var(--red)",
};

const CHIP_STYLE = {
  fontSize: "10px",
  padding: "2px 6px",
  borderRadius: "3px",
  background: "hsl(var(--muted))",
  color: "hsl(var(--muted-foreground))",
  fontFamily: "var(--font-mono)",
} as const;

const SECTION_LABEL_STYLE = {
  fontSize: "10px",
  color: "var(--text-muted)",
  textTransform: "uppercase",
  letterSpacing: "0.5px",
  marginBottom: "4px",
} as const;

/**
 * Whether a run belongs to the surprise's "weaker" (unexpected winner) side.
 * When the sides differ by model the model name decides; otherwise the value
 * of the primary axis does, because both sides then share the same model.
 */
function isWeakerSide(surprise: Surprise, run: RunRef): boolean {
  if (surprise.primaryAxis === "model") {
    return run.model === surprise.weaker.model;
  }
  return run.config[surprise.primaryAxis] === surprise.weaker.config;
}

/** Header label for one side; adds the config value when the model alone is ambiguous. */
function sideLabel(surprise: Surprise, side: SurpriseSide): string {
  return surprise.primaryAxis === "model" ? side.model : `${side.model} (${side.config})`;
}

/** Row label for a run: the value on the axis that distinguishes the two sides. */
function runLabel(surprise: Surprise, run: RunRef): string {
  if (surprise.primaryAxis === "model") return run.model;
  return run.config[surprise.primaryAxis] ?? run.model;
}

/** Identity of a surprise that survives re-sorting, so card state follows the card. */
function surpriseKey(surprise: Surprise): string {
  return [surprise.category, surprise.title, ...surprise.runs.map(r => r.run_id)].join(":");
}

/** Collapsible card summarising one surprise; expands to show config diffs and runs. */
function SurpriseCard({ surprise }: { surprise: Surprise }) {
  const [expanded, setExpanded] = useState(false);
  const toggle = () => setExpanded(open => !open);
  const runCount = surprise.runs.length;

  return (
    <div
      className="card"
      role="button"
      tabIndex={0}
      aria-expanded={expanded}
      style={{
        padding: "14px",
        borderLeft: `3px solid ${CATEGORY_COLORS[surprise.category] || "var(--yellow)"}`,
        cursor: "pointer",
      }}
      onClick={toggle}
      onKeyDown={e => {
        // Ignore keys that bubble up from the "view" links inside the card.
        if (e.target !== e.currentTarget) return;
        if (e.key === "Enter" || e.key === " ") {
          e.preventDefault();
          toggle();
        }
      }}
    >
      <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
        {surprise.title}
      </div>
      <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
        {surprise.detail}
      </div>
      <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
        <div>
          <span style={{ color: "var(--green)" }}>{sideLabel(surprise, surprise.weaker)}</span>
          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
            {pct(surprise.weaker.score)}%
          </span>
        </div>
        <div style={{ color: "var(--text-muted)" }}>vs</div>
        <div>
          <span style={{ color: "var(--red)" }}>{sideLabel(surprise, surprise.stronger)}</span>
          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
            {pct(surprise.stronger.score)}%
          </span>
        </div>
      </div>

      <div style={{ display: "flex", gap: "8px", marginTop: "8px", flexWrap: "wrap" }}>
        <span style={CHIP_STYLE}>
          +{pct(surprise.magnitude)}pp
        </span>
        <span style={CHIP_STYLE}>
          {runCount} run{runCount !== 1 ? "s" : ""}
        </span>
      </div>

      {expanded && (
        <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
          {surprise.configDiffs.length > 0 && (
            <div style={{ marginBottom: "8px" }}>
              <div style={SECTION_LABEL_STYLE}>Config differences</div>
              {surprise.configDiffs.map(diff => (
                <div key={diff} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div>
              ))}
            </div>
          )}

          <div style={SECTION_LABEL_STYLE}>
            Runs ({runCount})
          </div>
          {surprise.runs.map(r => (
            <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
              <span style={{ color: isWeakerSide(surprise, r) ? "var(--green)" : "var(--red)", width: "50px" }}>
                {runLabel(surprise, r)}
              </span>
              <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
                {pct(r.score)}%
              </span>
              <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
                ${r.cost.toFixed(2)}
              </span>
              <a href={`/r/${r.short_id || r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }} onClick={e => e.stopPropagation()}>
                view
              </a>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}

/**
 * Page listing every surprise found in `runs`: summary statistics, a breakdown
 * by category and axis, then one grid of cards per category.
 */
export default function SurprisesPage({ runs }: SurprisesPageProps) {
  const surprises = useMemo(() => findSurprises(runs), [runs]);

  // Aggregate stats
  const byCategory = useMemo(() => {
    const groups: Partial<Record<SurpriseCategory, Surprise[]>> = {};
    for (const s of surprises) {
      (groups[s.category] ??= []).push(s);
    }
    return groups;
  }, [surprises]);

  const axisCounts = useMemo(() => {
    const counts: Record<string, number> = {};
    for (const s of surprises) {
      counts[s.primaryAxis] = (counts[s.primaryAxis] ?? 0) + 1;
    }
    return Object.entries(counts).sort((a, b) => b[1] - a[1]);
  }, [surprises]);

  const { avgMagnitude, maxMagnitude } = useMemo(() => {
    const magnitudes = surprises.map(s => s.magnitude);
    return {
      avgMagnitude: mean(magnitudes),
      // Folded rather than spread into Math.max so the list size is not limited
      // by the engine's argument count. Every detector emits a positive gap.
      maxMagnitude: magnitudes.reduce((max, m) => Math.max(max, m), 0),
    };
  }, [surprises]);

  const orderedCategories = CATEGORY_ORDER.filter(c => byCategory[c]?.length);

  if (surprises.length === 0) {
    return (
      <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
        No surprises yet. Run more experiments with different models to find upsets.
      </div>
    );
  }

  return (
    <div>
      {/* Explanation */}
      <div className="card" style={{ padding: "16px", marginBottom: "24px" }}>
        <p style={{ fontSize: "12px", color: "var(--text-muted)", margin: 0, lineHeight: "1.6" }}>
          A "surprise" is a result that defies expectations: a weaker or cheaper model outperforming a stronger one,
          or a simpler configuration beating a more elaborate one. These findings highlight where conventional assumptions
          about model capability and configuration complexity break down. Click any card to see the runs involved.
        </p>
      </div>

      {/* Summary stats */}
      <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(180px, 1fr))", gap: "12px", marginBottom: "24px" }}>
        <div className="stat-card">
          <div className="stat-value">{surprises.length}</div>
          <div className="stat-label">Total surprises</div>
        </div>
        <div className="stat-card">
          <div className="stat-value">{pct(avgMagnitude)}pp</div>
          <div className="stat-label">Avg magnitude</div>
        </div>
        <div className="stat-card">
          <div className="stat-value">{pct(maxMagnitude)}pp</div>
          <div className="stat-label">Largest upset</div>
        </div>
        <div className="stat-card">
          <div className="stat-value">{axisCounts[0]?.[0] || "--"}</div>
          <div className="stat-label">Most surprising axis</div>
        </div>
      </div>

      {/* Breakdown by type */}
      <div className="card" style={{ padding: "16px", marginBottom: "24px" }}>
        <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "12px" }}>
          Breakdown by type
        </div>
        <div style={{ display: "flex", gap: "24px", flexWrap: "wrap" }}>
          {orderedCategories.map(cat => (
            <div key={cat} style={{ display: "flex", alignItems: "baseline", gap: "8px" }}>
              <span style={{
                width: "8px",
                height: "8px",
                borderRadius: "2px",
                background: CATEGORY_COLORS[cat],
                display: "inline-block",
                flexShrink: 0,
                position: "relative",
                top: "-1px",
              }} />
              <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "14px" }}>
                {byCategory[cat]?.length || 0}
              </span>
              <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>
                {CATEGORY_LABELS[cat]}
              </span>
            </div>
          ))}
        </div>
        {axisCounts.length > 1 && (
          <div style={{ marginTop: "12px", paddingTop: "12px", borderTop: "1px solid var(--border)" }}>
            <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "8px" }}>
              Surprises by axis
            </div>
            <div style={{ display: "flex", gap: "16px", flexWrap: "wrap" }}>
              {axisCounts.map(([axis, count]) => (
                <div key={axis} style={{ display: "flex", alignItems: "baseline", gap: "6px" }}>
                  <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "13px" }}>
                    {count}
                  </span>
                  <span style={{ fontSize: "11px", color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
                    {axis}
                  </span>
                </div>
              ))}
            </div>
          </div>
        )}
      </div>

      {/* Grouped surprise cards */}
      {orderedCategories.map(cat => (
        <div key={cat} style={{ marginBottom: "32px" }}>
          <h3 style={{ marginBottom: "4px" }}>{CATEGORY_LABELS[cat]}</h3>
          <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
            {CATEGORY_DESCRIPTIONS[cat]}
          </p>
          <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
            {(byCategory[cat] ?? []).map(s => (
              <SurpriseCard key={surpriseKey(s)} surprise={s} />
            ))}
          </div>
        </div>
      ))}
    </div>
  );
}