Variability.tsx (21521B)
import { useMemo } from "react";
import type { Run, AxisName } from "../lib/types";
import { AXIS_NAMES } from "../lib/types";
import { groupIntoCells } from "../lib/analysis";
import type { Cell } from "../lib/analysis";

/** Props for the top-level {@link Variability} view. */
interface VariabilityProps {
  runs: Run[];
}

/** Human-readable labels for axis keys; axes missing here are shown by their raw key. */
const AXIS_LABELS: Record<string, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  tool_read: "Read Tool",
  tool_write: "Write Tool",
  tool_edit: "Edit Tool",
  tool_glob: "Glob Tool",
  tool_grep: "Grep Tool",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  web_search: "Web Search",
  max_budget: "Budget",
  tests_provided: "Tests Provided",
  strategy: "Strategy",
  design_guidance: "Design Guidance",
  architecture: "Architecture",
  error_checking: "Error Checking",
  context_noise: "Context Noise",
  renderer: "Renderer",
  provider: "Provider",
};

/** Display order for known models in the box-plot section; unknown models sort last. */
const MODEL_ORDER: Record<string, number> = { haiku: 1, sonnet: 2, opus: 3 };

/* ---------- helpers ---------- */

/**
 * Linearly interpolated quantile of an ascending-sorted array.
 *
 * @param sorted - Values sorted ascending (the caller is responsible for sorting).
 * @param q - Quantile position; callers in this file pass 0.25, 0.5 and 0.75.
 * @returns 0 for an empty array, the single element for a one-element array,
 *          otherwise the value interpolated between the two neighbouring ranks.
 */
function quantile(sorted: number[], q: number): number {
  if (sorted.length === 0) return 0;
  if (sorted.length === 1) return sorted[0];
  const pos = q * (sorted.length - 1);
  const lo = Math.floor(pos);
  const hi = Math.ceil(pos);
  if (lo === hi) return sorted[lo];
  return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]);
}

/**
 * Population variance (divides by `n`, not `n - 1`).
 *
 * @returns 0 when fewer than two values are given.
 */
function variance(values: number[]): number {
  if (values.length < 2) return 0;
  const mean = values.reduce((a, b) => a + b, 0) / values.length;
  return values.reduce((s, v) => s + (v - mean) ** 2, 0) / values.length;
}

/* ---------- Section 1: Box Plots ---------- */

/** Five-number summary plus the raw points for one model's cells. */
interface BoxPlotStats {
  model: string;
  min: number;
  q1: number;
  median: number;
  q3: number;
  max: number;
  points: number[];
  cellCount: number;
}

/**
 * Builds one box-plot summary per model from the cells' average scores.
 *
 * Cells whose `score.avg` is not strictly positive are ignored, and models
 * left with no scores are omitted from the result. Models are ordered by
 * {@link MODEL_ORDER}, with unlisted models last.
 */
function computeBoxPlots(cells: Cell[]): BoxPlotStats[] {
  const models = Array.from(new Set(cells.map((c) => c.meta.model))).sort(
    (a, b) => (MODEL_ORDER[a] ?? 99) - (MODEL_ORDER[b] ?? 99)
  );
  const results: BoxPlotStats[] = [];

  for (const model of models) {
    const modelCells = cells.filter((c) => c.meta.model === model);
    const scores = modelCells
      .map((c) => c.score.avg)
      .filter((s) => s > 0)
      .sort((a, b) => a - b);

    if (scores.length === 0) continue;

    results.push({
      model,
      min: scores[0],
      q1: quantile(scores, 0.25),
      median: quantile(scores, 0.5),
      q3: quantile(scores, 0.75),
      max: scores[scores.length - 1],
      points: scores,
      cellCount: scores.length,
    });
  }

  return results;
}

/**
 * Renders a horizontal box plot per model on a shared scale.
 *
 * The scale runs from the smallest to the largest score across all models;
 * rows with fewer than three cells are dimmed and their label highlighted.
 */
function BoxPlotSection({ cells }: { cells: Cell[] }) {
  const stats = useMemo(() => computeBoxPlots(cells), [cells]);

  if (stats.length === 0) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        No scored cells available.
      </div>
    );
  }

  // Global scale across all models
  const globalMin = Math.min(...stats.map((s) => s.min));
  const globalMax = Math.max(...stats.map((s) => s.max));
  // Fall back to a small non-zero span so identical scores do not divide by zero.
  const range = globalMax - globalMin || 0.01;

  const toPercent = (v: number) => ((v - globalMin) / range) * 100;

  return (
    <div>
      {/* Axis labels */}
      <div
        style={{
          display: "flex",
          justifyContent: "space-between",
          marginBottom: "4px",
          paddingLeft: "140px",
          paddingRight: "12px",
        }}
      >
        <span style={axisLabelStyle}>{(globalMin * 100).toFixed(0)}%</span>
        <span style={axisLabelStyle}>{(globalMax * 100).toFixed(0)}%</span>
      </div>

      {stats.map((s) => {
        const isLowN = s.cellCount < 3;
        return (
          <div
            key={s.model}
            style={{
              display: "flex",
              alignItems: "center",
              marginBottom: "16px",
              gap: "12px",
              opacity: isLowN ? 0.4 : 1,
            }}
          >
            {/* Label */}
            <div style={{ width: "120px", textAlign: "right", flexShrink: 0 }}>
              <div style={labelPrimaryStyle}>{s.model}</div>
              <div style={{ ...labelSecondaryStyle, color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)" }}>
                median {(s.median * 100).toFixed(1)}% / n={s.cellCount} cell{s.cellCount !== 1 ? "s" : ""}
              </div>
            </div>

            {/* Box plot */}
            <div
              style={{
                flex: 1,
                position: "relative",
                height: "32px",
                marginRight: "12px",
              }}
            >
              {/* Background track */}
              <div
                style={{
                  position: "absolute",
                  top: "50%",
                  left: 0,
                  right: 0,
                  height: "1px",
                  background: "hsl(var(--border))",
                  transform: "translateY(-50%)",
                }}
              />

              {/* Whisker line: min to max */}
              <div
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(s.min)}%`,
                  width: `${toPercent(s.max) - toPercent(s.min)}%`,
                  height: "2px",
                  background: "var(--accent)",
                  opacity: 0.5,
                  transform: "translateY(-50%)",
                }}
              />

              {/* Min whisker cap */}
              <div
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(s.min)}%`,
                  width: "1px",
                  height: "12px",
                  background: "var(--accent)",
                  opacity: 0.5,
                  transform: "translate(-50%, -50%)",
                }}
              />

              {/* Max whisker cap */}
              <div
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(s.max)}%`,
                  width: "1px",
                  height: "12px",
                  background: "var(--accent)",
                  opacity: 0.5,
                  transform: "translate(-50%, -50%)",
                }}
              />

              {/* IQR box: Q1 to Q3 */}
              <div
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(s.q1)}%`,
                  width: `${Math.max(toPercent(s.q3) - toPercent(s.q1), 0.5)}%`,
                  height: "18px",
                  background: "var(--accent)",
                  opacity: 0.2,
                  border: "1px solid var(--accent)",
                  transform: "translateY(-50%)",
                }}
              />

              {/* Median line */}
              <div
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(s.median)}%`,
                  width: "2px",
                  height: "22px",
                  background: "var(--accent)",
                  transform: "translate(-50%, -50%)",
                }}
              />

              {/* Individual cell dots */}
              {s.points.map((p, i) => (
                <div
                  key={i}
                  style={{
                    position: "absolute",
                    top: "50%",
                    left: `${toPercent(p)}%`,
                    width: "5px",
                    height: "5px",
                    borderRadius: "50%",
                    background: "var(--accent)",
                    opacity: 0.6,
                    transform: "translate(-50%, -50%)",
                    zIndex: 1,
                  }}
                />
              ))}
            </div>
          </div>
        );
      })}
    </div>
  );
}

/* ---------- Section 2: Reliability Ranking ---------- */

/** One row of the reliability table: a single value of a single axis. */
interface ReliabilityRow {
  axis: string;
  value: string;
  avgScore: number;
  avgRange: number;
  n: number;
}

/**
 * Aggregates cells by every axis value and ranks the groups by average range.
 *
 * For each axis in `AXIS_NAMES`, cells are grouped by the stringified value of
 * that axis in `cell.meta` ("unknown" when absent). Only cells with a strictly
 * positive `score.avg` contribute, and groups with fewer than two such cells
 * are dropped. Rows are returned sorted by ascending `avgRange`.
 *
 * NOTE(review): `score.range` is defined in ../lib/analysis — presumably the
 * spread of scores between a cell's runs; confirm there.
 */
function computeReliability(cells: Cell[]): ReliabilityRow[] {
  const rows: ReliabilityRow[] = [];

  for (const axis of AXIS_NAMES) {
    const groups: Record<string, { scores: number[]; ranges: number[] }> = {};
    for (const cell of cells) {
      const val = String(
        (cell.meta as Record<string, unknown>)[axis] ?? "unknown"
      );
      const g = (groups[val] ??= { scores: [], ranges: [] });
      if (cell.score.avg > 0) {
        g.scores.push(cell.score.avg);
        g.ranges.push(cell.score.range);
      }
    }

    for (const [val, { scores, ranges }] of Object.entries(groups)) {
      if (scores.length < 2) continue;
      rows.push({
        axis,
        value: val,
        avgScore: scores.reduce((a, b) => a + b, 0) / scores.length,
        avgRange: ranges.reduce((a, b) => a + b, 0) / ranges.length,
        n: scores.length,
      });
    }
  }

  return rows.sort((a, b) => a.avgRange - b.avgRange);
}

/** Maps an average range to a traffic-light CSS colour (<= 0.05 green, <= 0.12 yellow, else red). */
function reliabilityColor(avgRange: number): string {
  if (avgRange <= 0.05) return "var(--green)";
  if (avgRange <= 0.12) return "var(--yellow)";
  return "var(--red)";
}

/**
 * Renders the reliability ranking as a table with a bar per row.
 *
 * Bar widths are relative to the largest average range in the table; rows
 * built from fewer than three cells are dimmed and their N highlighted.
 */
function ReliabilitySection({ cells }: { cells: Cell[] }) {
  const rows = useMemo(() => computeReliability(cells), [cells]);

  if (rows.length === 0) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        Not enough multi-run cells to compute reliability.
      </div>
    );
  }

  // The 0.01 floor keeps the bar scale finite when every range is zero.
  const maxRange = Math.max(...rows.map((r) => r.avgRange), 0.01);

  return (
    <div style={{ overflowX: "auto" }}>
      <table style={{ borderCollapse: "collapse", width: "100%" }}>
        <thead>
          <tr>
            {["VARIABLE", "VALUE", "N", "AVG SCORE", "AVG RANGE", "RELIABILITY"].map(
              (h) => (
                <th key={h} style={thStyle}>
                  {h}
                </th>
              )
            )}
          </tr>
        </thead>
        <tbody>
          {rows.map((row, i) => {
            const barWidth = (row.avgRange / maxRange) * 100;
            const color = reliabilityColor(row.avgRange);
            const isLowN = row.n < 3;
            return (
              <tr
                key={`${row.axis}-${row.value}`}
                style={{
                  borderBottom: "1px solid hsl(var(--border))",
                  background:
                    i % 2 === 0 ? "transparent" : "hsl(var(--border) / 0.1)",
                  opacity: isLowN ? 0.4 : 1,
                }}
              >
                <td style={tdStyle}>
                  {AXIS_LABELS[row.axis] ?? row.axis}
                </td>
                <td style={{ ...tdStyle, fontFamily: "var(--font-mono)" }}>
                  {row.value}
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                    color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)",
                    fontWeight: isLowN ? 600 : 400,
                  }}
                >
                  {row.n}
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                  }}
                >
                  {(row.avgScore * 100).toFixed(1)}%
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                    color,
                  }}
                >
                  {(row.avgRange * 100).toFixed(1)}%
                </td>
                <td style={{ ...tdStyle, width: "200px" }}>
                  <div
                    style={{
                      position: "relative",
                      height: "12px",
                      background: "hsl(var(--border) / 0.2)",
                    }}
                  >
                    <div
                      style={{
                        position: "absolute",
                        top: 0,
                        left: 0,
                        height: "100%",
                        width: `${Math.max(barWidth, 1)}%`,
                        background: color,
                        opacity: 0.7,
                      }}
                    />
                  </div>
                </td>
              </tr>
            );
          })}
        </tbody>
      </table>
    </div>
  );
}

/* ---------- Section 3: Variance Contribution ---------- */

/** Split of total run-score variance into between-cell and within-cell parts. */
interface VarianceDecomp {
  totalVariance: number;
  withinVariance: number;
  betweenVariance: number;
  betweenPct: number;
  withinPct: number;
}

/**
 * Splits the variance of individual run scores into a within-cell part
 * (run-to-run randomness) and a between-cell part (config choices).
 *
 * Only runs with a strictly positive `eval_results.score` are considered.
 * The within-cell variance is pooled: each cell contributes its sum of
 * squared deviations from its own mean, and the total is divided by the
 * number of scored runs across all cells. Weighting by cell size puts the
 * within-cell figure on the same scale as the total variance, so the two
 * shares are fractions of one whole. (An unweighted mean of per-cell
 * variances can exceed the total variance, giving a share above 100%.)
 *
 * @param runs - All runs; used for the total variance.
 * @param cells - The same runs grouped into cells; used for the within part.
 * @returns `null` when there are fewer than two scored runs or the total
 *          variance is zero; otherwise shares with
 *          `betweenPct + withinPct === 1` (up to floating-point rounding).
 */
function computeVarianceDecomp(
  runs: Run[],
  cells: Cell[]
): VarianceDecomp | null {
  // All individual run scores
  const allScores = runs
    .map((r) => r.eval_results?.score ?? null)
    .filter((s): s is number => s !== null && s > 0);

  if (allScores.length < 2) return null;

  const totalVar = variance(allScores);
  if (totalVar === 0) return null;

  // Within-cell variance: pooled sum of squares over every scored run.
  let withinSumSquares = 0;
  let pooledCount = 0;
  for (const cell of cells) {
    const scores = cell.runs
      .map((r) => r.eval_results?.score ?? null)
      .filter((s): s is number => s !== null && s > 0);
    if (scores.length === 0) continue;
    pooledCount += scores.length;
    // variance() is the population variance, so variance * n is the cell's
    // sum of squares; single-run cells contribute 0 but still count as runs.
    withinSumSquares += variance(scores) * scores.length;
  }

  const pooledWithinVar = pooledCount > 0 ? withinSumSquares / pooledCount : 0;

  // Clamp as a safety net in case `cells` does not cover exactly the same
  // runs as `runs` (or for rounding), so shares always stay within [0, 1].
  const withinVar = Math.min(pooledWithinVar, totalVar);
  const betweenVar = totalVar - withinVar;
  const withinPct = withinVar / totalVar;
  const betweenPct = 1 - withinPct;

  return {
    totalVariance: totalVar,
    withinVariance: withinVar,
    betweenVariance: betweenVar,
    betweenPct,
    withinPct,
  };
}

/**
 * Renders the variance decomposition as a two-segment stacked bar, a legend
 * and a one-sentence interpretation. Segment labels are shown only when the
 * segment is wider than 8%.
 */
function VarianceSection({
  runs,
  cells,
}: {
  runs: Run[];
  cells: Cell[];
}) {
  const decomp = useMemo(
    () => computeVarianceDecomp(runs, cells),
    [runs, cells]
  );

  if (!decomp) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        Not enough data to decompose variance.
      </div>
    );
  }

  return (
    <div>
      <div
        style={{
          display: "flex",
          height: "32px",
          marginBottom: "12px",
          border: "1px solid hsl(var(--border))",
        }}
      >
        {/* Between-cell (config choices) */}
        <div
          style={{
            width: `${decomp.betweenPct * 100}%`,
            background: "var(--accent)",
            opacity: 0.7,
            display: "flex",
            alignItems: "center",
            justifyContent: "center",
            fontSize: "11px",
            fontFamily: "var(--font-mono)",
            color: "var(--text)",
            fontWeight: 600,
            minWidth: decomp.betweenPct > 0.08 ? undefined : "0px",
            overflow: "hidden",
            whiteSpace: "nowrap",
          }}
        >
          {decomp.betweenPct > 0.08 &&
            `${(decomp.betweenPct * 100).toFixed(0)}%`}
        </div>
        {/* Within-cell (randomness) */}
        <div
          style={{
            width: `${decomp.withinPct * 100}%`,
            background: "var(--yellow)",
            opacity: 0.5,
            display: "flex",
            alignItems: "center",
            justifyContent: "center",
            fontSize: "11px",
            fontFamily: "var(--font-mono)",
            color: "var(--text)",
            fontWeight: 600,
            minWidth: decomp.withinPct > 0.08 ? undefined : "0px",
            overflow: "hidden",
            whiteSpace: "nowrap",
          }}
        >
          {decomp.withinPct > 0.08 &&
            `${(decomp.withinPct * 100).toFixed(0)}%`}
        </div>
      </div>

      <div
        style={{
          display: "flex",
          gap: "24px",
          flexWrap: "wrap",
        }}
      >
        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
          <div
            style={{
              width: "12px",
              height: "12px",
              background: "var(--accent)",
              opacity: 0.7,
            }}
          />
          <span style={legendStyle}>
            CONFIG CHOICES: {(decomp.betweenPct * 100).toFixed(0)}%
          </span>
        </div>
        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
          <div
            style={{
              width: "12px",
              height: "12px",
              background: "var(--yellow)",
              opacity: 0.5,
            }}
          />
          <span style={legendStyle}>
            RANDOMNESS: {(decomp.withinPct * 100).toFixed(0)}%
          </span>
        </div>
      </div>

      <p
        style={{
          marginTop: "12px",
          fontSize: "12px",
          color: "var(--text-muted)",
          lineHeight: "1.5",
        }}
      >
        {decomp.betweenPct >= 0.5
          ? `Configuration choices explain ${(decomp.betweenPct * 100).toFixed(0)}% of score variance. The config matters more than run-to-run randomness.`
          : decomp.betweenPct >= 0.3
            ? `Configuration and randomness contribute roughly equally. Scores are moderately sensitive to config choices.`
            : `Run-to-run randomness dominates (${(decomp.withinPct * 100).toFixed(0)}%). Config choices have limited impact on scores -- results are noisy.`}
      </p>
    </div>
  );
}

/* ---------- shared styles ---------- */

/** Small uppercase eyebrow above each section title. */
const sectionHeaderStyle: React.CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.08em",
  color: "var(--text-muted)",
  marginBottom: "4px",
};

/** Section title. */
const sectionTitleStyle: React.CSSProperties = {
  fontSize: "16px",
  fontWeight: 600,
  marginBottom: "4px",
};

/** Explanatory paragraph under each section title. */
const sectionDescStyle: React.CSSProperties = {
  fontSize: "12px",
  color: "var(--text-muted)",
  marginBottom: "16px",
  lineHeight: "1.4",
};

/** Bordered card wrapping each section. */
const cardStyle: React.CSSProperties = {
  border: "1px solid hsl(var(--border))",
  padding: "20px",
  marginBottom: "16px",
  background: "var(--surface-1)",
};

/** Min/max labels above the box-plot scale. */
const axisLabelStyle: React.CSSProperties = {
  fontSize: "10px",
  fontFamily: "var(--font-mono)",
  color: "var(--text-muted)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
};

/** Model name in a box-plot row label. */
const labelPrimaryStyle: React.CSSProperties = {
  fontSize: "13px",
  fontFamily: "var(--font-mono)",
  fontWeight: 600,
  color: "var(--text)",
};

/** Median / sample-size line in a box-plot row label. */
const labelSecondaryStyle: React.CSSProperties = {
  fontSize: "10px",
  fontFamily: "var(--font-mono)",
  color: "var(--text-muted)",
  marginTop: "2px",
};

/** Reliability table header cell. */
const thStyle: React.CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
  color: "var(--text-muted)",
  padding: "8px 12px",
  textAlign: "left",
  borderBottom: "1px solid hsl(var(--border))",
  fontWeight: 500,
};

/** Reliability table body cell. */
const tdStyle: React.CSSProperties = {
  fontSize: "12px",
  padding: "6px 12px",
  color: "var(--text)",
};

/** Legend text under the variance bar. */
const legendStyle: React.CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
  color: "var(--text-muted)",
};

/* ---------- Main Component ---------- */

/**
 * Variability dashboard: groups runs into cells and shows three cards —
 * per-model score distribution, reliability ranking by axis value, and the
 * between/within-cell variance split. Shows a placeholder when `runs` is empty.
 */
export default function Variability({ runs }: VariabilityProps) {
  // Hook is called before the early return so the hook order stays stable.
  const cells = useMemo(() => groupIntoCells(runs), [runs]);

  if (runs.length === 0) {
    return (
      <div
        style={{
          ...cardStyle,
          textAlign: "center",
          padding: "40px",
          color: "var(--text-muted)",
        }}
      >
        No runs available for variability analysis.
      </div>
    );
  }

  return (
    <div>
      {/* Overall sample size subtitle */}
      <div style={{ fontSize: "10px", fontFamily: "'JetBrains Mono', monospace", color: "var(--text-muted, hsl(213 14% 65%))", marginBottom: "8px" }}>
        (n={runs.length} runs across {cells.length} cells)
      </div>
      {/* Section 1: Box Plots */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>CONSISTENCY</div>
        <div style={sectionTitleStyle}>Score Distribution by Model</div>
        <p style={sectionDescStyle}>
          Each dot is a cell (unique config). The box spans Q1-Q3; the line
          marks the median. Tighter boxes mean more consistent results across
          configs.
        </p>
        <BoxPlotSection cells={cells} />
      </div>

      {/* Section 2: Reliability Ranking */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>RELIABILITY</div>
        <div style={sectionTitleStyle}>Reliability Ranking by Variable</div>
        <p style={sectionDescStyle}>
          How much do repeat runs of the same config vary? Sorted by average
          range (smallest = most reliable). Green means scores are consistent
          across re-runs; red means volatile.
        </p>
        <ReliabilitySection cells={cells} />
      </div>

      {/* Section 3: Variance Decomposition */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>VARIANCE</div>
        <div style={sectionTitleStyle}>Variance Contribution</div>
        <p style={sectionDescStyle}>
          ANOVA-style decomposition: how much of the total score variance comes
          from config choices (between cells) vs run-to-run randomness (within
          cells)?
        </p>
        <VarianceSection runs={runs} cells={cells} />
      </div>
    </div>
  );
}