// CellDetail.tsx (23706B)
import type { Run, AxisName } from "../lib/types";

/** Props for {@link CellDetail}. */
interface CellDetailProps {
  /** Runs to aggregate and compare; the first run's `meta` drives the configuration card. */
  runs: Run[];
  /** Every known value per axis, used to render the inactive configuration pills. */
  axisValues: Record<AxisName, string[]>;
}

/** Axes shown in the Configuration card, in display order. `key` is looked up on `run.meta`. */
const AXIS_CONFIG: Array<{ key: string; label: string }> = [
  { key: "model", label: "Model" },
  { key: "effort", label: "Effort" },
  { key: "prompt_style", label: "Prompt" },
  { key: "language", label: "Language" },
  { key: "human_language", label: "Human Lang" },
  { key: "tool_read", label: "Read" },
  { key: "tool_write", label: "Write" },
  { key: "tool_edit", label: "Edit" },
  { key: "tool_glob", label: "Glob" },
  { key: "tool_grep", label: "Grep" },
  { key: "linter", label: "Linter" },
  { key: "playwright", label: "Playwright" },
  { key: "context_file", label: "Context" },
  { key: "sub_agents", label: "Sub-agents" },
  { key: "web_search", label: "Web Search" },
  { key: "max_budget", label: "Budget" },
];

/** Evaluation dimensions; `key` is looked up on `run.eval_results` and its `.score` is read. */
const EVAL_DIMENSIONS: Array<{ key: string; label: string }> = [
  { key: "structural", label: "Structural" },
  { key: "functional", label: "Functional" },
  { key: "quality", label: "Quality" },
  { key: "code_analysis", label: "Code Analysis" },
  { key: "gameplay_bot", label: "Gameplay Bot" },
  { key: "transcript_analysis", label: "Transcript" },
];

/** Rows of the Agent Behavior table that are rendered as plain values. */
const AGENT_METRIC_ROWS = [
  { label: "Tool calls", key: "tool_calls" },
  { label: "Bash", key: "bash" },
  { label: "Write", key: "write" },
  { label: "Edit", key: "edit" },
  { label: "Wasted turns", key: "wasted_turns" },
  { label: "Errors", key: "errors" },
  { label: "Thinking blocks", key: "thinking_blocks" },
] as const;

// -- Helpers --

/** Arithmetic mean of `arr`; returns 0 for an empty array. */
function avg(arr: number[]): number {
  return arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
}

/** Median of `arr` (mean of the two middle values for even lengths); returns 0 for an empty array. */
function median(arr: number[]): number {
  if (arr.length === 0) return 0;
  // Sort a copy so the caller's array is not mutated.
  const sorted = [...arr].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 !== 0 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
}

/** Formats a fraction as a whole-number percentage, e.g. `0.256` -> `"26%"`. */
function formatPct(v: number): string {
  return Math.round(v * 100) + "%";
}

/** Formats a dollar amount with two decimals, or `"-"` when the value is missing. */
function formatCost(v: number | null | undefined): string {
  if (v === null || v === undefined) return "-";
  return "$" + v.toFixed(2);
}

/** Formats a duration in seconds as `"42s"` or `"3m 7s"`, or `"-"` when the value is missing. */
function formatTime(seconds: number | null | undefined): string {
  if (seconds === null || seconds === undefined) return "-";
  const s = Math.round(seconds);
  if (s < 60) return s + "s";
  return Math.floor(s / 60) + "m " + (s % 60) + "s";
}

/** Maps a percentage (0-100 scale) to a CSS colour variable: >=70 green, >=40 yellow, else red. */
function scoreColor(pct: number): string {
  if (pct >= 70) return "var(--green)";
  if (pct >= 40) return "var(--yellow)";
  return "var(--red)";
}

/**
 * Reads `run.eval_results[key].score`, or `null` when any step of the path is missing.
 * The cast mirrors how the rest of this file accesses per-dimension results; the shape of
 * `eval_results` is declared elsewhere and is not validated here.
 */
function dimensionScore(run: Run, key: string): number | null {
  return (run.eval_results as Record<string, any>)?.[key]?.score ?? null;
}

// -- Sub-components --

/** One configuration axis: a right-aligned label followed by a pill per value, with the active one highlighted. */
function ConfigPills({
  label,
  activeValue,
  allValues,
}: {
  label: string;
  activeValue: string;
  allValues: string[];
}) {
  return (
    <div style={{ display: "flex", alignItems: "center", gap: "8px", marginBottom: "4px" }}>
      <div style={{
        width: "80px", fontSize: "0.65rem", color: "var(--text-muted)",
        textTransform: "uppercase", letterSpacing: "0.03em", textAlign: "right", flexShrink: 0,
      }}>
        {label}
      </div>
      <div style={{ display: "flex", gap: "3px", flexWrap: "wrap" }}>
        {allValues.map((val) => (
          <span key={val} style={{
            padding: "1px 6px",
            fontSize: "0.65rem",
            fontFamily: "var(--font-mono)",
            background: val === activeValue ? "rgba(255, 255, 255, 0.1)" : "transparent",
            color: val === activeValue ? "#fff" : "rgba(255, 255, 255, 0.2)",
            border: val === activeValue ? "1px solid rgba(255, 255, 255, 0.3)" : "1px solid rgba(255, 255, 255, 0.05)",
          }}>
            {val}
          </span>
        ))}
      </div>
    </div>
  );
}

/**
 * Horizontal bar for one score dimension: a solid fill up to the average and, when min and max
 * differ after rounding, a faint band spanning min..max. Scores are fractions (multiplied by 100 here).
 */
function RangeBar({
  label,
  avgScore,
  minScore,
  maxScore,
}: {
  label: string;
  avgScore: number;
  minScore: number;
  maxScore: number;
}) {
  const avgPct = Math.round(avgScore * 100);
  const minPct = Math.round(minScore * 100);
  const maxPct = Math.round(maxScore * 100);
  const color = scoreColor(avgPct);

  return (
    <div style={{ marginBottom: "8px" }}>
      <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.75rem", marginBottom: "2px" }}>
        <span>{label}</span>
        <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, color }}>
          {avgPct}%
          {minPct !== maxPct && (
            <span style={{ fontSize: "0.65rem", color: "var(--text-muted)", marginLeft: "4px" }}>
              {minPct}-{maxPct}%
            </span>
          )}
        </span>
      </div>
      <div style={{ position: "relative", background: "var(--bg)", height: "6px", overflow: "hidden" }}>
        {/* Range band */}
        {minPct !== maxPct && (
          <div style={{
            position: "absolute",
            left: `${minPct}%`,
            width: `${maxPct - minPct}%`,
            height: "100%",
            background: color,
            opacity: 0.2,
          }} />
        )}
        {/* Average marker */}
        <div style={{
          position: "absolute",
          left: 0,
          width: `${avgPct}%`,
          height: "100%",
          background: color,
          opacity: 0.7,
        }} />
      </div>
    </div>
  );
}

/** Small uppercase caption with a monospace value underneath (Summary card grid). */
function StatCell({ label, value }: { label: string; value: string | number }) {
  return (
    <div>
      <div style={{ color: "var(--text-muted)", fontSize: "0.6rem", textTransform: "uppercase", letterSpacing: "0.5px" }}>{label}</div>
      <div style={{ fontFamily: "var(--font-mono)", fontWeight: 600 }}>{value}</div>
    </div>
  );
}

/** One "label ... value" line in the Consistency details list. */
function DetailRow({ label, value }: { label: string; value: string }) {
  return (
    <div style={{ display: "flex", justifyContent: "space-between" }}>
      <span style={{ color: "var(--text-muted)" }}>{label}</span>
      <span style={{ fontFamily: "var(--font-mono)" }}>{value}</span>
    </div>
  );
}

// -- Main component --

/**
 * Detail view for a group of runs: summary statistics, the configuration taken from the first
 * run, a per-dimension score breakdown, a run-by-run comparison table, an artifact gallery
 * (only for the `tetris` and `bookmarks-api` tasks) and an agent-behavior table.
 *
 * Renders nothing when `runs` is empty, since there is no run to read the configuration from.
 */
export default function CellDetail({ runs, axisValues }: CellDetailProps) {
  const firstRun = runs[0];
  // Guard: previously `runs[0].meta` threw a TypeError for an empty list.
  if (!firstRun) return null;
  const meta = firstRun.meta;

  // Aggregate scores (runs that lack a metric are left out of that metric's statistics)
  const scores = runs.map(r => r.eval_results?.score).filter((s): s is number => s != null);
  const costs = runs.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null);
  const times = runs.map(r => r.meta.wall_time_seconds).filter((t): t is number => t != null);
  const turnsList = runs.map(r => r.claude_output?.num_turns).filter((t): t is number => t != null);

  const avgScore = avg(scores);
  const minScore = scores.length > 0 ? Math.min(...scores) : 0;
  const maxScore = scores.length > 0 ? Math.max(...scores) : 0;
  const medianScore = median(scores);
  const range = maxScore - minScore;
  // Sample standard deviation (n - 1 denominator); 0 when fewer than two scores exist.
  const stdDev = scores.length > 1
    ? Math.sqrt(scores.reduce((sum, s) => sum + (s - avgScore) ** 2, 0) / (scores.length - 1))
    : 0;
  // Coefficient of variation = standard deviation / mean, as a percentage.
  // (This used to divide the min-max range by the mean, which is not a CV although it was labelled as one.)
  const coefficientOfVariation = avgScore > 0 ? (stdDev / avgScore) * 100 : 0;

  let consistencyLabel: string;
  let consistencyColor: string;
  if (coefficientOfVariation < 10) {
    consistencyLabel = "Highly consistent";
    consistencyColor = "var(--green)";
  } else if (coefficientOfVariation <= 25) {
    consistencyLabel = "Moderate";
    consistencyColor = "var(--yellow)";
  } else {
    consistencyLabel = "Volatile";
    consistencyColor = "var(--red)";
  }

  const overallColor = scoreColor(Math.round(avgScore * 100));

  // `null` / "-" mean "no run reported this metric" and render as a dash.
  const minCost = costs.length > 0 ? Math.min(...costs) : null;
  const maxCost = costs.length > 0 ? Math.max(...costs) : null;
  const minTurns = turnsList.length > 0 ? Math.min(...turnsList) : "-";
  const maxTurns = turnsList.length > 0 ? Math.max(...turnsList) : "-";

  // Per-dimension aggregation
  const dimStats = EVAL_DIMENSIONS.map(({ key, label }) => {
    const dimScores = runs
      .map(r => dimensionScore(r, key))
      .filter((s): s is number => s != null);
    return {
      key,
      label,
      avg: avg(dimScores),
      min: dimScores.length > 0 ? Math.min(...dimScores) : 0,
      max: dimScores.length > 0 ? Math.max(...dimScores) : 0,
      count: dimScores.length,
    };
  });

  // Comparison table metrics: one row per metric, one value per run (same order as `runs`)
  const comparisonRows: Array<{
    label: string;
    values: (string | number | null)[];
    type: "pct" | "cost" | "time" | "number";
  }> = [
    {
      label: "Overall Score",
      values: runs.map(r => r.eval_results?.score ?? null),
      type: "pct",
    },
    ...EVAL_DIMENSIONS.map(({ key, label }) => ({
      label,
      values: runs.map(r => dimensionScore(r, key)),
      type: "pct" as const,
    })),
    {
      label: "Cost",
      values: runs.map(r => r.claude_output?.total_cost_usd ?? null),
      type: "cost" as const,
    },
    {
      label: "Turns",
      values: runs.map(r => r.claude_output?.num_turns ?? null),
      type: "number" as const,
    },
    {
      label: "Wall Time",
      values: runs.map(r => r.meta.wall_time_seconds ?? null),
      type: "time" as const,
    },
  ];

  // Agent behavior data, read from each run's `transcript_analysis` result when present
  const agentData = runs.map(r => {
    const ta = (r.eval_results as Record<string, any>)?.transcript_analysis;
    return {
      run_id: r.meta.run_id,
      short_id: r.meta.short_id,
      run_number: r.meta.run_number,
      tool_calls: ta?.tool_calls?.total ?? null,
      bash: ta?.tool_calls?.bash ?? null,
      write: ta?.tool_calls?.write ?? null,
      edit: ta?.tool_calls?.edit ?? null,
      wasted_turns: ta?.wasted_turns?.total ?? null,
      productivity: ta?.productivity_ratio ?? null,
      self_tested: ta?.self_tested ?? null,
      errors: ta?.errors_encountered ?? null,
      thinking_blocks: ta?.thinking_blocks ?? null,
    };
  });

  // Artifact check: the gallery is only shown for these two tasks
  const hasArtifact = meta.task === "tetris" || meta.task === "bookmarks-api";

  return (
    <div style={{ display: "flex", flexDirection: "column", gap: "20px" }}>

      {/* A. Header row: three cards */}
      <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr 1fr", gap: "16px" }}>

        {/* Summary stats card */}
        <div className="card" style={{ padding: "16px" }}>
          <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Summary</h3>

          {/* Score range visual */}
          <div style={{ marginBottom: "16px" }}>
            <div style={{ display: "flex", justifyContent: "space-between", alignItems: "baseline", marginBottom: "4px" }}>
              <span style={{ fontSize: "0.65rem", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px" }}>
                Score
              </span>
              <span style={{
                fontFamily: "var(--font-mono)", fontWeight: 700, fontSize: "1.75rem",
                color: overallColor,
              }}>
                {formatPct(avgScore)}
              </span>
            </div>
            <div style={{ position: "relative", background: "var(--bg)", height: "8px", marginBottom: "4px" }}>
              {/* Range band */}
              <div style={{
                position: "absolute",
                left: `${Math.round(minScore * 100)}%`,
                width: `${Math.round((maxScore - minScore) * 100)}%`,
                height: "100%",
                background: overallColor,
                opacity: 0.3,
              }} />
              {/* Avg marker */}
              <div style={{
                position: "absolute",
                left: `${Math.round(avgScore * 100)}%`,
                top: "-2px",
                width: "2px",
                height: "12px",
                background: overallColor,
              }} />
            </div>
            <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.6rem", color: "var(--text-muted)" }}>
              <span>min {formatPct(minScore)}</span>
              <span>med {formatPct(medianScore)}</span>
              <span>max {formatPct(maxScore)}</span>
            </div>
          </div>

          <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "8px", fontSize: "0.75rem" }}>
            {/* Missing metrics render as "-" rather than a misleading $0.00 / 0s */}
            <StatCell label="Avg Cost" value={formatCost(costs.length > 0 ? avg(costs) : null)} />
            <StatCell label="Avg Turns" value={turnsList.length > 0 ? Math.round(avg(turnsList)) : "-"} />
            <StatCell label="Avg Time" value={formatTime(times.length > 0 ? avg(times) : null)} />
            <StatCell label="Runs" value={runs.length} />
          </div>

          <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "8px" }}>
            <div style={{ fontSize: "0.6rem", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "2px" }}>
              Consistency
            </div>
            <div style={{ display: "flex", alignItems: "baseline", gap: "6px" }}>
              <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "0.8rem", color: consistencyColor }}>
                {consistencyLabel}
              </span>
              <span style={{ fontSize: "0.65rem", color: "var(--text-muted)" }}>
                {coefficientOfVariation.toFixed(0)}% CV
              </span>
            </div>
            <div style={{ marginTop: "8px", fontSize: "0.65rem", display: "flex", flexDirection: "column", gap: "2px" }}>
              <DetailRow label="Range" value={`${formatPct(minScore)} - ${formatPct(maxScore)}`} />
              <DetailRow label="Spread" value={`${(range * 100).toFixed(1)}pp`} />
              <DetailRow label="Std Dev" value={`${(stdDev * 100).toFixed(1)}pp`} />
              <DetailRow label="Median" value={formatPct(medianScore)} />
              <DetailRow label="Cost range" value={`${formatCost(minCost)} - ${formatCost(maxCost)}`} />
              <DetailRow label="Turns range" value={`${minTurns} - ${maxTurns}`} />
            </div>
          </div>
        </div>

        {/* Configuration card */}
        <div className="card" style={{ padding: "16px" }}>
          <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Configuration</h3>
          {AXIS_CONFIG.map(({ key, label }) => {
            const active = String((meta as unknown as Record<string, unknown>)[key] ?? "");
            // Fall back to showing just the active value when the axis has no known value list.
            const all = (axisValues as Record<string, string[]>)[key] || [active];
            if (!active) return null;
            return <ConfigPills key={key} label={label} activeValue={active} allValues={all} />;
          })}
        </div>

        {/* Variance card */}
        <div className="card" style={{ padding: "16px" }}>
          <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Score Breakdown</h3>
          {dimStats.map((d) => (
            d.count > 0 ? (
              <RangeBar
                key={d.key}
                label={d.label}
                avgScore={d.avg}
                minScore={d.min}
                maxScore={d.max}
              />
            ) : (
              <div key={d.key} style={{ marginBottom: "8px" }}>
                <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.75rem", marginBottom: "2px" }}>
                  <span>{d.label}</span>
                  <span style={{ color: "var(--text-muted)" }}>N/A</span>
                </div>
              </div>
            )
          ))}
        </div>
      </div>

      {/* B. Run comparison table */}
      <div className="card" style={{ padding: "16px", overflowX: "auto" }}>
        <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Run Comparison</h3>
        <table>
          <thead>
            <tr>
              <th style={{ textAlign: "left" }}>Metric</th>
              {runs.map((r) => (
                <th key={r.meta.run_id} style={{ textAlign: "center" }}>
                  <a href={`/r/${r.meta.short_id || r.meta.run_id}`} style={{ color: "var(--accent)" }}>
                    Run #{r.meta.run_number}
                  </a>
                </th>
              ))}
            </tr>
          </thead>
          <tbody>
            {comparisonRows.map((row) => {
              const numericVals = row.values.filter((v): v is number => v != null);
              // Lower is better for cost and time; higher is better for everything else.
              const lowerIsBetter = row.type === "cost" || row.type === "time";
              const best = numericVals.length > 0 ? (
                lowerIsBetter ? Math.min(...numericVals) : Math.max(...numericVals)
              ) : null;
              const worst = numericVals.length > 0 ? (
                lowerIsBetter ? Math.max(...numericVals) : Math.min(...numericVals)
              ) : null;
              // No highlighting when every run has the same value.
              const allSame = best !== null && best === worst;

              return (
                <tr key={row.label}>
                  <td style={{ fontSize: "0.75rem" }}>{row.label}</td>
                  {row.values.map((v, i) => {
                    let display: string;
                    if (v === null || v === undefined) {
                      display = "-";
                    } else if (row.type === "pct") {
                      display = formatPct(v as number);
                    } else if (row.type === "cost") {
                      display = formatCost(v as number);
                    } else if (row.type === "time") {
                      display = formatTime(v as number);
                    } else {
                      display = String(v);
                    }

                    let cellColor: string | undefined;
                    if (v !== null && !allSame) {
                      if (v === best) cellColor = "var(--green)";
                      else if (v === worst) cellColor = "var(--red)";
                    }

                    return (
                      <td key={i} style={{
                        textAlign: "center",
                        fontFamily: "var(--font-mono)",
                        fontSize: "0.75rem",
                        color: cellColor,
                        fontWeight: cellColor ? 600 : undefined,
                      }}>
                        {display}
                      </td>
                    );
                  })}
                </tr>
              );
            })}
          </tbody>
        </table>
      </div>

      {/* C. Artifact gallery */}
      {hasArtifact && (
        <div>
          <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Artifacts</h3>
          <div style={{ display: "grid", gridTemplateColumns: `repeat(${Math.min(runs.length, 3)}, 1fr)`, gap: "16px" }}>
            {runs.map((r) => {
              const artifactUrl = `/artifacts/${r.meta.run_id}/index.html`;
              const score = r.eval_results?.score;
              return (
                <div key={r.meta.run_id} className="card" style={{ padding: "0", overflow: "hidden" }}>
                  <div style={{
                    padding: "8px 12px",
                    borderBottom: "1px solid var(--border)",
                    fontSize: "0.75rem",
                    display: "flex",
                    justifyContent: "space-between",
                    alignItems: "center",
                  }}>
                    <a href={`/r/${r.meta.short_id || r.meta.run_id}`} style={{ color: "var(--accent)", fontWeight: 600 }}>
                      Run #{r.meta.run_number}
                    </a>
                    <span style={{
                      fontFamily: "var(--font-mono)",
                      fontWeight: 600,
                      color: score != null ? scoreColor(Math.round(score * 100)) : "var(--text-muted)",
                    }}>
                      {score != null ? formatPct(score) : "-"}
                    </span>
                  </div>
                  {/* Sandboxed with scripts only: the artifact can run its own JS but gets no same-origin access */}
                  <iframe
                    src={artifactUrl}
                    style={{
                      width: "100%",
                      height: "50vh",
                      border: "none",
                      background: "#fff",
                    }}
                    sandbox="allow-scripts"
                    title={`Run #${r.meta.run_number} preview`}
                  />
                </div>
              );
            })}
          </div>
        </div>
      )}

      {/* D. Agent behavior comparison (hidden when no run has transcript tool-call data) */}
      {agentData.some(a => a.tool_calls !== null) && (
        <div className="card" style={{ padding: "16px", overflowX: "auto" }}>
          <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Agent Behavior</h3>
          <table>
            <thead>
              <tr>
                <th style={{ textAlign: "left" }}>Metric</th>
                {agentData.map((a) => (
                  <th key={a.run_id} style={{ textAlign: "center" }}>
                    <a href={`/r/${a.short_id || a.run_id}`} style={{ color: "var(--accent)" }}>
                      Run #{a.run_number}
                    </a>
                  </th>
                ))}
              </tr>
            </thead>
            <tbody>
              {AGENT_METRIC_ROWS.map((row) => (
                <tr key={row.label}>
                  <td style={{ fontSize: "0.75rem" }}>{row.label}</td>
                  {agentData.map((a, i) => (
                    <td key={i} style={{ textAlign: "center", fontFamily: "var(--font-mono)", fontSize: "0.75rem" }}>
                      {a[row.key] ?? "-"}
                    </td>
                  ))}
                </tr>
              ))}
              <tr>
                <td style={{ fontSize: "0.75rem" }}>Productivity</td>
                {agentData.map((a, i) => (
                  <td key={i} style={{ textAlign: "center", fontFamily: "var(--font-mono)", fontSize: "0.75rem" }}>
                    {a.productivity != null ? Math.round(a.productivity * 100) + "%" : "-"}
                  </td>
                ))}
              </tr>
              <tr>
                <td style={{ fontSize: "0.75rem" }}>Self-tested</td>
                {agentData.map((a, i) => (
                  <td key={i} style={{
                    textAlign: "center",
                    fontFamily: "var(--font-mono)",
                    fontSize: "0.75rem",
                    color: a.self_tested === true ? "var(--green)" : a.self_tested === false ? "var(--text-muted)" : undefined,
                  }}>
                    {a.self_tested === true ? "yes" : a.self_tested === false ? "no" : "-"}
                  </td>
                ))}
              </tr>
            </tbody>
          </table>
        </div>
      )}
    </div>
  );
}