Grid.tsx (14070B)
import { useState, useMemo } from "react";
import type { Run, AxisName } from "../lib/types";
import Filters from "./Filters";

/** Props accepted by the results grid. */
interface GridProps {
  /** Every run available for display, before any filtering. */
  runs: Run[];
  /** Selectable values per axis; forwarded unchanged to `Filters`. */
  axisValues: Record<AxisName, string[]>;
  /** Task names; forwarded unchanged to `Filters`. */
  tasks: string[];
}

/**
 * Maps a score to the CSS class used to colour a score cell.
 *
 * @returns `"score-high"` for scores >= 0.7, `"score-mid"` for scores >= 0.4,
 *   `"score-low"` otherwise, and `""` when the score is missing.
 */
function scoreClass(score: number | null | undefined): string {
  if (score === null || score === undefined) return "";
  if (score >= 0.7) return "score-high";
  if (score >= 0.4) return "score-mid";
  return "score-low";
}

/** Formats a score as a whole-number percentage (score * 100), or `"-"` when missing. */
function formatScore(score: number | null | undefined): string {
  if (score === null || score === undefined) return "-";
  return (score * 100).toFixed(0) + "%";
}

/** Formats a cost with a `$` prefix and two decimals, or `"-"` when missing. */
function formatCost(cost: number | null | undefined): string {
  if (cost === null || cost === undefined) return "-";
  return "$" + cost.toFixed(2);
}

/**
 * Renders the compact label shown in the "Run ID" column of the per-run view:
 * a task badge followed by the model, prompt style and language.
 */
function formatRunId(run: Run): React.ReactNode {
  const m = run.meta;
  return (
    <span style={{ display: "inline-flex", gap: "4px", alignItems: "center", flexWrap: "wrap" }}>
      <span className="badge badge-neutral" style={{ fontSize: "0.7rem" }}>{m.task}</span>
      <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}>
        {m.actual_model || m.model} {m.prompt_style} {m.language}
      </span>
    </span>
  );
}

/**
 * Formats a duration in seconds as `"42s"` or `"3m 7s"`, or `"-"` when missing.
 *
 * The value is rounded to whole seconds first so that fractional inputs do not
 * leak decimals (or floating-point noise from `%`) into the table.
 */
function formatTime(seconds: number | null | undefined): string {
  if (seconds === null || seconds === undefined) return "-";
  const whole = Math.round(seconds);
  if (whole < 60) return whole + "s";
  return Math.floor(whole / 60) + "m " + (whole % 60) + "s";
}

/** Columns the table can be sorted by. */
type SortKey = "task" | "model" | "effort" | "prompt" | "lang" | "score" | "cost" | "time" | "turns";

/** Header text for each sortable column. */
const COLUMN_LABELS: Record<SortKey, string> = {
  task: "Task", model: "Model", effort: "Effort", prompt: "Prompt",
  lang: "Lang", score: "Score", cost: "Cost", time: "Time", turns: "Turns",
};

/** Sortable columns rendered before the (non-sortable) "Pass" column. */
const LEADING_SORT_KEYS: SortKey[] = ["task", "model", "effort", "prompt", "lang", "score"];

/** Sortable columns rendered after the "Pass" column. */
const TRAILING_SORT_KEYS: SortKey[] = ["cost", "time", "turns"];

/**
 * Returns the value a single run is ordered by for the given column.
 * Missing numeric values map to -1.
 */
function getSortValue(run: Run, key: SortKey): string | number {
  switch (key) {
    case "task": return run.meta.task;
    case "model": return run.meta.actual_model || run.meta.model;
    case "effort": return run.meta.effort;
    case "prompt": return run.meta.prompt_style;
    case "lang": return run.meta.language;
    case "score": return run.eval_results?.score ?? -1;
    case "cost": return run.claude_output?.total_cost_usd ?? -1;
    case "time": return run.meta.wall_time_seconds ?? -1;
    case "turns": return run.claude_output?.num_turns ?? -1;
  }
}

/** Three-way comparison using `<` / `>`; yields -1, 0 or 1. */
function compareValues(a: string | number, b: string | number): number {
  return a < b ? -1 : a > b ? 1 : 0;
}

/** Aggregate of all runs that share one `cell_id`. */
interface CellGroup {
  cell_id: string;
  runs: Run[];
  /** Metadata of the first run in the group, used as the group's representative. */
  meta: Run["meta"];
  /** Non-null values collected from the runs; may be shorter than `runs`. */
  scores: number[];
  costs: number[];
  times: number[];
  turns: number[];
  /** Arithmetic means of the lists above; 0 when the corresponding list is empty. */
  avg: { score: number; cost: number; time: number; turns: number };
  /** Minimum score/cost; 0 when the corresponding list is empty. */
  min: { score: number; cost: number };
  /** Maximum score/cost; 0 when the corresponding list is empty. */
  max: { score: number; cost: number };
}

/**
 * Buckets runs by `meta.cell_id` and computes per-bucket statistics.
 *
 * Groups are returned in the order their first run appears in `runs`. A `Map`
 * is used rather than a plain object because object enumeration moves
 * integer-like keys to the front, which would break that ordering.
 */
function groupByCellId(runs: Run[]): CellGroup[] {
  const byCell = new Map<string, Run[]>();
  for (const run of runs) {
    // String() mirrors the coercion a plain-object key would have applied.
    const cellId = String(run.meta.cell_id);
    const bucket = byCell.get(cellId);
    if (bucket) {
      bucket.push(run);
    } else {
      byCell.set(cellId, [run]);
    }
  }

  const mean = (values: number[]): number =>
    values.length > 0 ? values.reduce((sum, v) => sum + v, 0) / values.length : 0;

  return Array.from(byCell, ([cell_id, cellRuns]): CellGroup => {
    const scores = cellRuns.map((r) => r.eval_results?.score).filter((s): s is number => s != null);
    const costs = cellRuns.map((r) => r.claude_output?.total_cost_usd).filter((c): c is number => c != null);
    const times = cellRuns.map((r) => r.meta.wall_time_seconds).filter((t): t is number => t != null);
    const turnsList = cellRuns.map((r) => r.claude_output?.num_turns).filter((t): t is number => t != null);
    return {
      cell_id,
      runs: cellRuns,
      // Every bucket is created with one run, so index 0 always exists.
      meta: cellRuns[0].meta,
      scores,
      costs,
      times,
      turns: turnsList,
      avg: { score: mean(scores), cost: mean(costs), time: mean(times), turns: mean(turnsList) },
      min: { score: scores.length > 0 ? Math.min(...scores) : 0, cost: costs.length > 0 ? Math.min(...costs) : 0 },
      max: { score: scores.length > 0 ? Math.max(...scores) : 0, cost: costs.length > 0 ? Math.max(...costs) : 0 },
    };
  });
}

/**
 * Returns the value a group is ordered by for the given column.
 *
 * Numeric columns use the group average, or -1 when no run in the group has a
 * value (the same sentinel `getSortValue` uses for a single run). Text columns
 * use the group's first run.
 */
function getGroupSortValue(group: CellGroup, key: SortKey): string | number {
  switch (key) {
    case "score": return group.scores.length > 0 ? group.avg.score : -1;
    case "cost": return group.costs.length > 0 ? group.avg.cost : -1;
    case "time": return group.times.length > 0 ? group.avg.time : -1;
    case "turns": return group.turns.length > 0 ? group.avg.turns : -1;
    default: return getSortValue(group.runs[0], key);
  }
}

/**
 * True when a run spent at least 95% of a positive `max_budget_usd`, or exited
 * with code 124. The table labels this condition "Budget or time limit reached".
 */
function hitBudgetOrTimeLimit(run: Run): boolean {
  const cost = run.claude_output?.total_cost_usd ?? 0;
  const budget = run.meta.max_budget_usd ?? 0;
  return (budget > 0 && cost >= budget * 0.95) || run.meta.exit_code === 124;
}

/**
 * Shows an average in bold, followed by a small `min-max` range when the two
 * differ. The range is highlighted in yellow when the spread is "wide":
 * more than 0.1 for scores (detected by `format` being `formatScore` itself),
 * otherwise more than 30% of the average.
 */
function RangeCell({ min, max, avg, format }: { min: number; max: number; avg: number; format: (v: number) => string }) {
  const spread = max - min;
  const isWide = format === formatScore ? spread > 0.1 : spread > avg * 0.3;
  return (
    <span style={{ fontFamily: "var(--font-mono)" }}>
      <span style={{ fontWeight: 600 }}>{format(avg)}</span>
      {min !== max && (
        <span style={{ fontSize: "0.65rem", color: isWide ? "var(--yellow)" : "var(--text-muted)", marginLeft: "4px" }}>
          {format(min)}-{format(max)}
        </span>
      )}
    </span>
  );
}

/** Clickable column header showing an arrow when it is the active sort column. */
function SortableHeader({ columnKey, sortKey, sortAsc, onSort }: {
  columnKey: SortKey;
  sortKey: SortKey;
  sortAsc: boolean;
  onSort: (key: SortKey) => void;
}) {
  return (
    <th
      onClick={() => onSort(columnKey)}
      style={{ cursor: "pointer", userSelect: "none" }}
    >
      {COLUMN_LABELS[columnKey]} {sortKey === columnKey ? (sortAsc ? "\u25B2" : "\u25BC") : ""}
    </th>
  );
}

/**
 * Pass-count badge for a group: green when every run passed, red when every
 * run failed, neutral otherwise (including runs with no functional result).
 */
function GroupPassBadge({ runs }: { runs: Run[] }) {
  const passes = runs.filter((r) => r.eval_results?.functional?.pass === true).length;
  const fails = runs.filter((r) => r.eval_results?.functional?.pass === false).length;
  const total = runs.length;
  if (passes === total) return <span className="badge badge-pass">{passes}/{total}</span>;
  if (fails === total) return <span className="badge badge-fail">0/{total}</span>;
  return <span className="badge badge-neutral">{passes}/{total}</span>;
}

/**
 * Filterable, sortable table of benchmark runs.
 *
 * Two views are available via the toggle button: one row per configuration
 * (runs grouped by `cell_id`, showing averages and ranges) or one row per run.
 * Metrics with no data render as "-" in both views.
 */
export default function Grid({ runs, axisValues, tasks }: GridProps) {
  const [filters, setFilters] = useState<Record<string, string>>({});
  const [sortKey, setSortKey] = useState<SortKey>("score");
  const [sortAsc, setSortAsc] = useState(false);
  const [grouped, setGrouped] = useState(true);

  // Clicking the active column flips direction; a new column starts descending.
  const handleSort = (key: SortKey) => {
    if (sortKey === key) {
      setSortAsc((prev) => !prev);
    } else {
      setSortKey(key);
      setSortAsc(false);
    }
  };

  const filteredRuns = useMemo(() => {
    const filtered = runs.filter((run) => {
      for (const [key, value] of Object.entries(filters)) {
        if (key === "task") {
          if (run.meta.task !== value) return false;
        } else {
          if (String(run.meta[key as keyof typeof run.meta]) !== value)
            return false;
        }
      }
      return true;
    });

    // `filter` returned a fresh array, so sorting in place does not touch `runs`.
    return filtered.sort((a, b) => {
      const cmp = compareValues(getSortValue(a, sortKey), getSortValue(b, sortKey));
      return sortAsc ? cmp : -cmp;
    });
  }, [runs, filters, sortKey, sortAsc]);

  const cellGroups = useMemo(() => {
    const groups = groupByCellId(filteredRuns);
    return groups.sort((a, b) => {
      const cmp = compareValues(getGroupSortValue(a, sortKey), getGroupSortValue(b, sortKey));
      return sortAsc ? cmp : -cmp;
    });
  }, [filteredRuns, sortKey, sortAsc]);

  // Shared placeholder row; 11 = Run ID + 6 leading columns + Pass + 3 trailing columns.
  const emptyRow = (
    <tr>
      <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}>
        {runs.length === 0 ? "No benchmark results yet." : "No results match the current filters."}
      </td>
    </tr>
  );

  return (
    <div>
      <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "12px" }}>
        <Filters
          axisValues={axisValues}
          tasks={tasks}
          onFilterChange={setFilters}
        />
        <button
          onClick={() => setGrouped((prev) => !prev)}
          style={{
            padding: "4px 10px", fontSize: "11px", fontFamily: "var(--font-mono)",
            background: "transparent", border: "1px solid hsl(var(--border))",
            color: "hsl(var(--foreground))", cursor: "pointer", textTransform: "uppercase",
            letterSpacing: "0.5px", flexShrink: 0,
          }}
        >
          {grouped ? "Show individual runs" : "Group by config"}
        </button>
      </div>

      <div className="card" style={{ overflowX: "auto" }}>
        <table>
          <thead>
            <tr>
              <th>Run ID</th>
              {LEADING_SORT_KEYS.map((key) => (
                <SortableHeader key={key} columnKey={key} sortKey={sortKey} sortAsc={sortAsc} onSort={handleSort} />
              ))}
              <th>Pass</th>
              {TRAILING_SORT_KEYS.map((key) => (
                <SortableHeader key={key} columnKey={key} sortKey={sortKey} sortAsc={sortAsc} onSort={handleSort} />
              ))}
            </tr>
          </thead>
          <tbody>
            {grouped ? (
              cellGroups.length === 0 ? (
                emptyRow
              ) : (
                cellGroups.map((g) => (
                  <tr key={g.cell_id}>
                    <td>
                      <div style={{ fontSize: "0.75rem", display: "flex", alignItems: "center", gap: "6px" }}>
                        <a href={`/c/${g.runs[0]?.meta.short_cell_id || g.cell_id}`} style={{ color: "var(--accent)", fontSize: "0.65rem", textTransform: "uppercase", letterSpacing: "0.5px", opacity: 0.7 }} title="View cell detail">cell</a>
                        <span style={{ color: "var(--text-muted)", fontSize: "0.6rem", fontFamily: "var(--font-mono)", opacity: g.runs.length < 3 ? 0.5 : 0.8 }}>n={g.runs.length}</span>
                        {g.runs.map((r, i) => (
                          <span key={r.meta.run_id}>
                            {i > 0 && " "}
                            <a href={`/r/${r.meta.short_id || r.meta.run_id}`} style={{ color: "var(--accent)" }}>#{r.meta.run_number}</a>
                          </span>
                        ))}
                      </div>
                    </td>
                    <td>
                      {g.meta.task}
                      {g.runs.some(hitBudgetOrTimeLimit) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>}
                    </td>
                    <td><span className="badge badge-neutral">{g.meta.actual_model || g.meta.model}</span></td>
                    <td>{g.meta.effort}</td>
                    <td>{g.meta.prompt_style}</td>
                    <td>{g.meta.language}</td>
                    {/* No scored runs: no colour class and "-" instead of a misleading 0%. */}
                    <td className={`score-cell ${scoreClass(g.scores.length > 0 ? g.avg.score : null)}`}>
                      {g.scores.length > 0 ? (
                        <RangeCell min={g.min.score} max={g.max.score} avg={g.avg.score} format={formatScore} />
                      ) : (
                        "-"
                      )}
                    </td>
                    <td>
                      <GroupPassBadge runs={g.runs} />
                    </td>
                    <td>
                      {g.costs.length > 0 ? (
                        <RangeCell min={g.min.cost} max={g.max.cost} avg={g.avg.cost} format={formatCost} />
                      ) : (
                        "-"
                      )}
                    </td>
                    <td>{g.times.length > 0 ? formatTime(g.avg.time) : "-"}</td>
                    <td>{g.turns.length > 0 ? Math.round(g.avg.turns) : "-"}</td>
                  </tr>
                ))
              )
            ) : (
              filteredRuns.length === 0 ? (
                emptyRow
              ) : (
                filteredRuns.map((run) => (
                  <tr key={run.meta.run_id}>
                    <td>
                      <a href={`/r/${run.meta.short_id || run.meta.run_id}`} style={{ fontSize: "0.75rem" }}>
                        {formatRunId(run)}
                      </a>
                    </td>
                    <td>{run.meta.task}</td>
                    <td><span className="badge badge-neutral">{run.meta.actual_model || run.meta.model}</span></td>
                    <td>{run.meta.effort}</td>
                    <td>{run.meta.prompt_style}</td>
                    <td>{run.meta.language}</td>
                    <td className={`score-cell ${scoreClass(run.eval_results?.score)}`}>
                      {formatScore(run.eval_results?.score)}
                    </td>
                    <td>
                      {run.eval_results?.functional?.pass === true ? (
                        <span className="badge badge-pass">PASS</span>
                      ) : run.eval_results?.functional?.pass === false ? (
                        <span className="badge badge-fail">FAIL</span>
                      ) : (
                        <span className="badge badge-neutral">-</span>
                      )}
                    </td>
                    <td>{formatCost(run.claude_output?.total_cost_usd)}</td>
                    <td>{formatTime(run.meta.wall_time_seconds)}</td>
                    <td>{run.claude_output?.num_turns ?? "-"}</td>
                  </tr>
                ))
              )
            )}
          </tbody>
        </table>
        <div style={{ padding: "12px", color: "var(--text-muted)", fontSize: "0.75rem" }}>
          {grouped
            ? `${cellGroups.length} configs (${filteredRuns.length} runs)`
            : `${filteredRuns.length} of ${runs.length} runs`}
        </div>
      </div>
    </div>
  );
}