compare.astro (6584B)
---
import Base from "../layouts/Base.astro";
import { loadAllRuns, getAxisValues, getTaskNames, AXIS_NAMES, projectRunForIndex } from "../lib/data";
import type { Run, AxisName } from "../lib/data";
import VariabilityViolin from "../components/VariabilityViolin";

const runs = loadAllRuns();
const axisValues = getAxisValues(runs);
const tasks = getTaskNames(runs);

// Projected slim runs for the client:load VariabilityViolin island.
const slimRuns = runs.map(projectRunForIndex);

// Build comparison data using cell-based aggregation.
// A "cell" is a unique configuration (cell_id). Multiple runs share a cell_id
// when they are repeat trials of the same config. Averaging per-cell first,
// then aggregating across cells, prevents configs with more repeats from
// dominating the average.

/** One rendered table row: aggregated, pre-formatted stats for a single axis value. */
interface ComparisonRow {
  axis: string;
  value: string;
  cells: number; // number of unique configs
  runs: number; // total runs
  avg_score: string;
  score_range: string; // "68%-80%"
  avg_cost: string;
  cost_range: string; // "$0.15-$0.22"
  avg_time: string;
}

/** Per-cell averages; a metric is null when no run in the cell reported it. */
interface CellStats {
  avg_score: number | null;
  avg_cost: number | null;
  avg_time: number | null;
  run_count: number;
}

/**
 * Arithmetic mean of `values`.
 *
 * @returns the mean, or null for an empty array so callers can render "-"
 *   instead of NaN.
 */
function average(values: number[]): number | null {
  if (values.length === 0) return null;
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}

/**
 * Compute per-cell averages from a list of runs.
 *
 * Runs are grouped by `meta.cell_id`; within each group the score, cost and
 * wall time are averaged independently, skipping runs where that metric is
 * null/undefined.
 *
 * @param runs runs to aggregate
 * @returns map from cell_id to that cell's averaged stats
 */
function getCellStats(runs: Run[]): Map<string, CellStats> {
  const runsByCell = new Map<string, Run[]>();
  for (const run of runs) {
    const id = run.meta.cell_id;
    const bucket = runsByCell.get(id);
    if (bucket) {
      bucket.push(run);
    } else {
      runsByCell.set(id, [run]);
    }
  }

  const result = new Map<string, CellStats>();
  for (const [cellId, cellRuns] of runsByCell) {
    const scores = cellRuns
      .map((r) => r.eval_results?.score)
      .filter((s): s is number => s != null);
    const costs = cellRuns
      .map((r) => r.claude_output?.total_cost_usd)
      .filter((c): c is number => c != null);
    const times = cellRuns
      .map((r) => r.meta.wall_time_seconds)
      .filter((t): t is number => t != null);

    result.set(cellId, {
      avg_score: average(scores),
      avg_cost: average(costs),
      avg_time: average(times),
      run_count: cellRuns.length,
    });
  }
  return result;
}

/**
 * Format the min/max of `values` as "min-max" using `formatter`.
 *
 * @param values numbers to summarize
 * @param formatter converts a single number to its display string
 * @returns "-" for an empty array; a single formatted value when both ends of
 *   the range render identically; otherwise "<min>-<max>"
 */
function formatRange(values: number[], formatter: (v: number) => string): string {
  if (values.length === 0) return "-";

  // Fold instead of spreading into Math.min/Math.max so very large arrays
  // cannot exceed the engine's argument-count limit.
  const min = values.reduce((a, b) => Math.min(a, b), Infinity);
  const max = values.reduce((a, b) => Math.max(a, b), -Infinity);

  // Compare the *formatted* endpoints: values that differ numerically but
  // round to the same text (e.g. 0.681 vs 0.684) should not render "68%-68%".
  const low = formatter(min);
  const high = formatter(max);
  return low === high ? low : low + "-" + high;
}

const comparisons: ComparisonRow[] = [];

// Human-readable label shown in the "Axis" column for each axis name.
const AXIS_LABELS: Record<AxisName, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  tool_read: "Tool: Read",
  tool_write: "Tool: Write",
  tool_edit: "Tool: Edit",
  tool_glob: "Tool: Glob",
  tool_grep: "Tool: Grep",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  web_search: "Web Search",
  max_budget: "Budget",
  tests_provided: "Tests Provided",
  strategy: "Strategy",
  design_guidance: "Design Guidance",
  architecture: "Architecture",
  error_checking: "Error Checking",
  context_noise: "Context Noise",
  renderer: "Renderer",
  provider: "Provider",
};

// Pre-compute all cell stats once
const allCellStats = getCellStats(runs);

for (const axis of AXIS_NAMES) {
  for (const value of axisValues[axis]) {
    // Runs whose meta value for this axis stringifies to the axis value.
    const filtered = runs.filter(
      (r: Run) => String(r.meta[axis as keyof typeof r.meta]) === value
    );

    // Find the unique cell_ids in these runs and gather their stats
    const cellIds = new Set(filtered.map((r) => r.meta.cell_id));
    const matchingCells: CellStats[] = [];
    for (const id of cellIds) {
      const cs = allCellStats.get(id);
      if (cs) matchingCells.push(cs);
    }

    const cellScores = matchingCells
      .map((c) => c.avg_score)
      .filter((s): s is number => s != null);
    const cellCosts = matchingCells
      .map((c) => c.avg_cost)
      .filter((c): c is number => c != null);
    const cellTimes = matchingCells
      .map((c) => c.avg_time)
      .filter((t): t is number => t != null);

    // Average of per-cell averages, so each config counts once.
    const avgScore = average(cellScores);
    const avgCost = average(cellCosts);
    const avgTime = average(cellTimes);

    comparisons.push({
      axis: AXIS_LABELS[axis],
      value,
      cells: cellIds.size,
      runs: filtered.length,
      avg_score: avgScore != null ? (avgScore * 100).toFixed(0) + "%" : "-",
      score_range: formatRange(cellScores, (v) => (v * 100).toFixed(0) + "%"),
      avg_cost: avgCost != null ? "$" + avgCost.toFixed(2) : "-",
      cost_range: formatRange(cellCosts, (v) => "$" + v.toFixed(2)),
      avg_time: avgTime != null ? Math.round(avgTime) + "s" : "-",
    });
  }
}
---

<Base title="Compare">
  <h1 style="margin-bottom: 8px;">Compare Configurations</h1>
  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
    Aggregate performance across each axis value.
  </p>

  {runs.length === 0 ? (
    <div class="card" style="text-align: center; padding: 40px; color: var(--text-muted);">
      No benchmark results yet. Run the harness to generate data.
    </div>
  ) : (
    <div class="card" style="overflow-x: auto;">
      <table>
        <thead>
          <tr>
            <th>Axis</th>
            <th>Value</th>
            <th>Cells</th>
            <th>Runs</th>
            <th>Avg Score</th>
            <th>Score Range</th>
            <th>Avg Cost</th>
            <th>Cost Range</th>
            <th>Avg Time</th>
          </tr>
        </thead>
        <tbody>
          {comparisons.map((row) => (
            <tr>
              <td style="color: var(--text-muted);">{row.axis}</td>
              <td>
                <span class="badge badge-neutral">{row.value}</span>
              </td>
              <td>{row.cells}</td>
              <td>{row.runs}</td>
              <td class="score-cell">{row.avg_score}</td>
              <td style="color: var(--text-muted); font-size: 0.85rem;">{row.score_range}</td>
              <td>{row.avg_cost}</td>
              <td style="color: var(--text-muted); font-size: 0.85rem;">{row.cost_range}</td>
              <td>{row.avg_time}</td>
            </tr>
          ))}
        </tbody>
      </table>
    </div>
  )}

  <div style="margin-top: 24px;">
    <VariabilityViolin client:load runs={slimRuns} />
  </div>
</Base>