index.astro (4725B)
---
import Base from "../layouts/Base.astro";
import { loadAllRuns, getAxisValues, getTaskNames, projectRunForIndex } from "../lib/data";
import type { Run } from "../lib/types";
import Grid from "../components/Grid";
import Charts from "../components/Charts";
import TopBottomConfigs from "../components/TopBottomConfigs";
import StatisticalPowerCard from "../components/StatisticalPowerCard";

const runs = loadAllRuns();
const axisValues = getAxisValues(runs);
const tasks = getTaskNames(runs);

// Each client:load island below serializes its props independently into the
// HTML. Without projection, the full eval_results payload (gameplay bot
// reports, SonarQube details, etc.) gets embedded 4x at ~10KB/run. Projecting
// down to the union of fields these islands actually read drops per-run size
// ~50x.
const runsForIndex = runs.map(projectRunForIndex);

/** Headline numbers rendered in one summary card per task. */
interface TaskSummary {
  task: string;
  /** Number of distinct `meta.cell_id` values among the task's runs. */
  cells: number;
  /** Total number of runs for the task. */
  runs: number;
  /** Mean of the per-cell mean scores; null when no run of the task has a score. */
  avg_score: number | null;
  /** Passing runs divided by all runs of the task. */
  pass_rate: number | null;
  /** Mean of the per-cell mean costs; null when no run of the task has a cost. */
  avg_cost: number | null;
}

/** Arithmetic mean of `values`, or null for an empty list. */
function mean(values: readonly number[]): number | null {
  if (values.length === 0) return null;
  return values.reduce((a, b) => a + b, 0) / values.length;
}

/**
 * Buckets `items` by the string key returned from `keyOf`.
 *
 * A Map is used instead of a plain object so that keys such as "constructor"
 * or "__proto__" cannot collide with inherited Object.prototype members.
 * Buckets are iterated in the order their key was first seen.
 */
function groupBy<T>(items: readonly T[], keyOf: (item: T) => string): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const item of items) {
    const key = keyOf(item);
    const group = groups.get(key);
    if (group) {
      group.push(item);
    } else {
      groups.set(key, [item]);
    }
  }
  return groups;
}

/**
 * Computes per-task cell-based stats.
 *
 * Runs are grouped by `meta.task`, then by `meta.cell_id` within each task.
 * Scores and costs are averaged per cell first and then across cells, so every
 * cell carries equal weight regardless of how many runs it contains. The pass
 * rate, by contrast, is computed over individual runs.
 *
 * @param runs All runs to summarize.
 * @returns One summary per task, in the order each task first appears in `runs`.
 */
function computeTaskSummaries(runs: Run[]): TaskSummary[] {
  const runsByTask = groupBy(runs, (run) => run.meta.task);

  return Array.from(runsByTask, ([task, taskRuns]) => {
    const cells = groupBy(taskRuns, (run) => run.meta.cell_id);

    const cellScores: number[] = [];
    const cellCosts: number[] = [];
    let totalPasses = 0;

    for (const cellRuns of cells.values()) {
      // Runs with a missing score/cost are left out of the cell's mean; a cell
      // with no usable values contributes nothing to the task-level average.
      const cellScore = mean(
        cellRuns.map((r) => r.eval_results?.score).filter((s): s is number => s != null),
      );
      if (cellScore != null) cellScores.push(cellScore);

      const cellCost = mean(
        cellRuns.map((r) => r.claude_output?.total_cost_usd).filter((c): c is number => c != null),
      );
      if (cellCost != null) cellCosts.push(cellCost);

      // Only an explicit `true` counts as a pass.
      totalPasses += cellRuns.filter((r) => r.eval_results?.functional?.pass === true).length;
    }

    return {
      task,
      cells: cells.size,
      runs: taskRuns.length,
      avg_score: mean(cellScores),
      pass_rate: taskRuns.length > 0 ? totalPasses / taskRuns.length : null,
      avg_cost: mean(cellCosts),
    };
  });
}

/** Renders `ratio * 100` as a whole-number percentage, or "-" when null. */
const formatPercent = (ratio: number | null): string =>
  ratio != null ? (ratio * 100).toFixed(0) + "%" : "-";

/** Renders `amount` with a "$" prefix and two decimals, or "-" when null. */
const formatUsd = (amount: number | null): string =>
  amount != null ? "$" + amount.toFixed(2) : "-";

const taskSummaries = computeTaskSummaries(runs);
---

<Base title="Grid Overview">
  <h1 style="margin-bottom: 8px;">Benchmark Results</h1>
  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
    Comparing agentic coding loop configurations across tasks and setups.
  </p>

  {taskSummaries.map((ts) => (
    <div class="card" style="margin-bottom: 16px; padding: 16px;">
      <div style="display: flex; align-items: center; gap: 24px; flex-wrap: wrap;">
        <h3 style="margin: 0; min-width: 100px;">{ts.task}</h3>
        <div style="display: flex; gap: 24px; font-size: 13px;">
          <div>
            <span style="color: var(--text-muted);">cells </span>
            <span style="font-weight: 600;">{ts.cells}</span>
          </div>
          <div>
            <span style="color: var(--text-muted);">runs </span>
            <span style="font-weight: 600;">{ts.runs}</span>
          </div>
          <div>
            <span style="color: var(--text-muted);">avg score </span>
            <span style="font-weight: 600;">{formatPercent(ts.avg_score)}</span>
          </div>
          <div>
            <span style="color: var(--text-muted);">pass rate </span>
            <span style="font-weight: 600;">{formatPercent(ts.pass_rate)}</span>
          </div>
          <div>
            <span style="color: var(--text-muted);">avg cost </span>
            <span style="font-weight: 600;">{formatUsd(ts.avg_cost)}</span>
          </div>
        </div>
      </div>
    </div>
  ))}

  <StatisticalPowerCard client:load runs={runsForIndex} />

  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
    <Charts client:load runs={runsForIndex} />
    <TopBottomConfigs client:load runs={runsForIndex} />
  </div>

  <div style="margin-top: 32px;">
    <h2 style="margin-bottom: 16px;">All Cells</h2>
    <Grid client:load runs={runsForIndex} axisValues={axisValues} tasks={tasks} />
  </div>
</Base>