loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

index.astro (4725B)


      1 ---
      2 import Base from "../layouts/Base.astro";
      3 import { loadAllRuns, getAxisValues, getTaskNames, projectRunForIndex } from "../lib/data";
      4 import type { Run } from "../lib/types";
      5 import Grid from "../components/Grid";
      6 import Charts from "../components/Charts";
      7 import TopBottomConfigs from "../components/TopBottomConfigs";
      8 import StatisticalPowerCard from "../components/StatisticalPowerCard";
      9 
     10 const runs = loadAllRuns(); // full run records; summarized at build time by computeTaskSummaries below
     11 const axisValues = getAxisValues(runs); // handed to the <Grid> island as its axisValues prop
     12 const tasks = getTaskNames(runs); // handed to the <Grid> island as its tasks prop
     13 
     14 // Each client:load island below serializes its props independently into the
     15 // HTML. Without projection, the full eval_results payload (gameplay bot
     16 // reports, SonarQube details, etc.) gets embedded 4x at ~10KB/run. Projecting
     17 // down to the union of fields these islands actually read drops per-run size
     18 // ~50x.
     19 const runsForIndex = runs.map(projectRunForIndex); // the only run data passed to the four islands
     20 
     21 // Per-task summary row; one is rendered as a card for every task on this page.
     22 interface TaskSummary {
     23   task: string; // value of run.meta.task shared by the summarized runs
     24   cells: number; // number of distinct run.meta.cell_id values within the task
     25   runs: number; // total number of runs recorded for the task
     26   avg_score: number | null; // mean of per-cell mean eval_results.score; null when no run has a score
     27   pass_rate: number | null; // share of the task's runs with eval_results.functional.pass === true
     28   avg_cost: number | null; // mean of per-cell mean claude_output.total_cost_usd; null when no run has a cost
     29 }
     30 
     /**
      * Builds one summary row per task from the raw run records.
      *
      * Scores and costs are averaged inside each cell (`meta.cell_id`) first and
      * then across cells, so a cell with many repeats does not outweigh a cell
      * with few. The pass rate, in contrast, is computed over individual runs.
      *
      * @param runs - Runs to summarize; may be empty.
      * @returns One entry per distinct `meta.task`, in first-seen order. An empty
      *   input yields an empty array.
      */
     function computeTaskSummaries(runs: Run[]): TaskSummary[] {
       // Rejects null/undefined as well as NaN/Infinity, which would otherwise
       // poison the averages and render as "NaN%".
       const isFiniteNumber = (value: unknown): value is number =>
         typeof value === "number" && Number.isFinite(value);

       const mean = (values: number[]): number | null =>
         values.length > 0 ? values.reduce((acc, v) => acc + v, 0) / values.length : null;

       // A Map is used instead of a plain object: task names are arbitrary data,
       // and names such as "constructor" collide with Object.prototype members.
       // A Map also keeps first-seen order even for integer-like keys.
       const groupBy = (items: Run[], keyOf: (run: Run) => string): Map<string, Run[]> => {
         const groups = new Map<string, Run[]>();
         for (const item of items) {
           const key = keyOf(item);
           const group = groups.get(key);
           if (group) group.push(item);
           else groups.set(key, [item]);
         }
         return groups;
       };

       return Array.from(groupBy(runs, (run) => run.meta.task), ([task, taskRuns]) => {
         const cells = groupBy(taskRuns, (run) => run.meta.cell_id);

         // One averaged value per cell; cells without any usable value are skipped.
         const cellScores: number[] = [];
         const cellCosts: number[] = [];
         for (const cellRuns of cells.values()) {
           const cellScore = mean(cellRuns.map((r) => r.eval_results?.score).filter(isFiniteNumber));
           if (cellScore !== null) cellScores.push(cellScore);

           const cellCost = mean(cellRuns.map((r) => r.claude_output?.total_cost_usd).filter(isFiniteNumber));
           if (cellCost !== null) cellCosts.push(cellCost);
         }

         const passes = taskRuns.filter((r) => r.eval_results?.functional?.pass === true).length;

         return {
           task,
           cells: cells.size,
           runs: taskRuns.length,
           avg_score: mean(cellScores),
           // Every group holds at least one run, so the division is always defined.
           pass_rate: passes / taskRuns.length,
           avg_cost: mean(cellCosts),
         };
       });
     }
     73 
     74 const taskSummaries = computeTaskSummaries(runs); // one entry per task; rendered as the summary cards in the template
     75 const totalCells = new Set(runs.map(r => r.meta.cell_id)).size; // NOTE(review): not referenced by the template in this file — confirm it is still needed
     76 ---
     77 
     78 <Base title="Grid Overview">
     79   <h1 style="margin-bottom: 8px;">Benchmark Results</h1>
     80   <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
     81     Comparing agentic coding loop configurations across tasks and setups.
     82   </p>
     83   {/* One summary card per task; null metrics are shown as "-". */}
     84   {taskSummaries.map((ts) => (
     85     <div class="card" style="margin-bottom: 16px; padding: 16px;">
     86       <div style="display: flex; align-items: center; gap: 24px; flex-wrap: wrap;">
     87         <h3 style="margin: 0; min-width: 100px;">{ts.task}</h3>
     88         <div style="display: flex; gap: 24px; font-size: 13px;">
     89           <div>
     90             <span style="color: var(--text-muted);">cells </span>
     91             <span style="font-weight: 600;">{ts.cells}</span>
     92           </div>
     93           <div>
     94             <span style="color: var(--text-muted);">runs </span>
     95             <span style="font-weight: 600;">{ts.runs}</span>
     96           </div>
     97           <div>
     98             <span style="color: var(--text-muted);">avg score </span>
     99             <span style="font-weight: 600;">{ts.avg_score != null ? (ts.avg_score * 100).toFixed(0) + "%" : "-"}</span>
    100           </div>
    101           <div>
    102             <span style="color: var(--text-muted);">pass rate </span>
    103             <span style="font-weight: 600;">{ts.pass_rate != null ? (ts.pass_rate * 100).toFixed(0) + "%" : "-"}</span>
    104           </div>
    105           <div>
    106             <span style="color: var(--text-muted);">avg cost </span>
    107             <span style="font-weight: 600;">{ts.avg_cost != null ? "$" + ts.avg_cost.toFixed(2) : "-"}</span>
    108           </div>
    109         </div>
    110       </div>
    111     </div>
    112   ))}
    113   {/* Client-hydrated islands below all receive the projected runsForIndex, not the full runs. */}
    114   <StatisticalPowerCard client:load runs={runsForIndex} />
    115 
    116   <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
    117     <Charts client:load runs={runsForIndex} />
    118     <TopBottomConfigs client:load runs={runsForIndex} />
    119   </div>
    120 
    121   <div style="margin-top: 32px;">
    122     <h2 style="margin-bottom: 16px;">All Cells</h2>
    123     <Grid client:load runs={runsForIndex} axisValues={axisValues} tasks={tasks} />
    124   </div>
    125 </Base>

Impressum · Datenschutz