loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

compare.astro (6584B)


      1 ---
      2 import Base from "../layouts/Base.astro";
      3 import { loadAllRuns, getAxisValues, getTaskNames, AXIS_NAMES, projectRunForIndex } from "../lib/data";
      4 import type { Run, AxisName } from "../lib/data";
      5 import VariabilityViolin from "../components/VariabilityViolin";
      6 
// Every benchmark run; this is the single data source for the table rows
// built below and for the violin chart at the bottom of the page.
const runs = loadAllRuns();
// Values per axis; indexed by axis name when the comparison rows are built.
const axisValues = getAxisValues(runs);
// NOTE(review): `tasks` is not referenced anywhere else in this file —
// confirm whether it is still needed.
const tasks = getTaskNames(runs);

// Projected slim runs for the client:load VariabilityViolin island.
const slimRuns = runs.map(projectRunForIndex);
     13 
     14 // Build comparison data using cell-based aggregation.
     15 // A "cell" is a unique configuration (cell_id). Multiple runs share a cell_id
     16 // when they are repeat trials of the same config. Averaging per-cell first,
     17 // then aggregating across cells, prevents configs with more repeats from
     18 // dominating the average.
     19 
/**
 * One row of the comparison table: a single value of a single axis.
 *
 * Every aggregate is stored as an already-formatted display string; an
 * aggregate with no data is stored as "-".
 */
interface ComparisonRow {
  // Human-readable axis label (looked up in AXIS_LABELS).
  axis: string;
  // The axis value this row summarises.
  value: string;
  cells: number;   // number of unique configs
  runs: number;    // total runs
  // Mean of the per-cell average scores, as a whole-number percentage.
  avg_score: string;
  score_range: string;  // "68%-80%"
  // Mean of the per-cell average costs, as dollars with two decimals.
  avg_cost: string;
  cost_range: string;   // "$0.15-$0.22"
  // Mean of the per-cell average wall times, rounded, with an "s" suffix.
  avg_time: string;
}
     31 
/**
 * Averages over all runs that share one cell_id.
 *
 * Each average is null when none of the cell's runs reported that metric.
 */
interface CellStats {
  // Mean of eval_results.score over the runs that have one.
  avg_score: number | null;
  // Mean of claude_output.total_cost_usd over the runs that have one.
  avg_cost: number | null;
  // Mean of meta.wall_time_seconds over the runs that have one.
  avg_time: number | null;
  // Number of runs in the cell, including runs with missing metrics.
  run_count: number;
}
     38 
     39 /** Compute per-cell averages from a list of runs. */
     40 function getCellStats(runs: Run[]): Map<string, CellStats> {
     41   const cells = new Map<string, Run[]>();
     42   for (const run of runs) {
     43     const id = run.meta.cell_id;
     44     if (!cells.has(id)) cells.set(id, []);
     45     cells.get(id)!.push(run);
     46   }
     47 
     48   const result = new Map<string, CellStats>();
     49   for (const [cellId, cellRuns] of cells) {
     50     const scores = cellRuns
     51       .map((r) => r.eval_results?.score)
     52       .filter((s): s is number => s != null);
     53     const costs = cellRuns
     54       .map((r) => r.claude_output?.total_cost_usd)
     55       .filter((c): c is number => c != null);
     56     const times = cellRuns
     57       .map((r) => r.meta.wall_time_seconds)
     58       .filter((t): t is number => t != null);
     59 
     60     const avg = (arr: number[]) =>
     61       arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
     62 
     63     result.set(cellId, {
     64       avg_score: avg(scores),
     65       avg_cost: avg(costs),
     66       avg_time: avg(times),
     67       run_count: cellRuns.length,
     68     });
     69   }
     70   return result;
     71 }
     72 
     73 function formatRange(values: number[], formatter: (v: number) => string): string {
     74   if (values.length === 0) return "-";
     75   if (values.length === 1) return formatter(values[0]);
     76   const min = Math.min(...values);
     77   const max = Math.max(...values);
     78   if (min === max) return formatter(min);
     79   return formatter(min) + "-" + formatter(max);
     80 }
     81 
// Table rows, appended to by the axis/value loop below and rendered in the
// template.
const comparisons: ComparisonRow[] = [];

// Display label for each axis, shown in the table's "Axis" column. Typed as
// Record<AxisName, string> so the compiler checks the keys against AxisName.
const AXIS_LABELS: Record<AxisName, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  tool_read: "Tool: Read",
  tool_write: "Tool: Write",
  tool_edit: "Tool: Edit",
  tool_glob: "Tool: Glob",
  tool_grep: "Tool: Grep",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  web_search: "Web Search",
  max_budget: "Budget",
  tests_provided: "Tests Provided",
  strategy: "Strategy",
  design_guidance: "Design Guidance",
  architecture: "Architecture",
  error_checking: "Error Checking",
  context_noise: "Context Noise",
  renderer: "Renderer",
  provider: "Provider",
};
    109 
    110 // Pre-compute all cell stats once
    111 const allCellStats = getCellStats(runs);
    112 
    113 for (const axis of AXIS_NAMES) {
    114   for (const value of axisValues[axis]) {
    115     const filtered = runs.filter(
    116       (r: Run) => String(r.meta[axis as keyof typeof r.meta]) === value
    117     );
    118 
    119     // Find the unique cell_ids in these runs and gather their stats
    120     const cellIds = new Set(filtered.map((r) => r.meta.cell_id));
    121     const matchingCells: CellStats[] = [];
    122     for (const id of cellIds) {
    123       const cs = allCellStats.get(id);
    124       if (cs) matchingCells.push(cs);
    125     }
    126 
    127     const cellScores = matchingCells
    128       .map((c) => c.avg_score)
    129       .filter((s): s is number => s != null);
    130     const cellCosts = matchingCells
    131       .map((c) => c.avg_cost)
    132       .filter((c): c is number => c != null);
    133     const cellTimes = matchingCells
    134       .map((c) => c.avg_time)
    135       .filter((t): t is number => t != null);
    136 
    137     const avg = (arr: number[]) =>
    138       arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
    139 
    140     const avgScore = avg(cellScores);
    141     const avgCost = avg(cellCosts);
    142     const avgTime = avg(cellTimes);
    143 
    144     comparisons.push({
    145       axis: AXIS_LABELS[axis],
    146       value,
    147       cells: cellIds.size,
    148       runs: filtered.length,
    149       avg_score: avgScore != null ? (avgScore * 100).toFixed(0) + "%" : "-",
    150       score_range: formatRange(cellScores, (v) => (v * 100).toFixed(0) + "%"),
    151       avg_cost: avgCost != null ? "$" + avgCost.toFixed(2) : "-",
    152       cost_range: formatRange(cellCosts, (v) => "$" + v.toFixed(2)),
    153       avg_time: avgTime != null ? Math.round(avgTime) + "s" : "-",
    154     });
    155   }
    156 }
    157 ---
    158 
<Base title="Compare">
  <h1 style="margin-bottom: 8px;">Compare Configurations</h1>
  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
    Aggregate performance across each axis value.
  </p>

  {/* Placeholder card when no runs were loaded; otherwise the table with one row per entry in `comparisons`. */}
  {runs.length === 0 ? (
    <div class="card" style="text-align: center; padding: 40px; color: var(--text-muted);">
      No benchmark results yet. Run the harness to generate data.
    </div>
  ) : (
    <div class="card" style="overflow-x: auto;">
      <table>
        <thead>
          <tr>
            <th>Axis</th>
            <th>Value</th>
            <th>Cells</th>
            <th>Runs</th>
            <th>Avg Score</th>
            <th>Score Range</th>
            <th>Avg Cost</th>
            <th>Cost Range</th>
            <th>Avg Time</th>
          </tr>
        </thead>
        <tbody>
          {comparisons.map((row) => (
            <tr>
              <td style="color: var(--text-muted);">{row.axis}</td>
              <td>
                <span class="badge badge-neutral">{row.value}</span>
              </td>
              <td>{row.cells}</td>
              <td>{row.runs}</td>
              <td class="score-cell">{row.avg_score}</td>
              <td style="color: var(--text-muted); font-size: 0.85rem;">{row.score_range}</td>
              <td>{row.avg_cost}</td>
              <td style="color: var(--text-muted); font-size: 0.85rem;">{row.cost_range}</td>
              <td>{row.avg_time}</td>
            </tr>
          ))}
        </tbody>
      </table>
    </div>
  )}

  {/* Client-loaded chart; it receives the slim projection of the runs (`slimRuns`), not the full run objects. */}
  <div style="margin-top: 24px;">
    <VariabilityViolin client:load runs={slimRuns} />
  </div>
</Base>

Impressum · Datenschutz