loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

analysis.ts (10020B)


      1 import type { Run, AxisName } from "./types";
      2 import { AXIS_NAMES } from "./types";
      3 
/** Summary returned by `confidenceInterval` for one sample of numbers. */
export interface ConfidenceIntervalResult {
  /** Arithmetic mean of the sample (0 for an empty sample). */
  mean: number;
  /** Half-width of the interval around `mean`; 0 when the sample has fewer than two values. */
  ci: number;
  /** Lower bound of the interval, `mean - ci`. */
  lower: number;
  /** Upper bound of the interval, `mean + ci`. */
  upper: number;
  /** Number of values in the sample. */
  n: number;
}
     11 
     12 export function confidenceInterval(
     13   values: number[],
     14   confidence = 0.95
     15 ): ConfidenceIntervalResult {
     16   const n = values.length;
     17   if (n < 2) {
     18     const mean = n === 1 ? values[0] : 0;
     19     return { mean, ci: 0, lower: mean, upper: mean, n };
     20   }
     21   const mean = values.reduce((a, b) => a + b, 0) / n;
     22   const stdDev = Math.sqrt(
     23     values.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1)
     24   );
     25   // t-value approximation for 95% CI (exact for small n, 1.96 for large n)
     26   const tValues: Record<number, number> = {
     27     2: 12.71,
     28     3: 4.3,
     29     4: 3.18,
     30     5: 2.78,
     31     6: 2.57,
     32     7: 2.45,
     33     8: 2.36,
     34     9: 2.31,
     35     10: 2.26,
     36   };
     37   const t = tValues[n] ?? (n > 30 ? 1.96 : 2.0);
     38   const ci = t * stdDev / Math.sqrt(n);
     39   return { mean, ci, lower: mean - ci, upper: mean + ci, n };
     40 }
     41 
/**
 * All runs that share one `cell_id`, plus per-metric summaries over those runs.
 *
 * Each summary is built by `groupIntoCells` from the runs where the metric is
 * present; when no run has the metric, `avg`, `min` and `max` are all 0.
 */
export interface Cell {
  /** The `meta.cell_id` shared by every run in this cell. */
  cell_id: string;
  /** The runs belonging to this cell, in input order. */
  runs: Run[];
  meta: Run["meta"]; // from first run
  /** Number of runs in the cell (including runs that lack some metrics). */
  n: number;
  /** Overall eval score; `range` is `max - min`. */
  score: { avg: number; min: number; max: number; range: number };
  /** `claude_output.total_cost_usd` across runs. */
  cost: { avg: number; min: number; max: number };
  /** `claude_output.num_turns` across runs. */
  turns: { avg: number; min: number; max: number };
  /** `meta.wall_time_seconds` across runs. */
  wall_time: { avg: number; min: number; max: number };
  /** `eval_results.gameplay_bot.score` across runs. */
  gameplay: { avg: number; min: number; max: number };
  /** `eval_results.code_analysis.score` across runs. */
  code_quality: { avg: number; min: number; max: number };
  /** `eval_results.quality.score` across runs. */
  quality: { avg: number; min: number; max: number };
  /** `eval_results.structural.score` across runs. */
  structural: { avg: number; min: number; max: number };
  /** `eval_results.sonarqube.score` across runs. */
  sonarqube: { avg: number; min: number; max: number };
  /** `eval_results.transcript_analysis.score` across runs. */
  transcript: { avg: number; min: number; max: number };
}
     58 
/** Main-effect statistics for one value of one axis, as produced by `computeMainEffects`. */
export interface EffectEntry {
  /** The axis value, stringified ("unknown" when the meta field is null or undefined). */
  value: string;
  /** Mean of the per-cell metric averages for cells with this value (rounded to 4 decimals). */
  mean: number;
  /** `mean` minus the grand mean over all scored cells (rounded to 4 decimals). */
  effect: number;
  /**
   * Despite the name, this is the mean of the per-cell ranges (max - min of the
   * metric within each cell), not a statistical variance (rounded to 4 decimals).
   */
  variance: number;
  /** Number of cells that contributed to this entry. */
  n: number;
}
     66 
/** Main effect of one axis, as produced by `computeMainEffects`. */
export interface AxisEffect {
  /** Name of the run-meta key treated as the axis. */
  axis: string;
  /** Largest minus smallest `mean` among `values` (rounded to 4 decimals). */
  spread: number;
  /** One entry per distinct axis value, sorted by `effect` descending. */
  values: EffectEntry[];
}
     72 
/** Statistics for one (axisA value, axisB value) combination in an interaction table. */
export interface InteractionCell {
  /** Mean of the per-cell metric averages for this combination (rounded to 4 decimals). */
  mean: number;
  /** Mean of the per-cell metric ranges (max - min), not a statistical variance (rounded to 4 decimals). */
  variance: number;
  /** Number of cells with this combination. */
  n: number;
}
     78 
/** Two-axis interaction table, as produced by `computeInteraction`. */
export interface InteractionResult {
  /** Meta key used for the outer level of `table`. */
  axisA: string;
  /** Meta key used for the inner level of `table`. */
  axisB: string;
  /** `table[a][b]` holds the statistics for axisA value `a` combined with axisB value `b`. */
  table: Record<string, Record<string, InteractionCell>>;
  /**
   * Largest absolute difference between a combination's mean and the additive
   * prediction `rowMean + columnMean - grandMean` (rounded to 4 decimals).
   */
  maxInteraction: number;
}
     85 
     86 const SKIP_KEYS = new Set([
     87   "task",
     88   "cell_id",
     89   "run_id",
     90   "run_number",
     91   "runs_per_cell",
     92   "max_budget_usd",
     93   "timeout_seconds",
     94   "base_tools",
     95   "started_at",
     96   "completed_at",
     97   "wall_time_seconds",
     98   "exit_code",
     99   "short_id",
    100   "short_cell_id",
    101   "claude_version",
    102   "sub_agents",
    103   "actual_model",
    104 ]);
    105 
    106 type MetricExtractor = (run: Run) => number | null;
    107 
    108 const METRICS: Record<string, MetricExtractor> = {
    109   score: (r) => r.eval_results?.score ?? null,
    110   cost: (r) => r.claude_output?.total_cost_usd ?? null,
    111   turns: (r) => r.claude_output?.num_turns ?? null,
    112   wall_time: (r) => r.meta.wall_time_seconds ?? null,
    113   gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null,
    114   code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
    115   structural: (r) => r.eval_results?.structural?.score ?? null,
    116   quality: (r) => r.eval_results?.quality?.score ?? null,
    117   transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
    118   sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null,
    119 };
    120 
    121 function agg(values: number[]): { avg: number; min: number; max: number } {
    122   if (values.length === 0) return { avg: 0, min: 0, max: 0 };
    123   const sum = values.reduce((a, b) => a + b, 0);
    124   return {
    125     avg: sum / values.length,
    126     min: Math.min(...values),
    127     max: Math.max(...values),
    128   };
    129 }
    130 
    131 export function groupIntoCells(runs: Run[]): Cell[] {
    132   const byCell = new Map<string, Run[]>();
    133   for (const run of runs) {
    134     const id = run.meta.cell_id;
    135     const list = byCell.get(id);
    136     if (list) list.push(run);
    137     else byCell.set(id, [run]);
    138   }
    139 
    140   const cells: Cell[] = [];
    141   for (const [cell_id, cellRuns] of byCell) {
    142     const extractVals = (extractor: MetricExtractor): number[] => {
    143       const vals: number[] = [];
    144       for (const r of cellRuns) {
    145         const v = extractor(r);
    146         if (v !== null) vals.push(v);
    147       }
    148       return vals;
    149     };
    150 
    151     const scoreVals = extractVals(METRICS.score);
    152     const scoreAgg = agg(scoreVals);
    153 
    154     cells.push({
    155       cell_id,
    156       runs: cellRuns,
    157       meta: cellRuns[0].meta,
    158       n: cellRuns.length,
    159       score: { ...scoreAgg, range: scoreAgg.max - scoreAgg.min },
    160       cost: agg(extractVals(METRICS.cost)),
    161       turns: agg(extractVals(METRICS.turns)),
    162       wall_time: agg(extractVals(METRICS.wall_time)),
    163       gameplay: agg(extractVals(METRICS.gameplay)),
    164       code_quality: agg(extractVals(METRICS.code_quality)),
    165       quality: agg(extractVals(METRICS.quality)),
    166       structural: agg(extractVals(METRICS.structural)),
    167       sonarqube: agg(extractVals(METRICS.sonarqube)),
    168       transcript: agg(extractVals(METRICS.transcript)),
    169     });
    170   }
    171 
    172   return cells;
    173 }
    174 
    175 export function computeMainEffects(
    176   runs: Run[],
    177   metric: string = "score"
    178 ): AxisEffect[] {
    179   const extract = METRICS[metric];
    180   if (!extract) return [];
    181 
    182   const cells = groupIntoCells(runs);
    183 
    184   // Compute per-cell metric averages and ranges
    185   const scored: Array<{ meta: Run["meta"]; avg: number; range: number }> = [];
    186   for (const cell of cells) {
    187     const vals: number[] = [];
    188     for (const run of cell.runs) {
    189       const v = extract(run);
    190       if (v !== null) vals.push(v);
    191     }
    192     if (vals.length === 0) continue;
    193     const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
    194     const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0;
    195     scored.push({ meta: cell.meta, avg: cellAvg, range: cellRange });
    196   }
    197   if (scored.length === 0) return [];
    198 
    199   const grandMean = scored.reduce((s, c) => s + c.avg, 0) / scored.length;
    200 
    201   // Find axis keys from meta
    202   const axisKeys = Object.keys(scored[0].meta).filter(
    203     (k) => !SKIP_KEYS.has(k)
    204   );
    205 
    206   const effects: AxisEffect[] = [];
    207 
    208   for (const axis of axisKeys) {
    209     const groups: Record<string, { avgs: number[]; ranges: number[] }> = {};
    210     for (const { meta, avg, range } of scored) {
    211       const key = String((meta as Record<string, unknown>)[axis] ?? "unknown");
    212       const g = groups[key] ??= { avgs: [], ranges: [] };
    213       g.avgs.push(avg);
    214       g.ranges.push(range);
    215     }
    216 
    217     if (Object.keys(groups).length < 2) continue;
    218 
    219     const values: EffectEntry[] = [];
    220     for (const [val, { avgs, ranges }] of Object.entries(groups)) {
    221       const mean = avgs.reduce((a, b) => a + b, 0) / avgs.length;
    222       const variance = ranges.reduce((a, b) => a + b, 0) / ranges.length;
    223       values.push({
    224         value: val,
    225         mean: Math.round(mean * 10000) / 10000,
    226         effect: Math.round((mean - grandMean) * 10000) / 10000,
    227         variance: Math.round(variance * 10000) / 10000,
    228         n: avgs.length,
    229       });
    230     }
    231 
    232     const means = values.map((v) => v.mean);
    233     const spread = Math.max(...means) - Math.min(...means);
    234 
    235     effects.push({
    236       axis,
    237       spread: Math.round(spread * 10000) / 10000,
    238       values: values.sort((a, b) => b.effect - a.effect),
    239     });
    240   }
    241 
    242   return effects.sort((a, b) => b.spread - a.spread);
    243 }
    244 
    245 export function computeInteraction(
    246   runs: Run[],
    247   axisA: string,
    248   axisB: string,
    249   metric: string = "score"
    250 ): InteractionResult {
    251   const extract = METRICS[metric];
    252   if (!extract)
    253     return { axisA, axisB, table: {}, maxInteraction: 0 };
    254 
    255   const cells = groupIntoCells(runs);
    256 
    257   // Group cells by (axisA, axisB) combination
    258   const groups: Record<string, Record<string, { avgs: number[]; ranges: number[] }>> = {};
    259 
    260   for (const cell of cells) {
    261     const vals: number[] = [];
    262     for (const run of cell.runs) {
    263       const v = extract(run);
    264       if (v !== null) vals.push(v);
    265     }
    266     if (vals.length === 0) continue;
    267 
    268     const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
    269     const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0;
    270 
    271     const a = String((cell.meta as Record<string, unknown>)[axisA] ?? "?");
    272     const b = String((cell.meta as Record<string, unknown>)[axisB] ?? "?");
    273     const g = ((groups[a] ??= {})[b] ??= { avgs: [], ranges: [] });
    274     g.avgs.push(cellAvg);
    275     g.ranges.push(cellRange);
    276   }
    277 
    278   const table: Record<string, Record<string, InteractionCell>> = {};
    279   const allMeans: number[] = [];
    280 
    281   for (const [a, bGroups] of Object.entries(groups)) {
    282     table[a] = {};
    283     for (const [b, { avgs, ranges }] of Object.entries(bGroups)) {
    284       const mean = avgs.reduce((s, v) => s + v, 0) / avgs.length;
    285       const variance = ranges.reduce((s, v) => s + v, 0) / ranges.length;
    286       table[a][b] = {
    287         mean: Math.round(mean * 10000) / 10000,
    288         variance: Math.round(variance * 10000) / 10000,
    289         n: avgs.length,
    290       };
    291       allMeans.push(mean);
    292     }
    293   }
    294 
    295   const grandMean =
    296     allMeans.length > 0
    297       ? allMeans.reduce((a, b) => a + b, 0) / allMeans.length
    298       : 0;
    299 
    300   // Row and column means
    301   const aMeans: Record<string, number> = {};
    302   const bMeans: Record<string, number> = {};
    303   const bKeys = new Set<string>();
    304 
    305   for (const [a, bGroups] of Object.entries(table)) {
    306     const vals = Object.values(bGroups).map((c) => c.mean);
    307     aMeans[a] = vals.reduce((s, v) => s + v, 0) / vals.length;
    308     for (const b of Object.keys(bGroups)) bKeys.add(b);
    309   }
    310 
    311   for (const b of bKeys) {
    312     const vals: number[] = [];
    313     for (const a of Object.keys(table)) {
    314       if (table[a][b]) vals.push(table[a][b].mean);
    315     }
    316     bMeans[b] = vals.length > 0 ? vals.reduce((s, v) => s + v, 0) / vals.length : grandMean;
    317   }
    318 
    319   // Max interaction = max deviation from additive model
    320   let maxInteraction = 0;
    321   for (const a of Object.keys(table)) {
    322     for (const b of Object.keys(table[a])) {
    323       const expected = aMeans[a] + bMeans[b] - grandMean;
    324       const actual = table[a][b].mean;
    325       maxInteraction = Math.max(maxInteraction, Math.abs(actual - expected));
    326     }
    327   }
    328 
    329   return {
    330     axisA,
    331     axisB,
    332     table,
    333     maxInteraction: Math.round(maxInteraction * 10000) / 10000,
    334   };
    335 }

Impressum · Datenschutz