analysis.ts (10020B)
import type { Run, AxisName } from "./types";
import { AXIS_NAMES } from "./types";

/** Summary of a symmetric confidence interval around a sample mean. */
export interface ConfidenceIntervalResult {
  /** Arithmetic mean of the sample (0 for an empty sample). */
  mean: number;
  /** Half-width of the interval; 0 when fewer than two values are supplied. */
  ci: number;
  /** `mean - ci`. */
  lower: number;
  /** `mean + ci`. */
  upper: number;
  /** Number of values in the sample. */
  n: number;
}

/**
 * Approximate two-sided 95% Student-t critical values, keyed by sample size
 * `n` (i.e. `n - 1` degrees of freedom).
 */
const T_95_BY_SAMPLE_SIZE: Readonly<Record<number, number>> = {
  2: 12.71,
  3: 4.3,
  4: 3.18,
  5: 2.78,
  6: 2.57,
  7: 2.45,
  8: 2.36,
  9: 2.31,
  10: 2.26,
};

/**
 * Computes the sample mean and a t-based confidence interval around it.
 *
 * @param values - Sample values.
 * @param confidence - Accepted for interface compatibility. NOTE(review): this
 *   argument is not consulted; the critical values used are always the 95%
 *   approximations (tabulated for n = 2..10, 2.0 for n = 11..30, 1.96 above).
 * @returns The mean, the interval half-width and its bounds. With fewer than
 *   two values the half-width is 0 and both bounds equal the mean.
 */
export function confidenceInterval(
  values: number[],
  confidence = 0.95
): ConfidenceIntervalResult {
  const n = values.length;
  if (n < 2) {
    const mean = n === 1 ? values[0] : 0;
    return { mean, ci: 0, lower: mean, upper: mean, n };
  }
  const mean = values.reduce((a, b) => a + b, 0) / n;
  // Sample standard deviation (Bessel-corrected, divisor n - 1).
  const stdDev = Math.sqrt(
    values.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1)
  );
  const t = T_95_BY_SAMPLE_SIZE[n] ?? (n > 30 ? 1.96 : 2.0);
  const ci = (t * stdDev) / Math.sqrt(n);
  return { mean, ci, lower: mean - ci, upper: mean + ci, n };
}

/**
 * All runs sharing one `cell_id`, plus per-metric aggregates over those runs.
 * Each aggregate only covers runs that report the metric; when no run reports
 * it, the aggregate is all zeros.
 */
export interface Cell {
  cell_id: string;
  runs: Run[];
  meta: Run["meta"]; // from first run
  /** Number of runs in the cell (regardless of which metrics they report). */
  n: number;
  score: { avg: number; min: number; max: number; range: number };
  cost: { avg: number; min: number; max: number };
  turns: { avg: number; min: number; max: number };
  wall_time: { avg: number; min: number; max: number };
  gameplay: { avg: number; min: number; max: number };
  code_quality: { avg: number; min: number; max: number };
  quality: { avg: number; min: number; max: number };
  structural: { avg: number; min: number; max: number };
  sonarqube: { avg: number; min: number; max: number };
  transcript: { avg: number; min: number; max: number };
}

/** Statistics for one value of an axis, rounded to four decimals. */
export interface EffectEntry {
  /** The axis value, stringified. */
  value: string;
  /** Mean of the per-cell averages of the cells having this axis value. */
  mean: number;
  /** `mean` minus the grand mean over all cells. */
  effect: number;
  /** Mean of the per-cell ranges (max - min); not a statistical variance. */
  variance: number;
  /** Number of cells having this axis value. */
  n: number;
}

/** Main effect of one axis: per-value statistics and their overall spread. */
export interface AxisEffect {
  axis: string;
  /** Largest minus smallest `mean` among `values`. */
  spread: number;
  /** Entries sorted by descending `effect`. */
  values: EffectEntry[];
}

/** One (axisA value, axisB value) entry of an interaction table. */
export interface InteractionCell {
  /** Mean of the per-cell averages in this combination (four decimals). */
  mean: number;
  /** Mean of the per-cell ranges (max - min); not a statistical variance. */
  variance: number;
  /** Number of cells in this combination. */
  n: number;
}

/** Two-axis interaction table, indexed as `table[axisA value][axisB value]`. */
export interface InteractionResult {
  axisA: string;
  axisB: string;
  table: Record<string, Record<string, InteractionCell>>;
  /** Largest absolute deviation of a table entry from the additive model. */
  maxInteraction: number;
}

/** Meta keys that `computeMainEffects` never treats as an experiment axis. */
const SKIP_KEYS = new Set([
  "task",
  "cell_id",
  "run_id",
  "run_number",
  "runs_per_cell",
  "max_budget_usd",
  "timeout_seconds",
  "base_tools",
  "started_at",
  "completed_at",
  "wall_time_seconds",
  "exit_code",
  "short_id",
  "short_cell_id",
  "claude_version",
  "sub_agents",
  "actual_model",
]);

/** Reads one numeric metric from a run, or `null` when the run lacks it. */
type MetricExtractor = (run: Run) => number | null;

/** Metric name -> extractor. The names are the valid `metric` arguments. */
const METRICS: Record<string, MetricExtractor> = {
  score: (r) => r.eval_results?.score ?? null,
  cost: (r) => r.claude_output?.total_cost_usd ?? null,
  turns: (r) => r.claude_output?.num_turns ?? null,
  wall_time: (r) => r.meta.wall_time_seconds ?? null,
  gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null,
  code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
  structural: (r) => r.eval_results?.structural?.score ?? null,
  quality: (r) => r.eval_results?.quality?.score ?? null,
  transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
  sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null,
};

/**
 * Looks up an extractor by metric name, considering own properties only so
 * that names such as "toString" or "constructor" do not resolve to members
 * inherited from Object.prototype.
 */
function lookupMetric(metric: string): MetricExtractor | undefined {
  return Object.prototype.hasOwnProperty.call(METRICS, metric)
    ? METRICS[metric]
    : undefined;
}

/** Rounds to four decimal places. */
function round4(x: number): number {
  return Math.round(x * 10000) / 10000;
}

/** Arithmetic mean; callers must pass a non-empty list. */
function meanOf(values: readonly number[]): number {
  return values.reduce((a, b) => a + b, 0) / values.length;
}

/**
 * Smallest and largest value of a non-empty list. Folds instead of spreading
 * into Math.min/Math.max so that very long lists cannot exceed the engine's
 * argument-count limit; NaN still propagates as it does with Math.min/max.
 */
function minMax(values: readonly number[]): { min: number; max: number } {
  let min = Infinity;
  let max = -Infinity;
  for (const v of values) {
    min = Math.min(min, v);
    max = Math.max(max, v);
  }
  return { min, max };
}

/** Collects the non-null values an extractor yields for the given runs. */
function metricValues(runs: readonly Run[], extract: MetricExtractor): number[] {
  const out: number[] = [];
  for (const run of runs) {
    const v = extract(run);
    if (v !== null) out.push(v);
  }
  return out;
}

/**
 * Defines `key` as an own enumerable property of `target`. Unlike plain
 * assignment this also works for the key "__proto__", which would otherwise
 * replace the object's prototype instead of adding an entry.
 */
function setOwn<T>(target: Record<string, T>, key: string, value: T): void {
  Object.defineProperty(target, key, {
    value,
    writable: true,
    enumerable: true,
    configurable: true,
  });
}

/** Average, minimum and maximum of a list; all zeros for an empty list. */
function agg(values: number[]): { avg: number; min: number; max: number } {
  if (values.length === 0) return { avg: 0, min: 0, max: 0 };
  const { min, max } = minMax(values);
  return { avg: meanOf(values), min, max };
}

/**
 * Groups runs by `meta.cell_id` and aggregates every known metric per group.
 *
 * @param runs - Runs to group.
 * @returns One cell per distinct `cell_id`, in order of first appearance; each
 *   cell's `meta` is taken from its first run.
 */
export function groupIntoCells(runs: Run[]): Cell[] {
  const byCell = new Map<string, Run[]>();
  for (const run of runs) {
    const id = run.meta.cell_id;
    const list = byCell.get(id);
    if (list) list.push(run);
    else byCell.set(id, [run]);
  }

  const cells: Cell[] = [];
  for (const [cell_id, cellRuns] of byCell) {
    const stats = (extract: MetricExtractor) =>
      agg(metricValues(cellRuns, extract));
    const scoreAgg = stats(METRICS.score);

    cells.push({
      cell_id,
      runs: cellRuns,
      meta: cellRuns[0].meta,
      n: cellRuns.length,
      score: { ...scoreAgg, range: scoreAgg.max - scoreAgg.min },
      cost: stats(METRICS.cost),
      turns: stats(METRICS.turns),
      wall_time: stats(METRICS.wall_time),
      gameplay: stats(METRICS.gameplay),
      code_quality: stats(METRICS.code_quality),
      quality: stats(METRICS.quality),
      structural: stats(METRICS.structural),
      sonarqube: stats(METRICS.sonarqube),
      transcript: stats(METRICS.transcript),
    });
  }

  return cells;
}

/** Per-cell average and range (max - min) of a single metric. */
interface CellSummary {
  meta: Run["meta"];
  avg: number;
  range: number;
}

/**
 * Reduces each cell to the average and range of one metric. Cells in which no
 * run reports the metric are omitted.
 */
function summarizeCells(runs: Run[], extract: MetricExtractor): CellSummary[] {
  const summaries: CellSummary[] = [];
  for (const cell of groupIntoCells(runs)) {
    const vals = metricValues(cell.runs, extract);
    if (vals.length === 0) continue;
    const { min, max } = minMax(vals);
    summaries.push({
      meta: cell.meta,
      avg: meanOf(vals),
      range: vals.length > 1 ? max - min : 0,
    });
  }
  return summaries;
}

/**
 * Computes the main effect of every experiment axis on a metric.
 *
 * An axis is any meta key outside `SKIP_KEYS` that appears in at least one
 * cell; cells lacking the key are grouped under "unknown". Axes with fewer
 * than two distinct values are omitted.
 *
 * @param runs - Runs to analyse.
 * @param metric - Name of a metric in `METRICS`; unknown names yield `[]`.
 * @returns One entry per axis, sorted by descending spread.
 */
export function computeMainEffects(
  runs: Run[],
  metric: string = "score"
): AxisEffect[] {
  const extract = lookupMetric(metric);
  if (!extract) return [];

  const scored = summarizeCells(runs, extract);
  if (scored.length === 0) return [];

  const grandMean = scored.reduce((s, c) => s + c.avg, 0) / scored.length;

  // Union of axis keys over all cells (first-seen order), so the outcome does
  // not depend on which cell happens to come first.
  const axisKeys: string[] = [];
  const seenKeys = new Set<string>();
  for (const { meta } of scored) {
    for (const k of Object.keys(meta)) {
      if (SKIP_KEYS.has(k) || seenKeys.has(k)) continue;
      seenKeys.add(k);
      axisKeys.push(k);
    }
  }

  const effects: AxisEffect[] = [];

  for (const axis of axisKeys) {
    // A Map keeps data-derived labels such as "constructor" from colliding
    // with members inherited from Object.prototype.
    const groups = new Map<string, { avgs: number[]; ranges: number[] }>();
    for (const { meta, avg, range } of scored) {
      const key = String((meta as Record<string, unknown>)[axis] ?? "unknown");
      let g = groups.get(key);
      if (!g) {
        g = { avgs: [], ranges: [] };
        groups.set(key, g);
      }
      g.avgs.push(avg);
      g.ranges.push(range);
    }

    if (groups.size < 2) continue;

    const values: EffectEntry[] = [];
    for (const [val, { avgs, ranges }] of groups) {
      const mean = meanOf(avgs);
      values.push({
        value: val,
        mean: round4(mean),
        effect: round4(mean - grandMean),
        variance: round4(meanOf(ranges)),
        n: avgs.length,
      });
    }

    const { min, max } = minMax(values.map((v) => v.mean));

    effects.push({
      axis,
      spread: round4(max - min),
      values: values.sort((a, b) => b.effect - a.effect),
    });
  }

  return effects.sort((a, b) => b.spread - a.spread);
}

/**
 * Builds the two-way table of a metric over two axes and measures how far it
 * departs from an additive (no-interaction) model.
 *
 * Cells missing an axis key are grouped under "?". Row, column and grand means
 * are taken over the combinations that are present, using unrounded values;
 * only the reported numbers are rounded to four decimals.
 *
 * @param runs - Runs to analyse.
 * @param axisA - Meta key used for the table rows.
 * @param axisB - Meta key used for the table columns.
 * @param metric - Name of a metric in `METRICS`; unknown names yield an empty
 *   table with `maxInteraction` 0.
 * @returns The table and the largest absolute residual
 *   `|cell - (rowMean + columnMean - grandMean)|`.
 */
export function computeInteraction(
  runs: Run[],
  axisA: string,
  axisB: string,
  metric: string = "score"
): InteractionResult {
  const extract = lookupMetric(metric);
  if (!extract) return { axisA, axisB, table: {}, maxInteraction: 0 };

  // Group cells by (axisA, axisB) combination.
  const groups = new Map<
    string,
    Map<string, { avgs: number[]; ranges: number[] }>
  >();
  for (const { meta, avg, range } of summarizeCells(runs, extract)) {
    const fields = meta as Record<string, unknown>;
    const a = String(fields[axisA] ?? "?");
    const b = String(fields[axisB] ?? "?");
    let row = groups.get(a);
    if (!row) {
      row = new Map();
      groups.set(a, row);
    }
    let g = row.get(b);
    if (!g) {
      g = { avgs: [], ranges: [] };
      row.set(b, g);
    }
    g.avgs.push(avg);
    g.ranges.push(range);
  }

  const table: Record<string, Record<string, InteractionCell>> = {};
  // Unrounded means, kept apart from the rounded values written to `table`.
  const exactMeans = new Map<string, Map<string, number>>();
  const allMeans: number[] = [];

  for (const [a, row] of groups) {
    const tableRow: Record<string, InteractionCell> = {};
    const exactRow = new Map<string, number>();
    for (const [b, { avgs, ranges }] of row) {
      const mean = meanOf(avgs);
      setOwn(tableRow, b, {
        mean: round4(mean),
        variance: round4(meanOf(ranges)),
        n: avgs.length,
      });
      exactRow.set(b, mean);
      allMeans.push(mean);
    }
    setOwn(table, a, tableRow);
    exactMeans.set(a, exactRow);
  }

  const grandMean = allMeans.length > 0 ? meanOf(allMeans) : 0;

  // Row and column means over the combinations that actually occur.
  const aMeans = new Map<string, number>();
  const bTotals = new Map<string, { sum: number; count: number }>();
  for (const [a, exactRow] of exactMeans) {
    aMeans.set(a, meanOf([...exactRow.values()]));
    for (const [b, mean] of exactRow) {
      const total = bTotals.get(b);
      if (total) {
        total.sum += mean;
        total.count += 1;
      } else {
        bTotals.set(b, { sum: mean, count: 1 });
      }
    }
  }

  // Max interaction = max deviation from the additive model.
  let maxInteraction = 0;
  for (const [a, exactRow] of exactMeans) {
    const rowMean = aMeans.get(a) ?? grandMean;
    for (const [b, actual] of exactRow) {
      const total = bTotals.get(b);
      const colMean = total ? total.sum / total.count : grandMean;
      const expected = rowMean + colMean - grandMean;
      maxInteraction = Math.max(maxInteraction, Math.abs(actual - expected));
    }
  }

  return {
    axisA,
    axisB,
    table,
    maxInteraction: round4(maxInteraction),
  };
}