data.ts - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

data.ts (7652B)
      1 /**
      2  * Data loading module. Server-side only (uses Node.js fs).
      3  * Types are re-exported from ./types.ts which is client-safe.
      4  */
      5 import { createHash } from "node:crypto";
      6 import fs from "node:fs";
      7 import path from "node:path";
      8 
      9 // Use process.cwd() which in Astro build is the dashboard/ directory.
     10 const RESULTS_DIR = path.resolve(process.cwd(), "../results");
     11 
     12 // Re-export types so existing imports from "./data" still work
     13 export type { RunMeta, EvalResults, ClaudeOutput, Run, AxisName } from "./types";
     14 export { AXIS_NAMES } from "./types";
     15 
     16 import type { RunMeta, EvalResults, ClaudeOutput, Run, AxisName } from "./types";
     17 import { AXIS_NAMES } from "./types";
     18 
     19 function shortHash(str: string): string {
     20   return createHash("sha256").update(str).digest("hex").slice(0, 8);
     21 }
     22 
     23 export function loadAllRuns(): Run[] {
     24   const runsDir = path.join(RESULTS_DIR, "runs");
     25   if (!fs.existsSync(runsDir)) return [];
     26 
     27   const runs: Run[] = [];
     28 
     29   for (const runId of fs.readdirSync(runsDir)) {
     30     const runDir = path.join(runsDir, runId);
     31     if (!fs.statSync(runDir).isDirectory()) continue;
     32 
     33     const metaPath = path.join(runDir, "meta.json");
     34     if (!fs.existsSync(metaPath)) continue;
     35 
     36     try {
     37       const meta: RunMeta = JSON.parse(fs.readFileSync(metaPath, "utf-8"));
     38 
     39       // Normalize old schema to new
     40       if ((meta as any).sub_agents && !meta.strategy) {
     41           meta.strategy = (meta as any).sub_agents === "on" ? "use_subagents" : "none";
     42       }
     43       if (meta.playwright === "on") {
     44           (meta as any).playwright = "available";
     45       }
     46       // Default new axes for old runs
     47       meta.tests_provided = meta.tests_provided || "none";
     48       meta.strategy = meta.strategy || "none";
     49       meta.design_guidance = meta.design_guidance || "none";
     50       meta.architecture = meta.architecture || "none";
     51       meta.error_checking = meta.error_checking || "none";
     52       meta.context_noise = meta.context_noise || "clean";
     53       meta.renderer = meta.renderer || "none";
     54       meta.provider = meta.provider || "anthropic";
     55       meta.actual_model = meta.actual_model || meta.model;
     56 
     57       // Normalize legacy model names to versioned
     58       const MODEL_RENAME: Record<string, string> = {
     59         haiku: "haiku-4.5", sonnet: "sonnet-4.6", opus: "opus-4.6",
     60       };
     61       if (MODEL_RENAME[meta.model]) meta.model = MODEL_RENAME[meta.model];
     62       if (MODEL_RENAME[meta.actual_model]) meta.actual_model = MODEL_RENAME[meta.actual_model];
     63 
     64       // Compute short IDs if not in meta (backwards compat)
     65       if (!meta.short_id && meta.run_id) {
     66         meta.short_id = shortHash(meta.run_id);
     67       }
     68       if (!meta.short_cell_id && meta.cell_id) {
     69         meta.short_cell_id = shortHash(meta.cell_id);
     70       }
     71 
     72       let eval_results: EvalResults | null = null;
     73       const evalPath = path.join(runDir, "eval_results.json");
     74       if (fs.existsSync(evalPath)) {
     75         eval_results = JSON.parse(fs.readFileSync(evalPath, "utf-8"));
     76       }
     77 
     78       let claude_output: ClaudeOutput | null = null;
     79       const outputPath = path.join(runDir, "claude_output.json");
     80       if (fs.existsSync(outputPath)) {
     81         claude_output = JSON.parse(fs.readFileSync(outputPath, "utf-8"));
     82       }
     83 
     84       const has_transcript = fs.existsSync(
     85         path.join(runDir, "transcript.jsonl")
     86       );
     87 
     88       runs.push({ meta, eval_results, claude_output, has_transcript });
     89     } catch {
     90       // Skip corrupted run data
     91       continue;
     92     }
     93   }
     94 
     95   return runs.sort((a, b) => a.meta.run_id.localeCompare(b.meta.run_id));
     96 }
     97 
     98 export function loadTranscript(runId: string): string[] {
     99   const transcriptPath = path.join(
    100     RESULTS_DIR,
    101     "runs",
    102     runId,
    103     "transcript.jsonl"
    104   );
    105   if (!fs.existsSync(transcriptPath)) return [];
    106   return fs
    107     .readFileSync(transcriptPath, "utf-8")
    108     .split("\n")
    109     .filter((line) => line.trim());
    110 }
    111 
    112 export function getAxisValues(runs: Run[]): Record<AxisName, string[]> {
    113   const values: Record<string, Set<string>> = {};
    114   for (const axis of AXIS_NAMES) {
    115     values[axis] = new Set();
    116   }
    117   for (const run of runs) {
    118     for (const axis of AXIS_NAMES) {
    119       values[axis].add(String(run.meta[axis]));
    120     }
    121   }
    122   const result: Record<string, string[]> = {};
    123   for (const axis of AXIS_NAMES) {
    124     result[axis] = Array.from(values[axis]).sort();
    125   }
    126   return result as Record<AxisName, string[]>;
    127 }
    128 
    129 export function getTaskNames(runs: Run[]): string[] {
    130   return Array.from(new Set(runs.map((r) => r.meta.task))).sort();
    131 }
    132 
    133 export interface AggregateStats {
    134   count: number;
    135   avg_score: number | null;
    136   avg_cost: number | null;
    137   avg_wall_time: number | null;
    138   avg_turns: number | null;
    139   pass_rate: number | null;
    140 }
    141 
    142 export function aggregateRuns(runs: Run[]): AggregateStats {
    143   if (runs.length === 0) {
    144     return {
    145       count: 0,
    146       avg_score: null,
    147       avg_cost: null,
    148       avg_wall_time: null,
    149       avg_turns: null,
    150       pass_rate: null,
    151     };
    152   }
    153 
    154   const scores = runs
    155     .map((r) => r.eval_results?.score)
    156     .filter((s): s is number => s !== null && s !== undefined);
    157 
    158   const costs = runs
    159     .map((r) => r.claude_output?.total_cost_usd)
    160     .filter((c): c is number => c !== undefined && c !== null);
    161 
    162   const wallTimes = runs
    163     .map((r) => r.meta.wall_time_seconds)
    164     .filter((t): t is number => t !== undefined && t !== null);
    165 
    166   const turns = runs
    167     .map((r) => r.claude_output?.num_turns)
    168     .filter((t): t is number => t !== undefined && t !== null);
    169 
    170   const passes = runs.filter(
    171     (r) => r.eval_results?.functional?.pass === true
    172   ).length;
    173 
    174   const avg = (arr: number[]) =>
    175     arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
    176 
    177   return {
    178     count: runs.length,
    179     avg_score: avg(scores),
    180     avg_cost: avg(costs),
    181     avg_wall_time: avg(wallTimes),
    182     avg_turns: avg(turns),
    183     pass_rate: runs.length > 0 ? passes / runs.length : null,
    184   };
    185 }
    186 
    187 /**
    188  * Trim a Run for serialization into an Astro island that only needs
    189  * summary-level fields. Drops per-test details, full SonarQube payloads,
    190  * code_analysis/transcript_analysis payloads, etc. -- the bulk of
    191  * eval_results.json content that bloats the index page HTML.
    192  *
    193  * Fields kept are the union of everything the index page islands
    194  * (Charts, Grid, TopBottomConfigs, StatisticalPowerCard) plus
    195  * analysis.groupIntoCells actually read.
    196  */
    197 export function projectRunForIndex(run: Run): Run {
    198   const er = run.eval_results as Record<string, any> | null;
    199   const slimEval: EvalResults | null = er
    200     ? {
    201         score: er.score ?? null,
    202         functional: er.functional
    203           ? { pass: er.functional.pass, score: er.functional.score }
    204           : undefined,
    205         structural: er.structural
    206           ? { pass: er.structural.pass, score: er.structural.score, checks: [] }
    207           : undefined,
    208         quality: er.quality ? { score: er.quality.score } : undefined,
    209         // Non-interface fields the analysis layer reads via `as any`.
    210         ...(er.gameplay_bot ? { gameplay_bot: { score: er.gameplay_bot.score } } : {}),
    211         ...(er.code_analysis ? { code_analysis: { score: er.code_analysis.score } } : {}),
    212         ...(er.transcript_analysis ? { transcript_analysis: { score: er.transcript_analysis.score } } : {}),
    213         ...(er.sonarqube ? { sonarqube: { score: er.sonarqube.score } } : {}),
    214       } as EvalResults
    215     : null;
    216 
    217   const slimOutput: ClaudeOutput | null = run.claude_output
    218     ? {
    219         total_cost_usd: run.claude_output.total_cost_usd,
    220         num_turns: run.claude_output.num_turns,
    221       }
    222     : null;
    223 
    224   return {
    225     meta: run.meta,
    226     eval_results: slimEval,
    227     claude_output: slimOutput,
    228     has_transcript: run.has_transcript,
    229   };
    230 }
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README