loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

calibrate.astro (1750B)


      1 ---
      2 import Base from "../layouts/Base.astro";
      3 import Calibrate from "../components/Calibrate";
      4 import fs from "node:fs";
      5 import path from "node:path";
      6 import { loadAllRuns } from "../lib/data";
      7 
      8 // Shape each calibration JSON file is expected to have (parsed data is not validated against it).
      9 interface CalibrationEntry {
     10   run_id: string;
     11   short_id: string;
     12   label: string;
     13   notes: string;
     14   human_tested_at: string;
     15   human_tests: Record<string, boolean | null>; // presumably test name -> human verdict, null = not judged (confirm)
     16 }
     17 // Load every *.json file in the calibration directory, in sorted filename order; a missing directory yields [].
     18 const calibrationDir = path.resolve(process.cwd(), "../tasks/tetris/eval/gameplay-bot/calibration");
     19 const entries: CalibrationEntry[] = [];
     20 if (fs.existsSync(calibrationDir)) {
     21   for (const file of fs.readdirSync(calibrationDir).filter(name => name.endsWith(".json")).sort()) {
     22     try {
     23       entries.push(JSON.parse(fs.readFileSync(path.join(calibrationDir, file), "utf-8")));
     24     } catch (err) { // stay best-effort (one bad file must not fail the page), but report what was skipped
     25       console.warn(`[calibrate] Skipping unreadable calibration file "${file}":`, err);
     26     }
     27   }
     28 }
     29 
     30 // Index every benchmark run by its run_id so each calibration entry can look up its bot results.
     31 const runsByRunId = new Map(loadAllRuns().map(r => [r.meta.run_id, r]));
     32 
     33 // Pair each human-tested entry with the bot's score and per-test results for the React component.
     34 // An entry with no matching run (or no gameplay_bot results) gets botScore null and an empty botTests.
     35 const comparisons = entries.map((entry) => {
     36   const bot = (runsByRunId.get(entry.run_id)?.eval_results as any)?.gameplay_bot;
     37   const botScore = bot?.score ?? null;
     38   const botTests = (bot?.report?.tests ?? []) as Array<{name: string; pass: boolean; detail: string}>;
     39   const artifactUrl = `/artifacts/${entry.run_id}/index.html`;
     40   return { entry, botScore, botTests, artifactUrl };
     41 });
     42 ---
     43 
     44 <Base title="Bot Calibration">
     45   <h1 style="margin-bottom: 8px;">Bot Calibration</h1>
     46   <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
     47     Hand-picked games with human test results compared to bot results.
     48   </p>
     49 
     50   <Calibrate client:load comparisons={comparisons} />
     51 </Base>

Impressum · Datenschutz