loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

index.ts (11441B)


      1 import { test } from "@playwright/test";
      2 import { execSync, spawn, type ChildProcess } from "node:child_process";
      3 import * as fs from "node:fs";
      4 import * as path from "node:path";
      5 import * as net from "node:net";
      6 import type { BotReport } from "./types";
      7 import { PlaywrightDriver } from "./driver";
      8 import { runAllTests } from "./bot";
      9 
     10 /**
     11  * Find an available port by briefly binding to port 0.
     12  */
     13 async function findFreePort(): Promise<number> {
     14   return new Promise((resolve, reject) => {
     15     const server = net.createServer();
     16     server.listen(0, () => {
     17       const addr = server.address();
     18       if (addr && typeof addr === "object") {
     19         const port = addr.port;
     20         server.close(() => resolve(port));
     21       } else {
     22         server.close(() => reject(new Error("could not determine port")));
     23       }
     24     });
     25     server.on("error", reject);
     26   });
     27 }
     28 
     29 /**
     30  * Start a simple HTTP server to serve workspace files.
     31  */
     32 async function startServer(workspacePath: string, port: number): Promise<ChildProcess> {
     33   let serverProc: ChildProcess;
     34 
     35   try {
     36     execSync("npx serve --version", { stdio: "ignore", timeout: 5000 });
     37     serverProc = spawn("npx", ["serve", "-l", String(port), "-s", "--no-clipboard"], {
     38       cwd: workspacePath,
     39       stdio: "ignore",
     40     });
     41   } catch {
     42     serverProc = spawn("python3", ["-m", "http.server", String(port)], {
     43       cwd: workspacePath,
     44       stdio: "ignore",
     45     });
     46   }
     47 
     48   const maxWait = 10000;
     49   const start = Date.now();
     50   while (Date.now() - start < maxWait) {
     51     try {
     52       await new Promise<void>((resolve, reject) => {
     53         const socket = net.createConnection({ port, host: "127.0.0.1" }, () => {
     54           socket.destroy();
     55           resolve();
     56         });
     57         socket.on("error", reject);
     58         socket.setTimeout(500, () => {
     59           socket.destroy();
     60           reject(new Error("timeout"));
     61         });
     62       });
     63       return serverProc;
     64     } catch {
     65       await new Promise((r) => setTimeout(r, 200));
     66     }
     67   }
     68 
     69   throw new Error(`server did not start on port ${port} within ${maxWait}ms`);
     70 }
     71 
     72 test.describe("Tetris Gameplay Bot v2", () => {
     73   let serverProc: ChildProcess | null = null;
     74   let serverUrl: string;
     75 
     76   test.beforeAll(async () => {
     77     const workspacePath =
     78       process.env.WORKSPACE_PATH || process.env.TETRIS_WORKSPACE || process.cwd();
     79     const port = await findFreePort();
     80     serverProc = await startServer(workspacePath, port);
     81     serverUrl = `http://127.0.0.1:${port}`;
     82   });
     83 
     84   test.afterAll(async () => {
     85     if (serverProc) {
     86       serverProc.kill("SIGTERM");
     87       serverProc = null;
     88     }
     89   });
     90 
     91   test("run gameplay bot", async ({ page }) => {
     92     test.setTimeout(900_000); // 15-minute total timeout; driver watchdog aborts earlier on inactivity
     93 
     94     // Measure page load time
     95     let loadTimeMs = -1;
     96     try {
     97       const loadStart = Date.now();
     98       await page.goto(serverUrl, { waitUntil: "load", timeout: 10000 });
     99       loadTimeMs = Date.now() - loadStart;
    100       await page.goto("about:blank");
    101     } catch {
    102       // Load time measurement failed, not critical
    103     }
    104 
    105     // Create the Driver (which gets the Playwright Page)
    106     const driver = new PlaywrightDriver(page);
    107 
    108     // Create the Bot (which gets the Driver) and run everything
    109     const { testResults, calibration, gameplay, session, survey, competitivePlay, calibrationDrift } =
    110       await runAllTests(driver, serverUrl);
    111 
    112     // Accessibility check
    113     let a11yIssues: string[] = [];
    114     try {
    115       await page.goto(serverUrl, { timeout: 10000 });
    116       await page.waitForTimeout(1000);
    117       a11yIssues = await page.evaluate(() => {
    118         const issues: string[] = [];
    119         if (!document.title || document.title.trim() === "") issues.push("missing page title");
    120         if (document.querySelectorAll("h1, h2, h3, [role='heading']").length === 0) issues.push("no headings found");
    121         document.querySelectorAll("img").forEach((img) => {
    122           if (!img.alt && !img.getAttribute("aria-label")) issues.push("image without alt text");
    123         });
    124         document.querySelectorAll("canvas").forEach((canvas) => {
    125           if (!canvas.getAttribute("aria-label") && !canvas.getAttribute("role")) issues.push("canvas without aria-label or role");
    126         });
    127         const focusable = document.querySelectorAll("button, a, input, [tabindex]");
    128         if (focusable.length === 0 && document.querySelectorAll("canvas").length === 0) issues.push("no focusable elements");
    129         const body = window.getComputedStyle(document.body);
    130         if (body.backgroundColor === body.color) issues.push("text color matches background color");
    131         return issues;
    132       });
    133     } catch {
    134       // a11y check failed, not critical
    135     }
    136 
    137     const passed = testResults.filter((t) => t.pass).length;
    138     const skipped = testResults.filter((t) => t.detail.startsWith("skipped:")).length;
    139     const failed = testResults.filter((t) => !t.pass && !t.detail.startsWith("skipped:")).length;
    140     const total = testResults.length;
    141     const scorable = total - skipped;
    142 
    143     const totalReads = session.gridReadSuccess + session.gridReadFail;
    144     const gridSuccessRate = totalReads > 0 ? session.gridReadSuccess / totalReads : 0;
    145 
    146     // Clean competitive play result
    147     let cleanCompetitivePlay = competitivePlay;
    148     if (cleanCompetitivePlay) {
    149       const { _ccwResult, _ccwTestDone, _softDropDistinct, _softDropTestDone, ...clean } =
    150         cleanCompetitivePlay as any;
    151       cleanCompetitivePlay = clean;
    152     }
    153 
    154     // Build the control_discovery report field from the discovered map
    155     // (may be null if discovery never ran because the game didn't start).
    156     let controlDiscoveryReport: Record<string, string> | undefined;
    157     if (calibration.controlMap) {
    158       const cm = calibration.controlMap;
    159       controlDiscoveryReport = {};
    160       const actionDescriptions: Array<[string, { key: string | null; observation: string; confidence: string }]> = [
    161         ["move_left", cm.move_left],
    162         ["move_right", cm.move_right],
    163         ["soft_drop", cm.soft_drop],
    164         ["hard_drop", cm.hard_drop],
    165         ["rotate_cw", cm.rotate_cw],
    166         ["rotate_ccw", cm.rotate_ccw],
    167       ];
    168       for (const [name, mapping] of actionDescriptions) {
    169         if (mapping.key) {
    170           controlDiscoveryReport[name] = `${mapping.key}${mapping.observation ? ` (${mapping.observation})` : ""}`;
    171         } else {
    172           controlDiscoveryReport[name] = "NOT FOUND";
    173         }
    174       }
    175       // Also report per-key observations so readers can see what every
    176       // candidate did during discovery.
    177       for (const [key, obs] of Object.entries(cm.key_observations)) {
    178         if (obs) controlDiscoveryReport[`key:${key}`] = obs;
    179       }
    180     }
    181 
    182     const report: BotReport = {
    183       implementation: {
    184         renderer: calibration.renderer,
    185         grid_detected: calibration.gridDetected,
    186         grid_detected_at: calibration.gridDetectedAt || "initial",
    187         grid_bounds: calibration.gridBounds,
    188         controls: calibration.controls as unknown as Record<string, string>,
    189         control_discovery: controlDiscoveryReport,
    190         start_mechanism: calibration.startMechanism,
    191         score_element_found: calibration.scoreElementSelector !== null,
    192         grid_confidence: calibration.gridConfidence,
    193         survey,
    194       },
    195       tests: testResults.map((t) => ({ name: t.name, pass: t.pass, detail: t.detail })),
    196       summary: {
    197         total,
    198         passed,
    199         failed,
    200         skipped,
    201         score: scorable > 0 ? Math.round((passed / scorable) * 100) / 100 : 0,
    202       },
    203       gameplay,
    204       competitive_play: cleanCompetitivePlay,
    205       session: {
    206         frames: session.frames,
    207         events_count: session.events.length,
    208         pieces_spawned: session.piecesSpawned,
    209         pieces_locked: session.piecesLocked,
    210         lines_cleared: session.linesCleared,
    211         piece_types_seen: [...session.pieceTypes],
    212         grid_read_success_rate: Math.round(gridSuccessRate * 100) / 100,
    213       },
    214       performance: {
    215         load_time_ms: loadTimeMs,
    216       },
    217       accessibility: {
    218         issues: a11yIssues,
    219         issue_count: a11yIssues.length,
    220         pass: a11yIssues.length === 0,
    221       },
    222       calibration_drift: calibrationDrift,
    223     };
    224 
    225     // Write report to file
    226     const reportPath =
    227       process.env.REPORT_OUTPUT_PATH ||
    228       path.join(process.cwd(), "gameplay-bot-report.json");
    229 
    230     const reportDir = path.dirname(reportPath);
    231     if (!fs.existsSync(reportDir)) {
    232       fs.mkdirSync(reportDir, { recursive: true });
    233     }
    234 
    235     fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), "utf-8");
    236 
    237     // Log summary
    238     console.log("\n=== Gameplay Bot v2 Report ===");
    239     console.log(`Renderer: ${calibration.renderer}`);
    240     console.log(`Grid detected: ${calibration.gridDetected} (at: ${calibration.gridDetectedAt})`);
    241     console.log(`Grid confidence: ${Math.round(calibration.gridConfidence * 100)}%`);
    242     console.log(`Grid read success rate: ${Math.round(gridSuccessRate * 100)}%`);
    243     console.log(`Start mechanism: ${calibration.startMechanism}`);
    244     console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`);
    245     if (calibration.controlMap) {
    246       const cm = calibration.controlMap;
    247       console.log(
    248         `Controls: left=${cm.move_left.key ?? "?"}, right=${cm.move_right.key ?? "?"}, ` +
    249         `rotate=${cm.rotate_cw.key ?? "?"}, hard_drop=${cm.hard_drop.key ?? "?"}, ` +
    250         `soft_drop=${cm.soft_drop.key ?? "NONE"}`
    251       );
    252     }
    253     console.log(`\nTests: ${passed}/${total} passed, ${skipped} skipped, ${failed} failed`);
    254     console.log(`Score: ${report.summary.score} (${passed}/${scorable} scorable)`);
    255     for (const t of testResults) {
    256       const status = t.detail.startsWith("skipped:") ? "SKIP" : t.pass ? "PASS" : "FAIL";
    257       console.log(`  ${status} ${t.name}: ${t.detail}`);
    258     }
    259     console.log(`\nSession: ${session.frames} frames, ${session.events.length} events`);
    260     console.log(`  Pieces spawned: ${session.piecesSpawned}, locked: ${session.piecesLocked}`);
    261     console.log(`  Lines cleared: ${session.linesCleared}`);
    262     console.log(`  Piece types: [${[...session.pieceTypes].join(", ")}]`);
    263     console.log(`\nGameplay: ${gameplay.pieces_placed} pieces, ${gameplay.lines_cleared} lines`);
    264     if (competitivePlay) {
    265       console.log(`\nCompetitive play: ${competitivePlay.pieces_placed} pieces, ${competitivePlay.total_lines_cleared} lines`);
    266       console.log(`  Clears: ${competitivePlay.single_clears}x single, ${competitivePlay.double_clears}x double, ${competitivePlay.triple_clears}x triple, ${competitivePlay.tetris_clears}x tetris`);
    267       console.log(`  Score: ${competitivePlay.score_final}, Level: ${competitivePlay.level_final}`);
    268       if (competitivePlay.bugs_detected.length > 0) {
    269         console.log(`  Bugs: [${competitivePlay.bugs_detected.join(", ")}]`);
    270       }
    271     }
    272     console.log(
    273       `\nCalibration cache: ${calibrationDrift.cacheHits} hits / ${calibrationDrift.cacheMisses} misses ` +
    274       `(${calibrationDrift.recalibrations} recalibrations)` +
    275       (calibrationDrift.drifted ? ` -- DRIFTED: [${calibrationDrift.changes.join(", ")}]` : "")
    276     );
    277     console.log(`\nSurvey: canvas=${survey.has_canvas}, dom_grid=${survey.has_dom_grid}, overlay=${survey.has_overlay}, clickable=${survey.clickable_elements}`);
    278     console.log(`Report written to: ${reportPath}`);
    279     console.log("==============================\n");
    280   });
    281 });

Impressum · Datenschutz