loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

index.ts (10084B)


      1 import { test } from "@playwright/test";
      2 import { execSync, spawn, type ChildProcess } from "node:child_process";
      3 import * as fs from "node:fs";
      4 import * as path from "node:path";
      5 import * as net from "node:net";
      6 import type { BotReport } from "./types";
      7 import { runAllTests } from "./tests";
      8 
      /**
       * Ask the operating system for a TCP port that is currently unused.
       *
       * A throwaway server is bound to port 0 so that the OS assigns a port. The
       * server is closed again before the promise settles, so nothing keeps the
       * port reserved for the caller afterwards.
       *
       * @returns The port number the OS assigned to the throwaway server.
       * @throws Rejects if the server emits an error, or with
       *   "could not determine port" if the bound address is not an object.
       */
      async function findFreePort(): Promise<number> {
        const probe = net.createServer();

        return new Promise<number>((resolve, reject) => {
          const onListening = (): void => {
            const bound = probe.address();

            // address() may return null or a string; only the object form
            // carries a numeric port.
            if (bound === null || typeof bound !== "object") {
              probe.close(() => reject(new Error("could not determine port")));
              return;
            }

            const { port } = bound;
            probe.close(() => resolve(port));
          };

          probe.listen(0, onListening);
          probe.on("error", reject);
        });
      }
     27 
     /**
      * Spawn the static file server child process for `workspacePath` on `port`.
      *
      * Uses `npx serve` when `npx serve --version` succeeds within 5 seconds;
      * if that check (or the spawn call itself) throws, falls back to
      * `python3 -m http.server`.
      */
     function spawnStaticServer(workspacePath: string, port: number): ChildProcess {
       try {
         execSync("npx serve --version", { stdio: "ignore", timeout: 5000 });
         return spawn("npx", ["serve", "-l", String(port), "-s", "--no-clipboard"], {
           cwd: workspacePath,
           stdio: "ignore",
         });
       } catch {
         // Fallback to python
         return spawn("python3", ["-m", "http.server", String(port)], {
           cwd: workspacePath,
           stdio: "ignore",
         });
       }
     }

     /**
      * Probe whether something accepts TCP connections on 127.0.0.1:`port`.
      * Resolves `true` on connect, `false` on socket error or after 500ms.
      */
     function canConnect(port: number): Promise<boolean> {
       return new Promise<boolean>((resolve) => {
         const socket = net.createConnection({ port, host: "127.0.0.1" }, () => {
           socket.destroy();
           resolve(true);
         });
         socket.on("error", () => {
           socket.destroy();
           resolve(false);
         });
         socket.setTimeout(500, () => {
           socket.destroy();
           resolve(false);
         });
       });
     }

     /**
      * Start a simple HTTP server to serve workspace files.
      * Tries `npx serve` first, then `python3 -m http.server`.
      *
      * Polls 127.0.0.1:`port` until a TCP connection succeeds, for up to 10 seconds.
      *
      * @param workspacePath Directory used as the server's working directory.
      * @param port Port the server is asked to listen on.
      * @returns The spawned server process once the port accepts connections.
      * @throws Error if the child process fails to launch, exits before the port
      *   accepts connections, or the port is not reachable within the wait window
      *   (in which case the child is sent SIGTERM before throwing).
      */
     async function startServer(workspacePath: string, port: number): Promise<ChildProcess> {
       const serverProc = spawnStaticServer(workspacePath, port);

       // Rejects as soon as the child reports a launch error or exits. Without an
       // "error" listener a failed spawn (e.g. missing binary) would surface as an
       // uncaught exception instead of a rejected startServer() call.
       const earlyFailure = new Promise<never>((_resolve, reject) => {
         serverProc.on("error", (err: Error) => {
           reject(new Error(`server process failed: ${err.message}`));
         });
         serverProc.once("exit", (code, signal) => {
           reject(
             new Error(
               `server process exited before accepting connections on port ${port} ` +
                 `(code ${code}, signal ${signal})`,
             ),
           );
         });
       });
       // The child also exits when it is stopped normally later on; mark the
       // rejection as handled so that does not become an unhandled rejection.
       earlyFailure.catch(() => undefined);

       // Wait for the server to be ready
       const maxWait = 10000;
       const pollInterval = 200;
       const start = Date.now();
       while (Date.now() - start < maxWait) {
         // Racing against earlyFailure makes a dead child fail fast instead of
         // polling until the deadline.
         if (await Promise.race([canConnect(port), earlyFailure])) {
           return serverProc;
         }
         await Promise.race([
           new Promise<void>((r) => setTimeout(r, pollInterval)),
           earlyFailure,
         ]);
       }

       // Do not leave an orphaned server process behind when giving up.
       serverProc.kill("SIGTERM");
       throw new Error(`server did not start on port ${port} within ${maxWait}ms`);
     }
     74 
     /**
      * Playwright suite that serves the workspace over HTTP, runs the gameplay bot
      * against it, and writes a JSON report.
      *
      * Environment variables read here:
      * - WORKSPACE_PATH, then TETRIS_WORKSPACE: directory to serve (default: cwd).
      * - REPORT_OUTPUT_PATH: report file path (default: gameplay-bot-report.json in cwd).
      */
     test.describe("Tetris Gameplay Bot", () => {
       let serverProc: ChildProcess | null = null;
       let serverUrl: string;

       // Serve the workspace on a free local port for the duration of the suite.
       test.beforeAll(async () => {
         const workspacePath =
           process.env.WORKSPACE_PATH || process.env.TETRIS_WORKSPACE || process.cwd();
         const port = await findFreePort();
         serverProc = await startServer(workspacePath, port);
         serverUrl = `http://127.0.0.1:${port}`;
       });

       // Stop the server process started in beforeAll, if any.
       test.afterAll(async () => {
         if (serverProc) {
           serverProc.kill("SIGTERM");
           serverProc = null;
         }
       });

       test("run gameplay bot", async ({ page }) => {
         test.setTimeout(300_000); // 5-minute total timeout (competitive play adds time)

         // Measure page load time; -1 means the measurement failed.
         let loadTimeMs = -1;
         try {
           const loadStart = Date.now();
           await page.goto(serverUrl, { waitUntil: "load", timeout: 10000 });
           loadTimeMs = Date.now() - loadStart;
           // Navigate away so runAllTests starts fresh
           await page.goto("about:blank");
         } catch {
           // Load time measurement failed, not critical
         }

         const { testResults, calibration, gameplay, session, survey, competitivePlay } =
           await runAllTests(page, serverUrl);

         // Accessibility check via page evaluation (lightweight, no axe-core dependency)
         let a11yIssues: string[] = [];
         try {
           await page.goto(serverUrl, { timeout: 10000 });
           await page.waitForTimeout(1000);
           a11yIssues = await page.evaluate(() => {
             const issues: string[] = [];
             // Check page title
             if (!document.title || document.title.trim() === "") {
               issues.push("missing page title");
             }
             // Check for headings
             if (document.querySelectorAll("h1, h2, h3, [role='heading']").length === 0) {
               issues.push("no headings found");
             }
             // Check images have alt text
             document.querySelectorAll("img").forEach((img) => {
               if (!img.alt && !img.getAttribute("aria-label")) {
                 issues.push("image without alt text");
               }
             });
             // Check canvas has accessible label
             document.querySelectorAll("canvas").forEach((canvas) => {
               if (!canvas.getAttribute("aria-label") && !canvas.getAttribute("role")) {
                 issues.push("canvas without aria-label or role");
               }
             });
             // Check that something focusable exists (pages with a canvas are exempt)
             const focusable = document.querySelectorAll("button, a, input, [tabindex]");
             if (focusable.length === 0 && document.querySelectorAll("canvas").length === 0) {
               issues.push("no focusable elements");
             }
             // Coarse contrast check: only flags body text color identical to body background
             const body = window.getComputedStyle(document.body);
             const bgColor = body.backgroundColor;
             const textColor = body.color;
             if (bgColor === textColor) {
               issues.push("text color matches background color");
             }
             return issues;
           });
         } catch {
           // a11y check failed, not critical
         }

         // A result whose detail starts with "skipped:" counts only as skipped, so
         // passed + failed + skipped always equals total and score stays in [0, 1].
         const isSkipped = (t: { detail: string }): boolean => t.detail.startsWith("skipped:");
         const skipped = testResults.filter((t) => isSkipped(t)).length;
         const passed = testResults.filter((t) => t.pass && !isSkipped(t)).length;
         const failed = testResults.filter((t) => !t.pass && !isSkipped(t)).length;
         const total = testResults.length;
         const scorable = total - skipped;

         const totalReads = session.gridReadSuccess + session.gridReadFail;
         const gridSuccessRate = totalReads > 0 ? session.gridReadSuccess / totalReads : 0;

         // Clean competitive play result (remove internal tracking fields)
         let cleanCompetitivePlay = competitivePlay;
         if (cleanCompetitivePlay) {
           const { _ccwResult, _ccwTestDone, _softDropDistinct, _softDropTestDone, ...clean } =
             cleanCompetitivePlay as any;
           cleanCompetitivePlay = clean;
         }

         const report: BotReport = {
           implementation: {
             renderer: calibration.renderer,
             grid_detected: calibration.gridDetected,
             grid_detected_at: (calibration as any).grid_detected_at || "initial",
             grid_bounds: calibration.gridBounds,
             controls: calibration.controls as unknown as Record<string, string>,
             start_mechanism: calibration.startMechanism,
             score_element_found: calibration.scoreElementSelector !== null,
             grid_confidence: calibration.gridConfidence,
             survey,
           },
           tests: testResults.map((t) => ({ name: t.name, pass: t.pass, detail: t.detail })),
           summary: {
             total,
             passed,
             failed,
             skipped,
             // Fraction of non-skipped tests that passed, rounded to two decimals.
             score: scorable > 0 ? Math.round((passed / scorable) * 100) / 100 : 0,
           },
           gameplay,
           competitive_play: cleanCompetitivePlay,
           session: {
             frames: session.frames,
             events_count: session.events.length,
             pieces_spawned: session.piecesSpawned,
             pieces_locked: session.piecesLocked,
             lines_cleared: session.linesCleared,
             piece_types_seen: [...session.pieceTypes],
             grid_read_success_rate: Math.round(gridSuccessRate * 100) / 100,
           },
           performance: {
             load_time_ms: loadTimeMs,
           },
           accessibility: {
             issues: a11yIssues,
             issue_count: a11yIssues.length,
             pass: a11yIssues.length === 0,
           },
         };

         // Write report to file
         const reportPath =
           process.env.REPORT_OUTPUT_PATH ||
           path.join(process.cwd(), "gameplay-bot-report.json");

         // Ensure output directory exists (recursive mkdir is a no-op if it already does)
         fs.mkdirSync(path.dirname(reportPath), { recursive: true });

         fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), "utf-8");

         // Log summary to console for visibility
         console.log("\n=== Gameplay Bot Report ===");
         console.log(`Renderer: ${calibration.renderer}`);
         console.log(`Grid detected: ${calibration.gridDetected}`);
         console.log(`Grid confidence: ${Math.round(calibration.gridConfidence * 100)}%`);
         console.log(`Grid read success rate: ${Math.round(gridSuccessRate * 100)}%`);
         console.log(`Start mechanism: ${calibration.startMechanism}`);
         console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`);
         console.log(`\nTests: ${passed}/${total} passed, ${skipped} skipped, ${failed} failed`);
         console.log(`Score: ${report.summary.score} (${passed}/${scorable} scorable)`);
         for (const t of testResults) {
           const status = isSkipped(t) ? "SKIP" : t.pass ? "PASS" : "FAIL";
           console.log(`  ${status} ${t.name}: ${t.detail}`);
         }
         console.log(`\nSession: ${session.frames} frames, ${session.events.length} events`);
         console.log(`  Pieces spawned: ${session.piecesSpawned}, locked: ${session.piecesLocked}`);
         console.log(`  Lines cleared: ${session.linesCleared}`);
         console.log(`  Piece types: [${[...session.pieceTypes].join(", ")}]`);
         console.log(`\nGameplay: ${gameplay.pieces_placed} pieces, ${gameplay.lines_cleared} lines`);
         if (competitivePlay) {
           console.log(`\nCompetitive play: ${competitivePlay.pieces_placed} pieces, ${competitivePlay.total_lines_cleared} lines`);
           console.log(`  Clears: ${competitivePlay.single_clears}x single, ${competitivePlay.double_clears}x double, ${competitivePlay.triple_clears}x triple, ${competitivePlay.tetris_clears}x tetris`);
           console.log(`  Score: ${competitivePlay.score_final}, Level: ${competitivePlay.level_final}`);
           if (competitivePlay.bugs_detected.length > 0) {
             console.log(`  Bugs: [${competitivePlay.bugs_detected.join(", ")}]`);
           }
         }
         console.log(`\nSurvey: canvas=${survey.has_canvas}, dom_grid=${survey.has_dom_grid}, overlay=${survey.has_overlay}, clickable=${survey.clickable_elements}`);
         console.log(`Report written to: ${reportPath}`);
         console.log("===========================\n");

         // Always pass the Playwright test -- results are in the report
       });
     });

Impressum · Datenschutz