commit fe319f04a3475f06a33c8d380059948714ea6455
parent a1dcd8c3630ee94ad18d319107152852e745631c
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 07:52:22 +0200
Add gameplay bot, language=unspecified option, bump Playwright timeout
Gameplay bot (Opus-built):
- 6 TypeScript files in tasks/tetris/eval/gameplay-bot/
- Calibrates to any Tetris implementation (canvas/DOM, any controls)
- Runs 15 independent tests (game loads, auto-drop, movement, rotation,
line clear, game over, 30s endurance, etc.)
- Uses 4-heuristic AI (height, lines, holes, bumpiness) for gameplay
- Falls back to random inputs if grid reading fails
- Outputs JSON report with per-test pass/fail and gameplay stats
Tested against existing artifact: 7/15 passed (correctly detected
auto-drop bug where pieces don't fall on their own).
Language axis: added "unspecified" value - agent gets no language
instruction, chooses on its own.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
12 files changed, 2244 insertions(+), 1 deletion(-)
diff --git a/grid.yaml b/grid.yaml
@@ -15,7 +15,7 @@ axes:
prompt_style:
values: [simple, detailed]
language:
- values: [typescript, javascript]
+ values: [typescript, javascript, unspecified]
human_language:
values: [en, es]
tool_read:
diff --git a/package-lock.json b/package-lock.json
@@ -0,0 +1,112 @@
+{
+ "name": "loop-benchmarking",
+ "version": "1.0.0",
+ "lockfileVersion": 3,
+ "requires": true,
+ "packages": {
+ "": {
+ "name": "loop-benchmarking",
+ "version": "1.0.0",
+ "license": "ISC",
+ "devDependencies": {
+ "@playwright/test": "^1.59.1",
+ "@types/node": "^25.5.2",
+ "typescript": "^6.0.2"
+ }
+ },
+ "node_modules/@playwright/test": {
+ "version": "1.59.1",
+ "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.59.1.tgz",
+ "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==",
+ "dev": true,
+ "license": "Apache-2.0",
+ "dependencies": {
+ "playwright": "1.59.1"
+ },
+ "bin": {
+ "playwright": "cli.js"
+ },
+ "engines": {
+ "node": ">=18"
+ }
+ },
+ "node_modules/@types/node": {
+ "version": "25.5.2",
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-25.5.2.tgz",
+ "integrity": "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg==",
+ "dev": true,
+ "license": "MIT",
+ "dependencies": {
+ "undici-types": "~7.18.0"
+ }
+ },
+ "node_modules/fsevents": {
+ "version": "2.3.2",
+ "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+ "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+ "dev": true,
+ "hasInstallScript": true,
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "darwin"
+ ],
+ "engines": {
+ "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+ }
+ },
+ "node_modules/playwright": {
+ "version": "1.59.1",
+ "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
+ "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
+ "dev": true,
+ "license": "Apache-2.0",
+ "dependencies": {
+ "playwright-core": "1.59.1"
+ },
+ "bin": {
+ "playwright": "cli.js"
+ },
+ "engines": {
+ "node": ">=18"
+ },
+ "optionalDependencies": {
+ "fsevents": "2.3.2"
+ }
+ },
+ "node_modules/playwright-core": {
+ "version": "1.59.1",
+ "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
+ "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
+ "dev": true,
+ "license": "Apache-2.0",
+ "bin": {
+ "playwright-core": "cli.js"
+ },
+ "engines": {
+ "node": ">=18"
+ }
+ },
+ "node_modules/typescript": {
+ "version": "6.0.2",
+ "resolved": "https://registry.npmjs.org/typescript/-/typescript-6.0.2.tgz",
+ "integrity": "sha512-bGdAIrZ0wiGDo5l8c++HWtbaNCWTS4UTv7RaTH/ThVIgjkveJt83m74bBHMJkuCbslY8ixgLBVZJIOiQlQTjfQ==",
+ "dev": true,
+ "license": "Apache-2.0",
+ "bin": {
+ "tsc": "bin/tsc",
+ "tsserver": "bin/tsserver"
+ },
+ "engines": {
+ "node": ">=14.17"
+ }
+ },
+ "node_modules/undici-types": {
+ "version": "7.18.2",
+ "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz",
+ "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==",
+ "dev": true,
+ "license": "MIT"
+ }
+ }
+}
diff --git a/package.json b/package.json
@@ -0,0 +1,22 @@
+{
+ "name": "loop-benchmarking",
+ "version": "1.0.0",
+ "description": "An open benchmark for comparing agentic coding loop configurations. Same task, different setups, all data public.",
+ "main": "index.js",
+ "devDependencies": {
+ "@playwright/test": "^1.59.1",
+ "@types/node": "^25.5.2",
+ "typescript": "^6.0.2"
+ },
+ "scripts": {
+ "test": "echo \"Error: no test specified\" && exit 1"
+ },
+ "repository": {
+ "type": "git",
+ "url": "https://git.statagroup.com/research/loop-benchmarking.git"
+ },
+ "keywords": [],
+ "author": "",
+ "license": "ISC",
+ "type": "commonjs"
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts
@@ -0,0 +1,439 @@
+import type { Page } from "@playwright/test";
+import type {
+ CalibrationResult,
+ Controls,
+ GridBounds,
+ RendererType,
+ StartMechanism,
+} from "./types";
+import { sampleBackgroundColor } from "./grid-reader";
+
+const DEFAULT_CONTROLS: Controls = {
+ left: "ArrowLeft",
+ right: "ArrowRight",
+ down: "ArrowDown",
+ rotate: "ArrowUp",
+ drop: "Space",
+};
+
+/**
+ * Run all calibration steps. Never throws -- returns a result
+ * with whatever could be detected.
+ */
+export async function calibrate(page: Page): Promise<CalibrationResult> {
+ const consoleErrors: string[] = [];
+ page.on("pageerror", (err) => consoleErrors.push(err.message));
+
+ const startMechanism = await detectStartMechanism(page);
+ const { renderer, gridBounds, cellWidth, cellHeight } = await detectGrid(page);
+ const backgroundColor =
+ renderer === "canvas" && gridBounds
+ ? await sampleBackgroundColor(page, gridBounds, cellWidth, cellHeight)
+ : null;
+ const controls = await detectControls(page);
+ const scoreElementSelector = await detectScoreElement(page);
+
+ return {
+ renderer,
+ gridDetected: gridBounds !== null,
+ gridBounds,
+ cellWidth,
+ cellHeight,
+ controls,
+ startMechanism,
+ scoreElementSelector,
+ backgroundColor,
+ consoleErrors,
+ };
+}
+
+/**
+ * Try multiple mechanisms to start the game.
+ * Takes a screenshot before and after each attempt, comparing
+ * to see if the game state changed.
+ */
+async function detectStartMechanism(page: Page): Promise<StartMechanism> {
+ // Take initial screenshot
+ let prevShot = await page.screenshot();
+
+ // 1. Wait 3 seconds (auto-start)
+ await page.waitForTimeout(3000);
+ let newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "auto";
+ }
+ prevShot = newShot;
+
+ // 2. Click the canvas or game container
+ try {
+ const canvas = page.locator("canvas").first();
+ if ((await canvas.count()) > 0) {
+ await canvas.click();
+ await page.waitForTimeout(500);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "click_canvas";
+ }
+ prevShot = newShot;
+ }
+ } catch { /* continue */ }
+
+ // Try clicking any game-like container
+ try {
+ const container = page.locator(
+ '[class*="game"], [class*="board"], [id*="game"], [id*="board"]'
+ ).first();
+ if ((await container.count()) > 0) {
+ await container.click();
+ await page.waitForTimeout(500);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "click_canvas";
+ }
+ prevShot = newShot;
+ }
+ } catch { /* continue */ }
+
+ // 3. Press Enter
+ await page.keyboard.press("Enter");
+ await page.waitForTimeout(500);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "enter";
+ }
+ prevShot = newShot;
+
+ // 4. Press Space
+ await page.keyboard.press("Space");
+ await page.waitForTimeout(500);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "space";
+ }
+ prevShot = newShot;
+
+ // 5. Look for a start/play button
+ try {
+ const button = page.locator("button, a, [role='button']").filter({
+ hasText: /start|play|begin|new game/i,
+ }).first();
+ if ((await button.count()) > 0) {
+ await button.click();
+ await page.waitForTimeout(500);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "button";
+ }
+ prevShot = newShot;
+ }
+ } catch { /* continue */ }
+
+ // Also try elements that aren't buttons but have matching text
+ try {
+ const textMatch = page.locator(
+ ':text-matches("start|play|begin|new.game", "i")'
+ ).first();
+ if ((await textMatch.count()) > 0) {
+ await textMatch.click();
+ await page.waitForTimeout(500);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "button";
+ }
+ prevShot = newShot;
+ }
+ } catch { /* continue */ }
+
+ // 6. Press any key (try a few)
+ for (const key of ["p", "s", "n", "Escape"]) {
+ await page.keyboard.press(key);
+ await page.waitForTimeout(300);
+ newShot = await page.screenshot();
+ if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) {
+ return "anykey";
+ }
+ prevShot = newShot;
+ }
+
+ return "unknown";
+}
+
+interface GridDetection {
+ renderer: RendererType;
+ gridBounds: GridBounds | null;
+ cellWidth: number;
+ cellHeight: number;
+}
+
+/**
+ * Detect the game grid: canvas, DOM-based, or SVG.
+ */
+async function detectGrid(page: Page): Promise<GridDetection> {
+ // Check for canvas
+ try {
+ const canvasCount = await page.locator("canvas").count();
+ if (canvasCount > 0) {
+ const bounds = await page.locator("canvas").first().boundingBox();
+ if (bounds) {
+ // Try to get the canvas internal dimensions (which may differ from CSS size)
+ const canvasDims = await page.evaluate(() => {
+ const c = document.querySelector("canvas");
+ if (!c) return null;
+ return { width: c.width, height: c.height };
+ });
+
+ const internalW = canvasDims ? canvasDims.width : bounds.width;
+ const internalH = canvasDims ? canvasDims.height : bounds.height;
+
+ // Standard Tetris grid is 10 cols by 20 rows
+ // The canvas might include sidebars, so try to detect the grid area
+ // Heuristic: if the aspect ratio is close to 1:2, the whole canvas is the grid
+ const ratio = internalH / internalW;
+
+ let gridX = 0;
+ let gridY = 0;
+ let gridW = internalW;
+ let gridH = internalH;
+
+ if (ratio >= 1.5 && ratio <= 2.5) {
+ // Looks like the whole canvas is the grid
+ gridX = 0;
+ gridY = 0;
+ gridW = internalW;
+ gridH = internalH;
+ } else if (ratio < 1.5) {
+ // Canvas is wider than expected -- grid is probably a subset
+ // Assume grid is centered or left-aligned with 1:2 aspect ratio
+ gridW = internalH / 2;
+ gridH = internalH;
+ gridX = 0; // left-aligned by default
+ gridY = 0;
+ }
+
+ const cellWidth = gridW / 10;
+ const cellHeight = gridH / 20;
+
+ return {
+ renderer: "canvas" as RendererType,
+ gridBounds: { x: gridX, y: gridY, width: gridW, height: gridH },
+ cellWidth,
+ cellHeight,
+ };
+ }
+ }
+ } catch { /* continue */ }
+
+ // Check for DOM-based grid
+ try {
+ const domResult = await page.evaluate(() => {
+ // Look for table-based grids
+ const tables = document.querySelectorAll("table");
+ for (const table of tables) {
+ const rows = table.querySelectorAll("tr");
+ if (rows.length >= 18) {
+ // Likely a Tetris grid (might be 18-22 rows)
+ const firstRow = rows[0].querySelectorAll("td");
+ if (firstRow.length >= 8 && firstRow.length <= 12) {
+ const rect = table.getBoundingClientRect();
+ return {
+ type: "dom" as const,
+ bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
+ rows: rows.length,
+ cols: firstRow.length,
+ };
+ }
+ }
+ }
+
+ // Look for grid/flex containers
+ const containers = document.querySelectorAll(
+ '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]'
+ );
+ for (const container of containers) {
+ const children = container.children;
+ // Flat list of 200 cells (or close to it)
+ if (children.length >= 180 && children.length <= 220) {
+ const cols = 10;
+ const rows = Math.round(children.length / cols);
+ const rect = container.getBoundingClientRect();
+ return {
+ type: "dom" as const,
+ bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
+ rows,
+ cols,
+ };
+ }
+ // 20 row containers
+ if (children.length >= 18 && children.length <= 22) {
+ const firstRowCells = children[0].children;
+ if (firstRowCells.length >= 8 && firstRowCells.length <= 12) {
+ const rect = container.getBoundingClientRect();
+ return {
+ type: "dom" as const,
+ bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
+ rows: children.length,
+ cols: firstRowCells.length,
+ };
+ }
+ }
+ }
+
+ return null;
+ });
+
+ if (domResult) {
+ const cellWidth = domResult.bounds.width / domResult.cols;
+ const cellHeight = domResult.bounds.height / domResult.rows;
+ return {
+ renderer: "dom",
+ gridBounds: domResult.bounds,
+ cellWidth,
+ cellHeight,
+ };
+ }
+ } catch { /* continue */ }
+
+ // Check for SVG
+ try {
+ const svgCount = await page.locator("svg").count();
+ if (svgCount > 0) {
+ const bounds = await page.locator("svg").first().boundingBox();
+ if (bounds) {
+ return {
+ renderer: "svg",
+ gridBounds: { x: bounds.x, y: bounds.y, width: bounds.width, height: bounds.height },
+ cellWidth: bounds.width / 10,
+ cellHeight: bounds.height / 20,
+ };
+ }
+ }
+ } catch { /* continue */ }
+
+ return { renderer: "unknown", gridBounds: null, cellWidth: 0, cellHeight: 0 };
+}
+
+/**
+ * Detect which keys the game responds to for movement and rotation.
+ */
+async function detectControls(page: Page): Promise<Controls> {
+ const controls: Controls = { ...DEFAULT_CONTROLS };
+
+ // First, scan the page for control hints
+ try {
+ const pageText = await page.evaluate(() => document.body.innerText.toLowerCase());
+
+ if (pageText.includes("wasd") || pageText.includes("w,a,s,d")) {
+ controls.left = "a";
+ controls.right = "d";
+ controls.down = "s";
+ controls.rotate = "w";
+ }
+ if (/z\s*(=|:)?\s*rotate/i.test(pageText) || /rotate\s*(=|:)?\s*z/i.test(pageText)) {
+ controls.rotate = "z";
+ }
+ if (/x\s*(=|:)?\s*rotate/i.test(pageText) || /rotate\s*(=|:)?\s*x/i.test(pageText)) {
+ controls.rotate = "x";
+ }
+ } catch { /* use defaults */ }
+
+ // Verify left key works by pressing and checking for visual change
+ try {
+ const before = await page.screenshot();
+ await page.keyboard.press(controls.left);
+ await page.waitForTimeout(200);
+ const after = await page.screenshot();
+
+ if (Buffer.from(before).equals(Buffer.from(after))) {
+ // ArrowLeft didn't work, try "a"
+ await page.keyboard.press("a");
+ await page.waitForTimeout(200);
+ const afterA = await page.screenshot();
+ if (!Buffer.from(before).equals(Buffer.from(afterA))) {
+ controls.left = "a";
+ controls.right = "d";
+ controls.down = "s";
+ controls.rotate = "w";
+ }
+ }
+ } catch { /* use defaults */ }
+
+ // Verify rotate key
+ try {
+ const before = await page.screenshot();
+ await page.keyboard.press(controls.rotate);
+ await page.waitForTimeout(200);
+ const after = await page.screenshot();
+
+ if (Buffer.from(before).equals(Buffer.from(after))) {
+ // Try alternative rotate keys
+ for (const alt of ["z", "x", "ArrowUp"]) {
+ if (alt === controls.rotate) continue;
+ await page.keyboard.press(alt);
+ await page.waitForTimeout(200);
+ const afterAlt = await page.screenshot();
+ if (!Buffer.from(before).equals(Buffer.from(afterAlt))) {
+ controls.rotate = alt;
+ break;
+ }
+ }
+ }
+ } catch { /* use defaults */ }
+
+ return controls;
+}
+
+/**
+ * Find the score display element on the page.
+ */
+async function detectScoreElement(page: Page): Promise<string | null> {
+ try {
+ const selector = await page.evaluate(() => {
+ // Look for elements with "score" text nearby
+ const allElements = document.querySelectorAll("*");
+ for (const el of allElements) {
+ const text = (el as HTMLElement).innerText?.toLowerCase() || "";
+ if (text.includes("score") && el.children.length < 5) {
+ // Find the numeric part -- might be a sibling or child
+ const numChild = el.querySelector("span, div, p, td");
+ if (numChild && /^\d+$/.test(numChild.textContent?.trim() || "")) {
+ // Build a selector for this element
+ if (numChild.id) return `#${numChild.id}`;
+ if (numChild.className) {
+ const cls = numChild.className.split(" ")[0];
+ if (cls) return `.${cls}`;
+ }
+ }
+ // The element itself might contain the score
+ if (el.id) return `#${el.id}`;
+ if ((el as HTMLElement).className) {
+ const cls = (el as HTMLElement).className.split(" ")[0];
+ if (cls) return `.${cls}`;
+ }
+ }
+ }
+
+ // Fallback: look for elements that contain just a number
+ const candidates: HTMLElement[] = [];
+ for (const el of allElements) {
+ const text = (el as HTMLElement).textContent?.trim() || "";
+ if (/^\d+$/.test(text) && el.children.length === 0) {
+ candidates.push(el as HTMLElement);
+ }
+ }
+ if (candidates.length > 0) {
+ const el = candidates[0];
+ if (el.id) return `#${el.id}`;
+ if (el.className) {
+ const cls = el.className.split(" ")[0];
+ if (cls) return `.${cls}`;
+ }
+ }
+
+ return null;
+ });
+
+ return selector;
+ } catch {
+ return null;
+ }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/grid-reader.ts b/tasks/tetris/eval/gameplay-bot/grid-reader.ts
@@ -0,0 +1,267 @@
+import type { Page } from "@playwright/test";
+import type { Grid, GridBounds, CalibrationResult } from "./types";
+
+const GRID_ROWS = 20;
+const GRID_COLS = 10;
+
+/**
+ * Read the game grid state. Dispatches to canvas or DOM reader
+ * based on calibration results. Returns a 10x20 boolean matrix,
+ * or null if reading fails.
+ */
+export async function readGrid(
+ page: Page,
+ cal: CalibrationResult
+): Promise<Grid | null> {
+ try {
+ if (cal.renderer === "canvas" && cal.gridBounds) {
+ return await readCanvasGrid(page, cal.gridBounds, cal.cellWidth, cal.cellHeight, cal.backgroundColor);
+ }
+ if (cal.renderer === "dom") {
+ return await readDomGrid(page);
+ }
+ // Fallback: try canvas anyway if bounds exist
+ if (cal.gridBounds) {
+ return await readCanvasGrid(page, cal.gridBounds, cal.cellWidth, cal.cellHeight, cal.backgroundColor);
+ }
+ return null;
+ } catch {
+ return null;
+ }
+}
+
+/**
+ * Read grid from a canvas element using getImageData.
+ * Samples the center pixel of each cell and compares to the background color.
+ */
+async function readCanvasGrid(
+ page: Page,
+ bounds: GridBounds,
+ cellW: number,
+ cellH: number,
+ bgColor: [number, number, number] | null
+): Promise<Grid | null> {
+ const bgR = bgColor ? bgColor[0] : 0;
+ const bgG = bgColor ? bgColor[1] : 0;
+ const bgB = bgColor ? bgColor[2] : 0;
+ const threshold = 50; // color distance threshold
+
+ const grid = await page.evaluate(
+ ({ x, y, cellW, cellH, rows, cols, bgR, bgG, bgB, threshold }) => {
+ const canvas = document.querySelector("canvas") as HTMLCanvasElement | null;
+ if (!canvas) return null;
+ const ctx = canvas.getContext("2d");
+ if (!ctx) return null;
+
+ const result: boolean[][] = [];
+ for (let row = 0; row < rows; row++) {
+ const rowData: boolean[] = [];
+ for (let col = 0; col < cols; col++) {
+ const px = Math.floor(x + col * cellW + cellW / 2);
+ const py = Math.floor(y + row * cellH + cellH / 2);
+ const pixel = ctx.getImageData(px, py, 1, 1).data;
+ // Euclidean distance from background color
+ const dr = pixel[0] - bgR;
+ const dg = pixel[1] - bgG;
+ const db = pixel[2] - bgB;
+ const dist = Math.sqrt(dr * dr + dg * dg + db * db);
+ rowData.push(dist > threshold);
+ }
+ result.push(rowData);
+ }
+ return result;
+ },
+ { x: bounds.x, y: bounds.y, cellW, cellH, rows: GRID_ROWS, cols: GRID_COLS, bgR, bgG, bgB, threshold }
+ );
+
+ return grid;
+}
+
+/**
+ * Read grid from DOM elements. Looks for a grid-like structure and checks
+ * background colors or class names to determine filled vs empty cells.
+ */
+async function readDomGrid(page: Page): Promise<Grid | null> {
+ const grid = await page.evaluate(({ rows, cols }) => {
+ // Strategy 1: look for a table-based grid
+ const tables = document.querySelectorAll("table");
+ for (const table of tables) {
+ const trs = table.querySelectorAll("tr");
+ if (trs.length >= rows) {
+ const result: boolean[][] = [];
+ for (let r = 0; r < rows; r++) {
+ const tds = trs[r].querySelectorAll("td");
+ const rowData: boolean[] = [];
+ for (let c = 0; c < cols; c++) {
+ if (c < tds.length) {
+ const td = tds[c] as HTMLElement;
+ const style = window.getComputedStyle(td);
+ const bg = style.backgroundColor;
+ const cls = td.className.toLowerCase();
+ // Filled if it has a non-default background or a class suggesting a piece
+ const isFilled =
+ (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") ||
+ cls.includes("filled") ||
+ cls.includes("active") ||
+ cls.includes("block") ||
+ cls.includes("piece") ||
+ td.dataset.filled === "true";
+ rowData.push(isFilled);
+ } else {
+ rowData.push(false);
+ }
+ }
+ result.push(rowData);
+ }
+ return result;
+ }
+ }
+
+ // Strategy 2: look for a grid/flex container with child cells
+ const containers = document.querySelectorAll(
+ '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]'
+ );
+ for (const container of containers) {
+ const children = container.children;
+ // Could be a flat list of 200 cells (10x20) or 20 rows of 10 cells
+ if (children.length === rows * cols) {
+ const result: boolean[][] = [];
+ for (let r = 0; r < rows; r++) {
+ const rowData: boolean[] = [];
+ for (let c = 0; c < cols; c++) {
+ const cell = children[r * cols + c] as HTMLElement;
+ const style = window.getComputedStyle(cell);
+ const bg = style.backgroundColor;
+ const cls = cell.className.toLowerCase();
+ const isFilled =
+ (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") ||
+ cls.includes("filled") ||
+ cls.includes("active") ||
+ cls.includes("block") ||
+ cls.includes("piece") ||
+ cell.dataset.filled === "true";
+ rowData.push(isFilled);
+ }
+ result.push(rowData);
+ }
+ return result;
+ }
+ // Could be 20 row containers each with 10 cells
+ if (children.length === rows) {
+ let valid = true;
+ const result: boolean[][] = [];
+ for (let r = 0; r < rows; r++) {
+ const rowEl = children[r];
+ const cells = rowEl.children;
+ if (cells.length < cols) { valid = false; break; }
+ const rowData: boolean[] = [];
+ for (let c = 0; c < cols; c++) {
+ const cell = cells[c] as HTMLElement;
+ const style = window.getComputedStyle(cell);
+ const bg = style.backgroundColor;
+ const cls = cell.className.toLowerCase();
+ const isFilled =
+ (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") ||
+ cls.includes("filled") ||
+ cls.includes("active") ||
+ cls.includes("block") ||
+ cls.includes("piece") ||
+ cell.dataset.filled === "true";
+ rowData.push(isFilled);
+ }
+ result.push(rowData);
+ }
+ if (valid) return result;
+ }
+ }
+
+ return null;
+ }, { rows: GRID_ROWS, cols: GRID_COLS });
+
+ return grid;
+}
+
+/**
+ * Sample the background color from the top-left cell of an empty grid.
+ * Called during calibration before the game has pieces.
+ */
+export async function sampleBackgroundColor(
+ page: Page,
+ bounds: GridBounds,
+ cellW: number,
+ cellH: number
+): Promise<[number, number, number] | null> {
+ try {
+ const color = await page.evaluate(
+ ({ x, y, cellW, cellH }) => {
+ const canvas = document.querySelector("canvas") as HTMLCanvasElement | null;
+ if (!canvas) return null;
+ const ctx = canvas.getContext("2d");
+ if (!ctx) return null;
+ // Sample from the center of the first cell
+ const px = Math.floor(x + cellW / 2);
+ const py = Math.floor(y + cellH / 2);
+ const pixel = ctx.getImageData(px, py, 1, 1).data;
+ return [pixel[0], pixel[1], pixel[2]] as [number, number, number];
+ },
+ { x: bounds.x, y: bounds.y, cellW, cellH }
+ );
+ return color;
+ } catch {
+ return null;
+ }
+}
+
+/**
+ * Compare two grids and return true if they differ.
+ */
+export function gridsAreDifferent(a: Grid | null, b: Grid | null): boolean {
+ if (a === null || b === null) return a !== b;
+ if (a.length !== b.length) return true;
+ for (let r = 0; r < a.length; r++) {
+ if (a[r].length !== b[r].length) return true;
+ for (let c = 0; c < a[r].length; c++) {
+ if (a[r][c] !== b[r][c]) return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * Count the number of filled cells in the bottom N rows of the grid.
+ */
+export function countFilledInBottomRows(grid: Grid, rows: number): number {
+ let count = 0;
+ const startRow = Math.max(0, grid.length - rows);
+ for (let r = startRow; r < grid.length; r++) {
+ for (let c = 0; c < grid[r].length; c++) {
+ if (grid[r][c]) count++;
+ }
+ }
+ return count;
+}
+
+/**
+ * Count total filled cells in the grid.
+ */
+export function countFilled(grid: Grid): number {
+ let count = 0;
+ for (const row of grid) {
+ for (const cell of row) {
+ if (cell) count++;
+ }
+ }
+ return count;
+}
+
+/**
+ * Check if there are filled cells in the top few rows (near game over).
+ */
+export function hasFilledInTopRows(grid: Grid, rows: number): boolean {
+ for (let r = 0; r < Math.min(rows, grid.length); r++) {
+ for (let c = 0; c < grid[r].length; c++) {
+ if (grid[r][c]) return true;
+ }
+ }
+ return false;
+}
diff --git a/tasks/tetris/eval/gameplay-bot/index.ts b/tasks/tetris/eval/gameplay-bot/index.ts
@@ -0,0 +1,151 @@
+import { test } from "@playwright/test";
+import { execSync, spawn, type ChildProcess } from "node:child_process";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import * as net from "node:net";
+import type { BotReport } from "./types";
+import { runAllTests } from "./tests";
+
+/**
+ * Find an available port by briefly binding to port 0.
+ */
+async function findFreePort(): Promise<number> {
+ return new Promise((resolve, reject) => {
+ const server = net.createServer();
+ server.listen(0, () => {
+ const addr = server.address();
+ if (addr && typeof addr === "object") {
+ const port = addr.port;
+ server.close(() => resolve(port));
+ } else {
+ server.close(() => reject(new Error("could not determine port")));
+ }
+ });
+ server.on("error", reject);
+ });
+}
+
+/**
+ * Start a simple HTTP server to serve workspace files.
+ * Tries `npx serve` first, then `python3 -m http.server`.
+ */
+async function startServer(workspacePath: string, port: number): Promise<ChildProcess> {
+ let serverProc: ChildProcess;
+
+ // Try npx serve first
+ try {
+ execSync("npx serve --version", { stdio: "ignore", timeout: 5000 });
+ serverProc = spawn("npx", ["serve", "-l", String(port), "-s", "--no-clipboard"], {
+ cwd: workspacePath,
+ stdio: "ignore",
+ });
+ } catch {
+ // Fallback to python
+ serverProc = spawn("python3", ["-m", "http.server", String(port)], {
+ cwd: workspacePath,
+ stdio: "ignore",
+ });
+ }
+
+ // Wait for the server to be ready
+ const maxWait = 10000;
+ const start = Date.now();
+ while (Date.now() - start < maxWait) {
+ try {
+ await new Promise<void>((resolve, reject) => {
+ const socket = net.createConnection({ port, host: "127.0.0.1" }, () => {
+ socket.destroy();
+ resolve();
+ });
+ socket.on("error", reject);
+ socket.setTimeout(500, () => {
+ socket.destroy();
+ reject(new Error("timeout"));
+ });
+ });
+ return serverProc;
+ } catch {
+ await new Promise((r) => setTimeout(r, 200));
+ }
+ }
+
+ throw new Error(`server did not start on port ${port} within ${maxWait}ms`);
+}
+
+test.describe("Tetris Gameplay Bot", () => {
+ let serverProc: ChildProcess | null = null;
+ let serverUrl: string;
+
+ test.beforeAll(async () => {
+ const workspacePath =
+ process.env.WORKSPACE_PATH || process.env.TETRIS_WORKSPACE || process.cwd();
+ const port = await findFreePort();
+ serverProc = await startServer(workspacePath, port);
+ serverUrl = `http://127.0.0.1:${port}`;
+ });
+
+ test.afterAll(async () => {
+ if (serverProc) {
+ serverProc.kill("SIGTERM");
+ serverProc = null;
+ }
+ });
+
+ test("run gameplay bot", async ({ page }) => {
+ test.setTimeout(120_000); // 2-minute total timeout
+
+ const { testResults, calibration, gameplay } = await runAllTests(page, serverUrl);
+
+ const passed = testResults.filter((t) => t.pass).length;
+ const failed = testResults.filter((t) => !t.pass).length;
+ const total = testResults.length;
+
+ const report: BotReport = {
+ implementation: {
+ renderer: calibration.renderer,
+ grid_detected: calibration.gridDetected,
+ grid_bounds: calibration.gridBounds,
+ controls: calibration.controls as unknown as Record<string, string>,
+ start_mechanism: calibration.startMechanism,
+ score_element_found: calibration.scoreElementSelector !== null,
+ },
+ tests: testResults.map((t) => ({ name: t.name, pass: t.pass, detail: t.detail })),
+ summary: {
+ total,
+ passed,
+ failed,
+ score: total > 0 ? Math.round((passed / total) * 100) / 100 : 0,
+ },
+ gameplay,
+ };
+
+ // Write report to file
+ const reportPath =
+ process.env.REPORT_OUTPUT_PATH ||
+ path.join(process.cwd(), "gameplay-bot-report.json");
+
+ // Ensure output directory exists
+ const reportDir = path.dirname(reportPath);
+ if (!fs.existsSync(reportDir)) {
+ fs.mkdirSync(reportDir, { recursive: true });
+ }
+
+ fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), "utf-8");
+
+ // Log summary to console for visibility
+ console.log("\n=== Gameplay Bot Report ===");
+ console.log(`Renderer: ${calibration.renderer}`);
+ console.log(`Grid detected: ${calibration.gridDetected}`);
+ console.log(`Start mechanism: ${calibration.startMechanism}`);
+ console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`);
+ console.log(`\nTests: ${passed}/${total} passed`);
+ for (const t of testResults) {
+ console.log(` ${t.pass ? "PASS" : "FAIL"} ${t.name}: ${t.detail}`);
+ }
+ console.log(`\nGameplay: ${gameplay.pieces_placed} pieces, ${gameplay.lines_cleared} lines`);
+ console.log(`Report written to: ${reportPath}`);
+ console.log("===========================\n");
+
+ // Always pass the Playwright test -- results are in the report
+ });
+});
diff --git a/tasks/tetris/eval/gameplay-bot/player.ts b/tasks/tetris/eval/gameplay-bot/player.ts
@@ -0,0 +1,417 @@
+import type { Page } from "@playwright/test";
+import type { Grid, CalibrationResult } from "./types";
+import { readGrid } from "./grid-reader";
+
+// Heuristic weights from the spec
+const W_HEIGHT = -0.510066;
+const W_LINES = 0.760666;
+const W_HOLES = -0.35663;
+const W_BUMPINESS = -0.184483;
+
+const GRID_ROWS = 20;
+const GRID_COLS = 10;
+
+/** The moves needed to place a piece. */
+interface Placement {
+ rotations: number;
+ column: number; // target column for leftmost cell of piece
+ score: number;
+}
+
+/**
+ * Play the game for a specified duration or number of pieces using the
+ * 4-heuristic algorithm. Falls back to random input if grid reading fails.
+ *
+ * Returns the number of pieces placed and lines cleared.
+ */
+export async function playGame(
+ page: Page,
+ cal: CalibrationResult,
+ options: { maxPieces?: number; maxDurationMs?: number }
+): Promise<{ piecesPlaced: number; linesCleared: number; errors: number }> {
+ const maxPieces = options.maxPieces ?? 100;
+ const maxDuration = options.maxDurationMs ?? 30000;
+ const start = Date.now();
+ let piecesPlaced = 0;
+ let linesCleared = 0;
+ let errors = 0;
+ let consecutiveFailures = 0;
+
+ while (piecesPlaced < maxPieces && Date.now() - start < maxDuration) {
+ try {
+ const grid = await readGrid(page, cal);
+
+ if (!grid) {
+ // Fallback: random inputs
+ await playRandomMove(page, cal);
+ piecesPlaced++;
+ consecutiveFailures++;
+ if (consecutiveFailures > 5) {
+ // Grid reading is not working, just do random play for remaining time
+ await playRandomForDuration(page, cal, maxDuration - (Date.now() - start));
+ piecesPlaced += 5;
+ break;
+ }
+ continue;
+ }
+ consecutiveFailures = 0;
+
+ // Count filled cells before the move
+ const filledBefore = countTotalFilled(grid);
+
+ // Find the best placement
+ const placement = findBestPlacement(grid);
+
+ if (placement) {
+ await executePlacement(page, cal, placement);
+ linesCleared += placement.linesCleared ?? 0;
+ } else {
+ // Can't find a good placement, just hard drop
+ await page.keyboard.press(cal.controls.drop);
+ }
+
+ piecesPlaced++;
+
+ // Brief wait for the game to settle
+ await page.waitForTimeout(150);
+
+ // Check if lines were cleared by comparing filled cells
+ const gridAfter = await readGrid(page, cal);
+ if (gridAfter) {
+ const filledAfter = countTotalFilled(gridAfter);
+ // If we placed a piece (added ~4 cells) but total filled went down,
+ // some lines were cleared
+ if (filledAfter < filledBefore) {
+ const possibleClears = Math.round((filledBefore + 4 - filledAfter) / GRID_COLS);
+ if (possibleClears > 0) {
+ linesCleared += possibleClears;
+ }
+ }
+ }
+ } catch {
+ errors++;
+ // Don't crash -- try to keep playing
+ await playRandomMove(page, cal);
+ piecesPlaced++;
+ }
+ }
+
+ return { piecesPlaced, linesCleared, errors };
+}
+
+/**
+ * Execute a single hard drop (for tests that just need to drop a piece).
+ */
+export async function hardDrop(page: Page, cal: CalibrationResult): Promise<void> {
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(200);
+}
+
+/**
+ * Execute a placement: rotate, move to column, then hard drop.
+ */
+async function executePlacement(
+ page: Page,
+ cal: CalibrationResult,
+ placement: Placement
+): Promise<void> {
+ // Rotate
+ for (let i = 0; i < placement.rotations; i++) {
+ await page.keyboard.press(cal.controls.rotate);
+ await page.waitForTimeout(50);
+ }
+
+ // Move to target column
+ // Assume piece spawns at roughly column 3-5 (center)
+ const spawnCol = 4;
+ const diff = placement.column - spawnCol;
+
+ if (diff < 0) {
+ for (let i = 0; i < Math.abs(diff); i++) {
+ await page.keyboard.press(cal.controls.left);
+ await page.waitForTimeout(30);
+ }
+ } else if (diff > 0) {
+ for (let i = 0; i < diff; i++) {
+ await page.keyboard.press(cal.controls.right);
+ await page.waitForTimeout(30);
+ }
+ }
+
+ // Hard drop
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(100);
+}
+
+/**
+ * Play a random move (fallback when grid reading fails).
+ */
+async function playRandomMove(page: Page, cal: CalibrationResult): Promise<void> {
+ const moves = [cal.controls.left, cal.controls.right, cal.controls.rotate, cal.controls.down];
+ const randomMoves = Math.floor(Math.random() * 4) + 1;
+ for (let i = 0; i < randomMoves; i++) {
+ const key = moves[Math.floor(Math.random() * moves.length)];
+ await page.keyboard.press(key);
+ await page.waitForTimeout(50);
+ }
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(100);
+}
+
+/**
+ * Play randomly for a set duration (when grid reading is broken).
+ */
+async function playRandomForDuration(
+ page: Page,
+ cal: CalibrationResult,
+ durationMs: number
+): Promise<void> {
+ const start = Date.now();
+ const moves = [cal.controls.left, cal.controls.right, cal.controls.rotate, cal.controls.down, cal.controls.drop];
+
+ while (Date.now() - start < durationMs) {
+ const key = moves[Math.floor(Math.random() * moves.length)];
+ await page.keyboard.press(key);
+ await page.waitForTimeout(100);
+ }
+}
+
+/**
+ * Try to fill a specific row by placing pieces strategically.
+ * Uses repeated hard drops at different columns to build up the bottom row.
+ */
+export async function tryFillRow(
+ page: Page,
+ cal: CalibrationResult,
+ maxAttempts: number
+): Promise<boolean> {
+ // Strategy: move piece to each column left to right and hard drop
+ // This won't guarantee a line clear but maximizes the chance
+ const columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+ let attempts = 0;
+
+ for (const targetCol of columns) {
+ if (attempts >= maxAttempts) break;
+
+ // Move to far left first
+ for (let i = 0; i < 6; i++) {
+ await page.keyboard.press(cal.controls.left);
+ await page.waitForTimeout(30);
+ }
+
+ // Then move right to target column
+ for (let i = 0; i < targetCol; i++) {
+ await page.keyboard.press(cal.controls.right);
+ await page.waitForTimeout(30);
+ }
+
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(200);
+ attempts++;
+ }
+
+ // Check if a line was cleared
+ const grid = await readGrid(page, cal);
+ if (!grid) return false;
+
+ // If bottom row is now empty after being full, a line was cleared
+ const bottomFilled = grid[GRID_ROWS - 1].filter(Boolean).length;
+ // Heuristic: if bottom row is less full than expected after 10 pieces, lines probably cleared
+ return bottomFilled < 8;
+}
+
+/**
+ * Quickly stack pieces to reach game over.
+ */
+export async function stackToGameOver(
+ page: Page,
+ cal: CalibrationResult,
+ maxAttempts: number
+): Promise<boolean> {
+ // Strategy: hard drop in the same column repeatedly to build a tower
+ for (let i = 0; i < maxAttempts; i++) {
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(150);
+ }
+
+ // Check if the game appears to have stopped
+ const shot1 = await page.screenshot();
+ await page.waitForTimeout(1000);
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(500);
+ const shot2 = await page.screenshot();
+
+ // If nothing changed despite input, game is likely over
+ // (or check for game-over text)
+ const screenshotsSame = Buffer.from(shot1).equals(Buffer.from(shot2));
+
+ const hasGameOverText = await page.evaluate(() => {
+ const text = document.body.innerText.toLowerCase();
+ return (
+ text.includes("game over") ||
+ text.includes("gameover") ||
+ text.includes("you lose") ||
+ text.includes("try again") ||
+ text.includes("restart") ||
+ text.includes("play again")
+ );
+ });
+
+ return screenshotsSame || hasGameOverText;
+}
+
+// --- Heuristic evaluation functions ---
+
+interface PlacementWithLines extends Placement {
+ linesCleared?: number;
+}
+
+/**
+ * Find the best column and rotation for the current piece using the
+ * 4-heuristic scoring function.
+ */
+function findBestPlacement(grid: Grid): PlacementWithLines | null {
+ let bestScore = -Infinity;
+ let bestPlacement: PlacementWithLines | null = null;
+
+ // Try each rotation (0-3) and each column position (0-9)
+ for (let rotations = 0; rotations < 4; rotations++) {
+ for (let col = 0; col < GRID_COLS; col++) {
+ // Simulate placing a simple piece (we don't know the exact piece,
+ // so we simulate a 1-wide vertical drop at this column)
+ const simGrid = simulateDrop(grid, col);
+ if (!simGrid) continue;
+
+ const { cleared, board } = clearLines(simGrid);
+ const score =
+ W_HEIGHT * aggregateHeight(board) +
+ W_LINES * cleared +
+ W_HOLES * countHoles(board) +
+ W_BUMPINESS * bumpiness(board);
+
+ if (score > bestScore) {
+ bestScore = score;
+ bestPlacement = { rotations, column: col, score, linesCleared: cleared };
+ }
+ }
+ }
+
+ return bestPlacement;
+}
+
+/**
+ * Simulate dropping a single cell at the given column (simplified --
+ * we don't know the actual piece shape without more complex detection).
+ */
+function simulateDrop(grid: Grid, col: number): Grid | null {
+ if (col < 0 || col >= GRID_COLS) return null;
+
+ // Find the lowest empty row in this column
+ let landRow = -1;
+ for (let r = GRID_ROWS - 1; r >= 0; r--) {
+ if (!grid[r][col]) {
+ landRow = r;
+ break;
+ }
+ }
+ if (landRow < 0) return null;
+
+ // Clone the grid and place the piece
+ const newGrid: Grid = grid.map((row) => [...row]);
+ newGrid[landRow][col] = true;
+
+ return newGrid;
+}
+
+/**
+ * Clear completed lines and return the count + new board.
+ */
+function clearLines(grid: Grid): { cleared: number; board: Grid } {
+ const remaining: boolean[][] = [];
+ let cleared = 0;
+
+ for (const row of grid) {
+ if (row.every(Boolean)) {
+ cleared++;
+ } else {
+ remaining.push([...row]);
+ }
+ }
+
+ // Add empty rows at the top
+ while (remaining.length < GRID_ROWS) {
+ remaining.unshift(new Array(GRID_COLS).fill(false));
+ }
+
+ return { cleared, board: remaining };
+}
+
+/**
+ * Sum of column heights (distance from top to highest filled cell per column).
+ */
+function aggregateHeight(grid: Grid): number {
+ let total = 0;
+ for (let col = 0; col < GRID_COLS; col++) {
+ for (let row = 0; row < GRID_ROWS; row++) {
+ if (grid[row][col]) {
+ total += GRID_ROWS - row;
+ break;
+ }
+ }
+ }
+ return total;
+}
+
+/**
+ * Count holes (empty cells with a filled cell above them in the same column).
+ */
+function countHoles(grid: Grid): number {
+ let holes = 0;
+ for (let col = 0; col < GRID_COLS; col++) {
+ let blockFound = false;
+ for (let row = 0; row < GRID_ROWS; row++) {
+ if (grid[row][col]) {
+ blockFound = true;
+ } else if (blockFound) {
+ holes++;
+ }
+ }
+ }
+ return holes;
+}
+
+/**
+ * Sum of absolute height differences between adjacent columns.
+ */
+function bumpiness(grid: Grid): number {
+ const heights: number[] = [];
+ for (let col = 0; col < GRID_COLS; col++) {
+ let h = 0;
+ for (let row = 0; row < GRID_ROWS; row++) {
+ if (grid[row][col]) {
+ h = GRID_ROWS - row;
+ break;
+ }
+ }
+ heights.push(h);
+ }
+
+ let bump = 0;
+ for (let i = 0; i < heights.length - 1; i++) {
+ bump += Math.abs(heights[i] - heights[i + 1]);
+ }
+ return bump;
+}
+
+/**
+ * Count total filled cells in the grid.
+ */
+function countTotalFilled(grid: Grid): number {
+ let count = 0;
+ for (const row of grid) {
+ for (const cell of row) {
+ if (cell) count++;
+ }
+ }
+ return count;
+}
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -0,0 +1,703 @@
+import type { Page } from "@playwright/test";
+import type { TestResult, CalibrationResult, GameplayStats } from "./types";
+import { readGrid, gridsAreDifferent, countFilled, countFilledInBottomRows, hasFilledInTopRows } from "./grid-reader";
+import { hardDrop, playGame, tryFillRow, stackToGameOver } from "./player";
+import { calibrate } from "./calibrate";
+
+/**
+ * Run all 15 tests sequentially. Each test has its own try/catch
+ * so one failure never stops the others.
+ *
+ * Returns the test results and the calibration result (which may have
+ * been updated during testing).
+ */
+export async function runAllTests(
+ page: Page,
+ serverUrl: string
+): Promise<{
+ testResults: TestResult[];
+ calibration: CalibrationResult;
+ gameplay: GameplayStats;
+}> {
+ const testResults: TestResult[] = [];
+ const gameplay: GameplayStats = {
+ pieces_placed: 0,
+ lines_cleared: 0,
+ max_score_observed: 0,
+ play_duration_seconds: 0,
+ errors_during_play: 0,
+ };
+
+ // Collect console errors across the entire session
+ const consoleErrors: string[] = [];
+ page.on("pageerror", (err) => consoleErrors.push(err.message));
+
+ // ---- Test 1: Game loads ----
+ let pageLoaded = false;
+ try {
+ const result = await testGameLoads(page, serverUrl, consoleErrors);
+ testResults.push(result);
+ pageLoaded = result.pass;
+ } catch (err) {
+ testResults.push({
+ name: "game_loads",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // If the page didn't load at all, fail everything and return
+ if (!pageLoaded) {
+ const remainingTests = [
+ "game_starts", "auto_drop", "move_left", "move_right", "move_down",
+ "rotate", "hard_drop", "piece_locks", "new_piece_spawns",
+ "multiple_pieces", "line_clear", "score_changes", "game_over",
+ "playable_30s",
+ ];
+ for (const name of remainingTests) {
+ testResults.push({ name, pass: false, detail: "skipped: page did not load" });
+ }
+ return {
+ testResults,
+ calibration: emptyCalibration(consoleErrors),
+ gameplay,
+ };
+ }
+
+ // ---- Test 2: Game starts ----
+ let cal: CalibrationResult;
+ try {
+ cal = await calibrate(page);
+ const started = cal.startMechanism !== "unknown";
+ testResults.push({
+ name: "game_starts",
+ pass: started,
+ detail: started
+ ? `started via ${cal.startMechanism}`
+ : "could not start game with any mechanism",
+ });
+ } catch (err) {
+ cal = emptyCalibration(consoleErrors);
+ testResults.push({
+ name: "game_starts",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // Merge console errors from calibration
+ for (const e of cal.consoleErrors) {
+ if (!consoleErrors.includes(e)) consoleErrors.push(e);
+ }
+
+ // ---- Test 3: Auto-drop ----
+ try {
+ const result = await testAutoDrop(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "auto_drop",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 4: Move left ----
+ try {
+ const result = await testMoveDirection(page, cal, "left");
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "move_left",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 5: Move right ----
+ try {
+ const result = await testMoveDirection(page, cal, "right");
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "move_right",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 6: Move down ----
+ try {
+ const result = await testMoveDirection(page, cal, "down");
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "move_down",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 7: Rotate ----
+ try {
+ const result = await testRotate(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "rotate",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 8: Hard drop ----
+ try {
+ const result = await testHardDrop(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "hard_drop",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 9: Piece locks ----
+ try {
+ const result = await testPieceLocks(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "piece_locks",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 10: New piece spawns ----
+ try {
+ const result = await testNewPieceSpawns(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "new_piece_spawns",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 11: Multiple pieces ----
+ try {
+ const result = await testMultiplePieces(page, cal, gameplay);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "multiple_pieces",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // We need a fresh game for line clear and game over tests
+ // Reload the page and re-calibrate
+ try {
+ await loadGamePage(page, serverUrl);
+ cal = await calibrate(page);
+ } catch {
+ // If reload fails, continue with existing state
+ }
+
+ // ---- Test 12: Line clear ----
+ try {
+ const result = await testLineClear(page, cal, gameplay);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "line_clear",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // ---- Test 13: Score changes ----
+ try {
+ const result = await testScoreChanges(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "score_changes",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // Reload for game over test
+ try {
+ await loadGamePage(page, serverUrl);
+ cal = await calibrate(page);
+ } catch {
+ // continue with existing state
+ }
+
+ // ---- Test 14: Game over ----
+ try {
+ const result = await testGameOver(page, cal);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "game_over",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // Reload for 30s play test
+ try {
+ await loadGamePage(page, serverUrl);
+ cal = await calibrate(page);
+ } catch {
+ // continue
+ }
+
+ // ---- Test 15: Playable for 30 seconds ----
+ try {
+ const result = await testPlayable30s(page, cal, gameplay, consoleErrors);
+ testResults.push(result);
+ } catch (err) {
+ testResults.push({
+ name: "playable_30s",
+ pass: false,
+ detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
+ });
+ }
+
+ // Read final score
+ try {
+ if (cal.scoreElementSelector) {
+ const scoreText = await page.textContent(cal.scoreElementSelector);
+ const score = parseInt(scoreText?.replace(/\D/g, "") || "0", 10);
+ if (score > gameplay.max_score_observed) {
+ gameplay.max_score_observed = score;
+ }
+ }
+ } catch { /* ignore */ }
+
+ return { testResults, calibration: cal, gameplay };
+}
+
+// ---- Individual test implementations ----
+
+async function testGameLoads(
+ page: Page,
+ serverUrl: string,
+ consoleErrors: string[]
+): Promise<TestResult> {
+ const errorsBefore = consoleErrors.length;
+
+ await loadGamePage(page, serverUrl);
+ await page.waitForTimeout(3000);
+
+ const newErrors = consoleErrors.slice(errorsBefore);
+ if (newErrors.length === 0) {
+ return { name: "game_loads", pass: true, detail: "no console errors" };
+ }
+ return {
+ name: "game_loads",
+ pass: false,
+ detail: `${newErrors.length} console error(s): ${newErrors[0]}`,
+ };
+}
+
+async function testAutoDrop(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ // Use screenshot comparison: wait 5 seconds with no input
+ const grid1 = await readGrid(page, cal);
+ const shot1 = await page.screenshot();
+ await page.waitForTimeout(5000);
+ const grid2 = await readGrid(page, cal);
+ const shot2 = await page.screenshot();
+
+ // Check grid difference first, fall back to screenshot diff
+ if (grid1 && grid2 && gridsAreDifferent(grid1, grid2)) {
+ return { name: "auto_drop", pass: true, detail: "grid state changed after 5s with no input" };
+ }
+ if (!Buffer.from(shot1).equals(Buffer.from(shot2))) {
+ return { name: "auto_drop", pass: true, detail: "pixels changed after 5s with no input" };
+ }
+ return { name: "auto_drop", pass: false, detail: "piece did not move in 5 seconds" };
+}
+
+async function testMoveDirection(
+ page: Page,
+ cal: CalibrationResult,
+ direction: "left" | "right" | "down"
+): Promise<TestResult> {
+ const keyMap = {
+ left: cal.controls.left,
+ right: cal.controls.right,
+ down: cal.controls.down,
+ };
+
+ const shotBefore = await page.screenshot();
+ const gridBefore = await readGrid(page, cal);
+
+ await page.keyboard.press(keyMap[direction]);
+ await page.waitForTimeout(300);
+
+ const shotAfter = await page.screenshot();
+ const gridAfter = await readGrid(page, cal);
+
+ const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter);
+ const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter));
+
+ if (gridChanged || pixelsChanged) {
+ return { name: `move_${direction}`, pass: true, detail: "grid state changed after key press" };
+ }
+ return { name: `move_${direction}`, pass: false, detail: "no change detected after key press" };
+}
+
+async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ const shotBefore = await page.screenshot();
+ const gridBefore = await readGrid(page, cal);
+
+ await page.keyboard.press(cal.controls.rotate);
+ await page.waitForTimeout(300);
+
+ const shotAfter = await page.screenshot();
+ const gridAfter = await readGrid(page, cal);
+
+ const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter);
+ const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter));
+
+ if (gridChanged || pixelsChanged) {
+ return { name: "rotate", pass: true, detail: "piece shape changed after rotate key" };
+ }
+ return { name: "rotate", pass: false, detail: "no change detected after rotate key" };
+}
+
+async function testHardDrop(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ const gridBefore = await readGrid(page, cal);
+ const shotBefore = await page.screenshot();
+
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(500);
+
+ const gridAfter = await readGrid(page, cal);
+ const shotAfter = await page.screenshot();
+
+ // After hard drop, there should be filled cells at the bottom
+ // and the grid should have changed
+ const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter);
+ const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter));
+ const hasBottomCells = gridAfter ? countFilledInBottomRows(gridAfter, 5) > 0 : false;
+
+ if ((gridChanged || pixelsChanged) && (hasBottomCells || !gridAfter)) {
+ return { name: "hard_drop", pass: true, detail: "piece immediately dropped and new piece appeared" };
+ }
+ if (pixelsChanged) {
+ return { name: "hard_drop", pass: true, detail: "visual change detected after hard drop" };
+ }
+ return { name: "hard_drop", pass: false, detail: "no change detected after hard drop key" };
+}
+
+async function testPieceLocks(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ // Wait for auto-drop to bring a piece to the bottom (~15 seconds)
+ // First, hard drop to establish a baseline
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(500);
+
+ const gridAfterDrop = await readGrid(page, cal);
+ if (gridAfterDrop) {
+ const bottomFilled = countFilledInBottomRows(gridAfterDrop, 4);
+ if (bottomFilled > 0) {
+ // Verify persistence: wait and check again
+ await page.waitForTimeout(2000);
+ const gridLater = await readGrid(page, cal);
+ if (gridLater) {
+ const bottomFilledLater = countFilledInBottomRows(gridLater, 4);
+ if (bottomFilledLater >= bottomFilled) {
+ return { name: "piece_locks", pass: true, detail: "filled cells persist at bottom" };
+ }
+ }
+ return { name: "piece_locks", pass: true, detail: "filled cells detected at bottom after drop" };
+ }
+ }
+
+ // Fallback: wait for auto-drop
+ await page.waitForTimeout(15000);
+ const gridAfterWait = await readGrid(page, cal);
+ if (gridAfterWait) {
+ const bottomFilled = countFilledInBottomRows(gridAfterWait, 4);
+ if (bottomFilled > 0) {
+ return { name: "piece_locks", pass: true, detail: "piece locked at bottom via auto-drop" };
+ }
+ }
+
+ // Screenshot-based fallback
+ const shot1 = await page.screenshot();
+ await page.waitForTimeout(2000);
+ const shot2 = await page.screenshot();
+ // If screenshots are stable, something probably locked
+ return {
+ name: "piece_locks",
+ pass: false,
+ detail: "could not verify piece locking at bottom",
+ };
+}
+
+async function testNewPieceSpawns(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ // After a piece locks (previous test did a hard drop), check for a piece at the top
+ const grid = await readGrid(page, cal);
+ if (grid) {
+ const topHasFilled = hasFilledInTopRows(grid, 4);
+ if (topHasFilled) {
+ return { name: "new_piece_spawns", pass: true, detail: "new piece detected at top of grid" };
+ }
+
+ // Wait a moment for the new piece to appear
+ await page.waitForTimeout(1000);
+ const grid2 = await readGrid(page, cal);
+ if (grid2 && hasFilledInTopRows(grid2, 4)) {
+ return { name: "new_piece_spawns", pass: true, detail: "new piece appeared at top after delay" };
+ }
+ }
+
+ // Drop another piece and check
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(500);
+ const gridAfter = await readGrid(page, cal);
+ if (gridAfter && hasFilledInTopRows(gridAfter, 4)) {
+ return { name: "new_piece_spawns", pass: true, detail: "new piece detected after drop" };
+ }
+
+ // Screenshot fallback
+ const shot1 = await page.screenshot();
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(500);
+ const shot2 = await page.screenshot();
+ if (!Buffer.from(shot1).equals(Buffer.from(shot2))) {
+ return { name: "new_piece_spawns", pass: true, detail: "visual change suggests new piece spawned" };
+ }
+
+ return { name: "new_piece_spawns", pass: false, detail: "could not detect new piece at top" };
+}
+
+async function testMultiplePieces(
+ page: Page,
+ cal: CalibrationResult,
+ gameplay: GameplayStats
+): Promise<TestResult> {
+ const gridBefore = await readGrid(page, cal);
+ const filledBefore = gridBefore ? countFilled(gridBefore) : 0;
+
+ // Hard drop 10 pieces
+ for (let i = 0; i < 10; i++) {
+ await hardDrop(page, cal);
+ await page.waitForTimeout(300);
+ }
+ gameplay.pieces_placed += 10;
+
+ const gridAfter = await readGrid(page, cal);
+ if (gridAfter) {
+ const filledAfter = countFilled(gridAfter);
+ if (filledAfter > filledBefore) {
+ return {
+ name: "multiple_pieces",
+ pass: true,
+ detail: `grid accumulated cells: ${filledBefore} -> ${filledAfter}`,
+ };
+ }
+ }
+
+ // Screenshot fallback: if the game is still responding to drops, it's working
+ const shotA = await page.screenshot();
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(300);
+ const shotB = await page.screenshot();
+ if (!Buffer.from(shotA).equals(Buffer.from(shotB))) {
+ return { name: "multiple_pieces", pass: true, detail: "game still responding after 10 piece drops" };
+ }
+
+ return { name: "multiple_pieces", pass: false, detail: "grid did not accumulate filled cells" };
+}
+
+async function testLineClear(
+ page: Page,
+ cal: CalibrationResult,
+ gameplay: GameplayStats
+): Promise<TestResult> {
+ // Strategy: fill a row by placing pieces across the bottom
+ const gridBefore = await readGrid(page, cal);
+ const filledBefore = gridBefore ? countFilled(gridBefore) : 0;
+
+ // Play strategically using the AI to try to clear lines
+ const result = await playGame(page, cal, { maxPieces: 30, maxDurationMs: 20000 });
+ gameplay.pieces_placed += result.piecesPlaced;
+ gameplay.errors_during_play += result.errors;
+
+ if (result.linesCleared > 0) {
+ gameplay.lines_cleared += result.linesCleared;
+ return {
+ name: "line_clear",
+ pass: true,
+ detail: `${result.linesCleared} line(s) cleared during AI play`,
+ };
+ }
+
+ // Try the brute-force row-fill approach
+ const cleared = await tryFillRow(page, cal, 10);
+ gameplay.pieces_placed += 10;
+ if (cleared) {
+ gameplay.lines_cleared += 1;
+ return { name: "line_clear", pass: true, detail: "line cleared via strategic placement" };
+ }
+
+ // Check if total filled decreased (which would indicate clearing happened)
+ const gridAfter = await readGrid(page, cal);
+ const filledAfter = gridAfter ? countFilled(gridAfter) : 0;
+ if (filledAfter < filledBefore && filledBefore > 0) {
+ return { name: "line_clear", pass: true, detail: "total filled cells decreased, indicating line clear" };
+ }
+
+ return { name: "line_clear", pass: false, detail: "could not trigger or detect a line clear" };
+}
+
+async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ if (!cal.scoreElementSelector) {
+ // Try to find any number on the page that changes
+ const textBefore = await page.evaluate(() => document.body.innerText);
+ const numbersBefore = (textBefore.match(/\d+/g) || []).map(Number);
+
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(500);
+
+ const textAfter = await page.evaluate(() => document.body.innerText);
+ const numbersAfter = (textAfter.match(/\d+/g) || []).map(Number);
+
+ // Check if any number increased
+ for (let i = 0; i < Math.min(numbersBefore.length, numbersAfter.length); i++) {
+ if (numbersAfter[i] > numbersBefore[i]) {
+ return { name: "score_changes", pass: true, detail: "a number on the page increased after play" };
+ }
+ }
+
+ return { name: "score_changes", pass: false, detail: "no score element found and no number changed" };
+ }
+
+ try {
+ const scoreBefore = await page.textContent(cal.scoreElementSelector);
+ const numBefore = parseInt(scoreBefore?.replace(/\D/g, "") || "0", 10);
+
+ // Play a bit to change the score
+ for (let i = 0; i < 5; i++) {
+ await page.keyboard.press(cal.controls.drop);
+ await page.waitForTimeout(300);
+ }
+
+ const scoreAfter = await page.textContent(cal.scoreElementSelector);
+ const numAfter = parseInt(scoreAfter?.replace(/\D/g, "") || "0", 10);
+
+ if (numAfter > numBefore) {
+ return {
+ name: "score_changes",
+ pass: true,
+ detail: `score changed from ${numBefore} to ${numAfter}`,
+ };
+ }
+ return {
+ name: "score_changes",
+ pass: false,
+ detail: `score did not increase: ${numBefore} -> ${numAfter}`,
+ };
+ } catch {
+ return { name: "score_changes", pass: false, detail: "could not read score element" };
+ }
+}
+
+async function testGameOver(page: Page, cal: CalibrationResult): Promise<TestResult> {
+ const isOver = await stackToGameOver(page, cal, 40);
+ if (isOver) {
+ return { name: "game_over", pass: true, detail: "game stopped after stacking to top" };
+ }
+ return { name: "game_over", pass: false, detail: "could not trigger or detect game over" };
+}
+
+async function testPlayable30s(
+ page: Page,
+ cal: CalibrationResult,
+ gameplay: GameplayStats,
+ consoleErrors: string[]
+): Promise<TestResult> {
+ const errorsBefore = consoleErrors.length;
+ const start = Date.now();
+
+ const result = await playGame(page, cal, { maxDurationMs: 30000 });
+
+ const elapsed = Math.round((Date.now() - start) / 1000);
+ gameplay.pieces_placed += result.piecesPlaced;
+ gameplay.lines_cleared += result.linesCleared;
+ gameplay.play_duration_seconds += elapsed;
+ gameplay.errors_during_play += result.errors;
+
+ const newErrors = consoleErrors.slice(errorsBefore);
+ const crashed = newErrors.length > 0 || result.errors > 3;
+
+ if (!crashed) {
+ return {
+ name: "playable_30s",
+ pass: true,
+ detail: `played for ${elapsed}s, placed ${result.piecesPlaced} pieces, no crashes`,
+ };
+ }
+ return {
+ name: "playable_30s",
+ pass: false,
+ detail: `${newErrors.length} console errors, ${result.errors} play errors during ${elapsed}s`,
+ };
+}
+
+// ---- Helpers ----
+
+async function loadGamePage(page: Page, serverUrl: string): Promise<void> {
+ const candidates = [
+ "index.html",
+ "dist/index.html",
+ "public/index.html",
+ "build/index.html",
+ ];
+
+ for (const candidate of candidates) {
+ try {
+ const response = await page.goto(`${serverUrl}/${candidate}`, {
+ timeout: 5000,
+ waitUntil: "domcontentloaded",
+ });
+ if (response && response.ok()) return;
+ } catch {
+ continue;
+ }
+ }
+
+ // Last resort: try root
+ await page.goto(`${serverUrl}/`, { timeout: 5000, waitUntil: "domcontentloaded" });
+}
+
+function emptyCalibration(consoleErrors: string[]): CalibrationResult {
+ return {
+ renderer: "unknown",
+ gridDetected: false,
+ gridBounds: null,
+ cellWidth: 0,
+ cellHeight: 0,
+ controls: {
+ left: "ArrowLeft",
+ right: "ArrowRight",
+ down: "ArrowDown",
+ rotate: "ArrowUp",
+ drop: "Space",
+ },
+ startMechanism: "unknown",
+ scoreElementSelector: null,
+ backgroundColor: null,
+ consoleErrors,
+ };
+}
diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts
@@ -0,0 +1,92 @@
+import type { Page } from "@playwright/test";
+
+/** A 10x20 boolean grid: true = filled cell, false = empty. Row 0 is the top. */
+export type Grid = boolean[][];
+
+/** Pixel bounds of the game grid on the page. */
+export interface GridBounds {
+ x: number;
+ y: number;
+ width: number;
+ height: number;
+}
+
+/** How the game renders its grid. */
+export type RendererType = "canvas" | "dom" | "svg" | "unknown";
+
+/** Key mappings for game controls. */
+export interface Controls {
+ left: string;
+ right: string;
+ down: string;
+ rotate: string;
+ drop: string;
+}
+
+/** How the game was started. */
+export type StartMechanism =
+ | "auto"
+ | "click_canvas"
+ | "enter"
+ | "space"
+ | "button"
+ | "anykey"
+ | "unknown";
+
+/** Result of the calibration phase. */
+export interface CalibrationResult {
+ renderer: RendererType;
+ gridDetected: boolean;
+ gridBounds: GridBounds | null;
+ cellWidth: number;
+ cellHeight: number;
+ controls: Controls;
+ startMechanism: StartMechanism;
+ scoreElementSelector: string | null;
+ backgroundColor: [number, number, number] | null;
+ consoleErrors: string[];
+}
+
+/** Result of an individual test. */
+export interface TestResult {
+ name: string;
+ pass: boolean;
+ detail: string;
+}
+
+/** Gameplay statistics gathered during the play phase. */
+export interface GameplayStats {
+ pieces_placed: number;
+ lines_cleared: number;
+ max_score_observed: number;
+ play_duration_seconds: number;
+ errors_during_play: number;
+}
+
+/** The full JSON report written at the end. */
+export interface BotReport {
+ implementation: {
+ renderer: string;
+ grid_detected: boolean;
+ grid_bounds: GridBounds | null;
+ controls: Record<string, string>;
+ start_mechanism: string;
+ score_element_found: boolean;
+ };
+ tests: Array<{ name: string; pass: boolean; detail: string }>;
+ summary: {
+ total: number;
+ passed: number;
+ failed: number;
+ score: number;
+ };
+ gameplay: GameplayStats;
+}
+
+/** Context passed through calibration, play, and reporting phases. */
+export interface BotContext {
+ page: Page;
+ calibration: CalibrationResult;
+ gameplay: GameplayStats;
+ testResults: TestResult[];
+}
diff --git a/tasks/tetris/eval/playwright.config.ts b/tasks/tetris/eval/playwright.config.ts
@@ -0,0 +1,16 @@
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+ testDir: "./gameplay-bot",
+ testMatch: "index.ts",
+ timeout: 120_000, // 2 minutes per test
+ retries: 0,
+ workers: 1, // sequential -- only one game at a time
+ reporter: [["list"]],
+ use: {
+ headless: true,
+ viewport: { width: 1280, height: 720 },
+ actionTimeout: 10_000,
+ navigationTimeout: 10_000,
+ },
+});
diff --git a/test-results/.last-run.json b/test-results/.last-run.json
@@ -0,0 +1,6 @@
+{
+ "status": "failed",
+ "failedTests": [
+ "a123abce010ea7d96fa7-488977003e1cf02fbef6"
+ ]
+}
+\ No newline at end of file
diff --git a/test-results/index.ts-Tetris-Gameplay-Bot-run-gameplay-bot/error-context.md b/test-results/index.ts-Tetris-Gameplay-Bot-run-gameplay-bot/error-context.md
@@ -0,0 +1,16 @@
+# Instructions
+
+- Following Playwright test failed.
+- Explain why, be concise, respect Playwright best practices.
+- Provide a snippet of code with the fix, if possible.
+
+# Test info
+
+- Name: index.ts >> Tetris Gameplay Bot >> run gameplay bot
+- Location: tasks/tetris/eval/gameplay-bot/index.ts:94:7
+
+# Error details
+
+```
+Test timeout of 120000ms exceeded.
+```
+\ No newline at end of file