loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit fe319f04a3475f06a33c8d380059948714ea6455
parent a1dcd8c3630ee94ad18d319107152852e745631c
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 07:52:22 +0200

Add gameplay bot, language=unspecified option, bump Playwright timeout

Gameplay bot (Opus-built):
- 6 TypeScript files in tasks/tetris/eval/gameplay-bot/
- Calibrates to any Tetris implementation (canvas/DOM, any controls)
- Runs 15 independent tests (game loads, auto-drop, movement, rotation,
  line clear, game over, 30s endurance, etc.)
- Uses 4-heuristic AI (height, lines, holes, bumpiness) for gameplay
- Falls back to random inputs if grid reading fails
- Outputs JSON report with per-test pass/fail and gameplay stats

Tested against existing artifact: 7/15 passed (correctly detected
auto-drop bug where pieces don't fall on their own).

Language axis: added "unspecified" value - agent gets no language
instruction, chooses on its own.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M grid.yaml | 2 +-
A package-lock.json | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A package.json | 22 ++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/calibrate.ts | 439 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/grid-reader.ts | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/index.ts | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/player.ts | 417 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/tests.ts | 703 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/types.ts | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/playwright.config.ts | 16 ++++++++++++++++
A test-results/.last-run.json | 7 +++++++
A test-results/index.ts-Tetris-Gameplay-Bot-run-gameplay-bot/error-context.md | 17 +++++++++++++++++
12 files changed, 2244 insertions(+), 1 deletion(-)

diff --git a/grid.yaml b/grid.yaml @@ -15,7 +15,7 @@ axes: prompt_style: values: [simple, detailed] language: - values: [typescript, javascript] + values: [typescript, javascript, unspecified] human_language: values: [en, es] tool_read: diff --git a/package-lock.json b/package-lock.json @@ -0,0 +1,112 @@ +{ + "name": "loop-benchmarking", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "loop-benchmarking", + "version": "1.0.0", + "license": "ISC", + "devDependencies": { + "@playwright/test": "^1.59.1", + "@types/node": "^25.5.2", + "typescript": "^6.0.2" + } + }, + "node_modules/@playwright/test": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.59.1.tgz", + "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@types/node": { + "version": "25.5.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.5.2.tgz", + "integrity": "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.18.0" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": 
"sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/typescript": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-6.0.2.tgz", + "integrity": "sha512-bGdAIrZ0wiGDo5l8c++HWtbaNCWTS4UTv7RaTH/ThVIgjkveJt83m74bBHMJkuCbslY8ixgLBVZJIOiQlQTjfQ==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.18.2", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", + "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/package.json b/package.json @@ -0,0 +1,22 @@ +{ + "name": "loop-benchmarking", + "version": "1.0.0", + "description": "An open benchmark for comparing agentic coding loop configurations. 
Same task, different setups, all data public.", + "main": "index.js", + "devDependencies": { + "@playwright/test": "^1.59.1", + "@types/node": "^25.5.2", + "typescript": "^6.0.2" + }, + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "https://git.statagroup.com/research/loop-benchmarking.git" + }, + "keywords": [], + "author": "", + "license": "ISC", + "type": "commonjs" +} diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -0,0 +1,439 @@ +import type { Page } from "@playwright/test"; +import type { + CalibrationResult, + Controls, + GridBounds, + RendererType, + StartMechanism, +} from "./types"; +import { sampleBackgroundColor } from "./grid-reader"; + +const DEFAULT_CONTROLS: Controls = { + left: "ArrowLeft", + right: "ArrowRight", + down: "ArrowDown", + rotate: "ArrowUp", + drop: "Space", +}; + +/** + * Run all calibration steps. Never throws -- returns a result + * with whatever could be detected. + */ +export async function calibrate(page: Page): Promise<CalibrationResult> { + const consoleErrors: string[] = []; + page.on("pageerror", (err) => consoleErrors.push(err.message)); + + const startMechanism = await detectStartMechanism(page); + const { renderer, gridBounds, cellWidth, cellHeight } = await detectGrid(page); + const backgroundColor = + renderer === "canvas" && gridBounds + ? await sampleBackgroundColor(page, gridBounds, cellWidth, cellHeight) + : null; + const controls = await detectControls(page); + const scoreElementSelector = await detectScoreElement(page); + + return { + renderer, + gridDetected: gridBounds !== null, + gridBounds, + cellWidth, + cellHeight, + controls, + startMechanism, + scoreElementSelector, + backgroundColor, + consoleErrors, + }; +} + +/** + * Try multiple mechanisms to start the game. + * Takes a screenshot before and after each attempt, comparing + * to see if the game state changed. 
+ */ +async function detectStartMechanism(page: Page): Promise<StartMechanism> { + // Take initial screenshot + let prevShot = await page.screenshot(); + + // 1. Wait 3 seconds (auto-start) + await page.waitForTimeout(3000); + let newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "auto"; + } + prevShot = newShot; + + // 2. Click the canvas or game container + try { + const canvas = page.locator("canvas").first(); + if ((await canvas.count()) > 0) { + await canvas.click(); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "click_canvas"; + } + prevShot = newShot; + } + } catch { /* continue */ } + + // Try clicking any game-like container + try { + const container = page.locator( + '[class*="game"], [class*="board"], [id*="game"], [id*="board"]' + ).first(); + if ((await container.count()) > 0) { + await container.click(); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "click_canvas"; + } + prevShot = newShot; + } + } catch { /* continue */ } + + // 3. Press Enter + await page.keyboard.press("Enter"); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "enter"; + } + prevShot = newShot; + + // 4. Press Space + await page.keyboard.press("Space"); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "space"; + } + prevShot = newShot; + + // 5. 
Look for a start/play button + try { + const button = page.locator("button, a, [role='button']").filter({ + hasText: /start|play|begin|new game/i, + }).first(); + if ((await button.count()) > 0) { + await button.click(); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "button"; + } + prevShot = newShot; + } + } catch { /* continue */ } + + // Also try elements that aren't buttons but have matching text + try { + const textMatch = page.locator( + ':text-matches("start|play|begin|new.game", "i")' + ).first(); + if ((await textMatch.count()) > 0) { + await textMatch.click(); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "button"; + } + prevShot = newShot; + } + } catch { /* continue */ } + + // 6. Press any key (try a few) + for (const key of ["p", "s", "n", "Escape"]) { + await page.keyboard.press(key); + await page.waitForTimeout(300); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "anykey"; + } + prevShot = newShot; + } + + return "unknown"; +} + +interface GridDetection { + renderer: RendererType; + gridBounds: GridBounds | null; + cellWidth: number; + cellHeight: number; +} + +/** + * Detect the game grid: canvas, DOM-based, or SVG. + */ +async function detectGrid(page: Page): Promise<GridDetection> { + // Check for canvas + try { + const canvasCount = await page.locator("canvas").count(); + if (canvasCount > 0) { + const bounds = await page.locator("canvas").first().boundingBox(); + if (bounds) { + // Try to get the canvas internal dimensions (which may differ from CSS size) + const canvasDims = await page.evaluate(() => { + const c = document.querySelector("canvas"); + if (!c) return null; + return { width: c.width, height: c.height }; + }); + + const internalW = canvasDims ? 
canvasDims.width : bounds.width; + const internalH = canvasDims ? canvasDims.height : bounds.height; + + // Standard Tetris grid is 10 cols by 20 rows + // The canvas might include sidebars, so try to detect the grid area + // Heuristic: if the aspect ratio is close to 1:2, the whole canvas is the grid + const ratio = internalH / internalW; + + let gridX = 0; + let gridY = 0; + let gridW = internalW; + let gridH = internalH; + + if (ratio >= 1.5 && ratio <= 2.5) { + // Looks like the whole canvas is the grid + gridX = 0; + gridY = 0; + gridW = internalW; + gridH = internalH; + } else if (ratio < 1.5) { + // Canvas is wider than expected -- grid is probably a subset + // Assume grid is centered or left-aligned with 1:2 aspect ratio + gridW = internalH / 2; + gridH = internalH; + gridX = 0; // left-aligned by default + gridY = 0; + } + + const cellWidth = gridW / 10; + const cellHeight = gridH / 20; + + return { + renderer: "canvas" as RendererType, + gridBounds: { x: gridX, y: gridY, width: gridW, height: gridH }, + cellWidth, + cellHeight, + }; + } + } + } catch { /* continue */ } + + // Check for DOM-based grid + try { + const domResult = await page.evaluate(() => { + // Look for table-based grids + const tables = document.querySelectorAll("table"); + for (const table of tables) { + const rows = table.querySelectorAll("tr"); + if (rows.length >= 18) { + // Likely a Tetris grid (might be 18-22 rows) + const firstRow = rows[0].querySelectorAll("td"); + if (firstRow.length >= 8 && firstRow.length <= 12) { + const rect = table.getBoundingClientRect(); + return { + type: "dom" as const, + bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }, + rows: rows.length, + cols: firstRow.length, + }; + } + } + } + + // Look for grid/flex containers + const containers = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]' + ); + for (const container of containers) { + const children = 
container.children; + // Flat list of 200 cells (or close to it) + if (children.length >= 180 && children.length <= 220) { + const cols = 10; + const rows = Math.round(children.length / cols); + const rect = container.getBoundingClientRect(); + return { + type: "dom" as const, + bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }, + rows, + cols, + }; + } + // 20 row containers + if (children.length >= 18 && children.length <= 22) { + const firstRowCells = children[0].children; + if (firstRowCells.length >= 8 && firstRowCells.length <= 12) { + const rect = container.getBoundingClientRect(); + return { + type: "dom" as const, + bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }, + rows: children.length, + cols: firstRowCells.length, + }; + } + } + } + + return null; + }); + + if (domResult) { + const cellWidth = domResult.bounds.width / domResult.cols; + const cellHeight = domResult.bounds.height / domResult.rows; + return { + renderer: "dom", + gridBounds: domResult.bounds, + cellWidth, + cellHeight, + }; + } + } catch { /* continue */ } + + // Check for SVG + try { + const svgCount = await page.locator("svg").count(); + if (svgCount > 0) { + const bounds = await page.locator("svg").first().boundingBox(); + if (bounds) { + return { + renderer: "svg", + gridBounds: { x: bounds.x, y: bounds.y, width: bounds.width, height: bounds.height }, + cellWidth: bounds.width / 10, + cellHeight: bounds.height / 20, + }; + } + } + } catch { /* continue */ } + + return { renderer: "unknown", gridBounds: null, cellWidth: 0, cellHeight: 0 }; +} + +/** + * Detect which keys the game responds to for movement and rotation. 
+ */ +async function detectControls(page: Page): Promise<Controls> { + const controls: Controls = { ...DEFAULT_CONTROLS }; + + // First, scan the page for control hints + try { + const pageText = await page.evaluate(() => document.body.innerText.toLowerCase()); + + if (pageText.includes("wasd") || pageText.includes("w,a,s,d")) { + controls.left = "a"; + controls.right = "d"; + controls.down = "s"; + controls.rotate = "w"; + } + if (/z\s*(=|:)?\s*rotate/i.test(pageText) || /rotate\s*(=|:)?\s*z/i.test(pageText)) { + controls.rotate = "z"; + } + if (/x\s*(=|:)?\s*rotate/i.test(pageText) || /rotate\s*(=|:)?\s*x/i.test(pageText)) { + controls.rotate = "x"; + } + } catch { /* use defaults */ } + + // Verify left key works by pressing and checking for visual change + try { + const before = await page.screenshot(); + await page.keyboard.press(controls.left); + await page.waitForTimeout(200); + const after = await page.screenshot(); + + if (Buffer.from(before).equals(Buffer.from(after))) { + // ArrowLeft didn't work, try "a" + await page.keyboard.press("a"); + await page.waitForTimeout(200); + const afterA = await page.screenshot(); + if (!Buffer.from(before).equals(Buffer.from(afterA))) { + controls.left = "a"; + controls.right = "d"; + controls.down = "s"; + controls.rotate = "w"; + } + } + } catch { /* use defaults */ } + + // Verify rotate key + try { + const before = await page.screenshot(); + await page.keyboard.press(controls.rotate); + await page.waitForTimeout(200); + const after = await page.screenshot(); + + if (Buffer.from(before).equals(Buffer.from(after))) { + // Try alternative rotate keys + for (const alt of ["z", "x", "ArrowUp"]) { + if (alt === controls.rotate) continue; + await page.keyboard.press(alt); + await page.waitForTimeout(200); + const afterAlt = await page.screenshot(); + if (!Buffer.from(before).equals(Buffer.from(afterAlt))) { + controls.rotate = alt; + break; + } + } + } + } catch { /* use defaults */ } + + return controls; +} + +/** + * Find 
the score display element on the page. + */ +async function detectScoreElement(page: Page): Promise<string | null> { + try { + const selector = await page.evaluate(() => { + // Look for elements with "score" text nearby + const allElements = document.querySelectorAll("*"); + for (const el of allElements) { + const text = (el as HTMLElement).innerText?.toLowerCase() || ""; + if (text.includes("score") && el.children.length < 5) { + // Find the numeric part -- might be a sibling or child + const numChild = el.querySelector("span, div, p, td"); + if (numChild && /^\d+$/.test(numChild.textContent?.trim() || "")) { + // Build a selector for this element + if (numChild.id) return `#${numChild.id}`; + if (numChild.className) { + const cls = numChild.className.split(" ")[0]; + if (cls) return `.${cls}`; + } + } + // The element itself might contain the score + if (el.id) return `#${el.id}`; + if ((el as HTMLElement).className) { + const cls = (el as HTMLElement).className.split(" ")[0]; + if (cls) return `.${cls}`; + } + } + } + + // Fallback: look for elements that contain just a number + const candidates: HTMLElement[] = []; + for (const el of allElements) { + const text = (el as HTMLElement).textContent?.trim() || ""; + if (/^\d+$/.test(text) && el.children.length === 0) { + candidates.push(el as HTMLElement); + } + } + if (candidates.length > 0) { + const el = candidates[0]; + if (el.id) return `#${el.id}`; + if (el.className) { + const cls = el.className.split(" ")[0]; + if (cls) return `.${cls}`; + } + } + + return null; + }); + + return selector; + } catch { + return null; + } +} diff --git a/tasks/tetris/eval/gameplay-bot/grid-reader.ts b/tasks/tetris/eval/gameplay-bot/grid-reader.ts @@ -0,0 +1,267 @@ +import type { Page } from "@playwright/test"; +import type { Grid, GridBounds, CalibrationResult } from "./types"; + +const GRID_ROWS = 20; +const GRID_COLS = 10; + +/** + * Read the game grid state. 
Dispatches to canvas or DOM reader + * based on calibration results. Returns a 10x20 boolean matrix, + * or null if reading fails. + */ +export async function readGrid( + page: Page, + cal: CalibrationResult +): Promise<Grid | null> { + try { + if (cal.renderer === "canvas" && cal.gridBounds) { + return await readCanvasGrid(page, cal.gridBounds, cal.cellWidth, cal.cellHeight, cal.backgroundColor); + } + if (cal.renderer === "dom") { + return await readDomGrid(page); + } + // Fallback: try canvas anyway if bounds exist + if (cal.gridBounds) { + return await readCanvasGrid(page, cal.gridBounds, cal.cellWidth, cal.cellHeight, cal.backgroundColor); + } + return null; + } catch { + return null; + } +} + +/** + * Read grid from a canvas element using getImageData. + * Samples the center pixel of each cell and compares to the background color. + */ +async function readCanvasGrid( + page: Page, + bounds: GridBounds, + cellW: number, + cellH: number, + bgColor: [number, number, number] | null +): Promise<Grid | null> { + const bgR = bgColor ? bgColor[0] : 0; + const bgG = bgColor ? bgColor[1] : 0; + const bgB = bgColor ? 
bgColor[2] : 0; + const threshold = 50; // color distance threshold + + const grid = await page.evaluate( + ({ x, y, cellW, cellH, rows, cols, bgR, bgG, bgB, threshold }) => { + const canvas = document.querySelector("canvas") as HTMLCanvasElement | null; + if (!canvas) return null; + const ctx = canvas.getContext("2d"); + if (!ctx) return null; + + const result: boolean[][] = []; + for (let row = 0; row < rows; row++) { + const rowData: boolean[] = []; + for (let col = 0; col < cols; col++) { + const px = Math.floor(x + col * cellW + cellW / 2); + const py = Math.floor(y + row * cellH + cellH / 2); + const pixel = ctx.getImageData(px, py, 1, 1).data; + // Euclidean distance from background color + const dr = pixel[0] - bgR; + const dg = pixel[1] - bgG; + const db = pixel[2] - bgB; + const dist = Math.sqrt(dr * dr + dg * dg + db * db); + rowData.push(dist > threshold); + } + result.push(rowData); + } + return result; + }, + { x: bounds.x, y: bounds.y, cellW, cellH, rows: GRID_ROWS, cols: GRID_COLS, bgR, bgG, bgB, threshold } + ); + + return grid; +} + +/** + * Read grid from DOM elements. Looks for a grid-like structure and checks + * background colors or class names to determine filled vs empty cells. 
+ */ +async function readDomGrid(page: Page): Promise<Grid | null> { + const grid = await page.evaluate(({ rows, cols }) => { + // Strategy 1: look for a table-based grid + const tables = document.querySelectorAll("table"); + for (const table of tables) { + const trs = table.querySelectorAll("tr"); + if (trs.length >= rows) { + const result: boolean[][] = []; + for (let r = 0; r < rows; r++) { + const tds = trs[r].querySelectorAll("td"); + const rowData: boolean[] = []; + for (let c = 0; c < cols; c++) { + if (c < tds.length) { + const td = tds[c] as HTMLElement; + const style = window.getComputedStyle(td); + const bg = style.backgroundColor; + const cls = td.className.toLowerCase(); + // Filled if it has a non-default background or a class suggesting a piece + const isFilled = + (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") || + cls.includes("filled") || + cls.includes("active") || + cls.includes("block") || + cls.includes("piece") || + td.dataset.filled === "true"; + rowData.push(isFilled); + } else { + rowData.push(false); + } + } + result.push(rowData); + } + return result; + } + } + + // Strategy 2: look for a grid/flex container with child cells + const containers = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]' + ); + for (const container of containers) { + const children = container.children; + // Could be a flat list of 200 cells (10x20) or 20 rows of 10 cells + if (children.length === rows * cols) { + const result: boolean[][] = []; + for (let r = 0; r < rows; r++) { + const rowData: boolean[] = []; + for (let c = 0; c < cols; c++) { + const cell = children[r * cols + c] as HTMLElement; + const style = window.getComputedStyle(cell); + const bg = style.backgroundColor; + const cls = cell.className.toLowerCase(); + const isFilled = + (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") || + 
cls.includes("filled") || + cls.includes("active") || + cls.includes("block") || + cls.includes("piece") || + cell.dataset.filled === "true"; + rowData.push(isFilled); + } + result.push(rowData); + } + return result; + } + // Could be 20 row containers each with 10 cells + if (children.length === rows) { + let valid = true; + const result: boolean[][] = []; + for (let r = 0; r < rows; r++) { + const rowEl = children[r]; + const cells = rowEl.children; + if (cells.length < cols) { valid = false; break; } + const rowData: boolean[] = []; + for (let c = 0; c < cols; c++) { + const cell = cells[c] as HTMLElement; + const style = window.getComputedStyle(cell); + const bg = style.backgroundColor; + const cls = cell.className.toLowerCase(); + const isFilled = + (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") || + cls.includes("filled") || + cls.includes("active") || + cls.includes("block") || + cls.includes("piece") || + cell.dataset.filled === "true"; + rowData.push(isFilled); + } + result.push(rowData); + } + if (valid) return result; + } + } + + return null; + }, { rows: GRID_ROWS, cols: GRID_COLS }); + + return grid; +} + +/** + * Sample the background color from the top-left cell of an empty grid. + * Called during calibration before the game has pieces. 
+ */ +export async function sampleBackgroundColor( + page: Page, + bounds: GridBounds, + cellW: number, + cellH: number +): Promise<[number, number, number] | null> { + try { + const color = await page.evaluate( + ({ x, y, cellW, cellH }) => { + const canvas = document.querySelector("canvas") as HTMLCanvasElement | null; + if (!canvas) return null; + const ctx = canvas.getContext("2d"); + if (!ctx) return null; + // Sample from the center of the first cell + const px = Math.floor(x + cellW / 2); + const py = Math.floor(y + cellH / 2); + const pixel = ctx.getImageData(px, py, 1, 1).data; + return [pixel[0], pixel[1], pixel[2]] as [number, number, number]; + }, + { x: bounds.x, y: bounds.y, cellW, cellH } + ); + return color; + } catch { + return null; + } +} + +/** + * Compare two grids and return true if they differ. + */ +export function gridsAreDifferent(a: Grid | null, b: Grid | null): boolean { + if (a === null || b === null) return a !== b; + if (a.length !== b.length) return true; + for (let r = 0; r < a.length; r++) { + if (a[r].length !== b[r].length) return true; + for (let c = 0; c < a[r].length; c++) { + if (a[r][c] !== b[r][c]) return true; + } + } + return false; +} + +/** + * Count the number of filled cells in the bottom N rows of the grid. + */ +export function countFilledInBottomRows(grid: Grid, rows: number): number { + let count = 0; + const startRow = Math.max(0, grid.length - rows); + for (let r = startRow; r < grid.length; r++) { + for (let c = 0; c < grid[r].length; c++) { + if (grid[r][c]) count++; + } + } + return count; +} + +/** + * Count total filled cells in the grid. + */ +export function countFilled(grid: Grid): number { + let count = 0; + for (const row of grid) { + for (const cell of row) { + if (cell) count++; + } + } + return count; +} + +/** + * Check if there are filled cells in the top few rows (near game over). 
+ */ +export function hasFilledInTopRows(grid: Grid, rows: number): boolean { + for (let r = 0; r < Math.min(rows, grid.length); r++) { + for (let c = 0; c < grid[r].length; c++) { + if (grid[r][c]) return true; + } + } + return false; +} diff --git a/tasks/tetris/eval/gameplay-bot/index.ts b/tasks/tetris/eval/gameplay-bot/index.ts @@ -0,0 +1,151 @@ +import { test } from "@playwright/test"; +import { execSync, spawn, type ChildProcess } from "node:child_process"; +import * as fs from "node:fs"; +import * as path from "node:path"; +import * as net from "node:net"; +import type { BotReport } from "./types"; +import { runAllTests } from "./tests"; + +/** + * Find an available port by briefly binding to port 0. + */ +async function findFreePort(): Promise<number> { + return new Promise((resolve, reject) => { + const server = net.createServer(); + server.listen(0, () => { + const addr = server.address(); + if (addr && typeof addr === "object") { + const port = addr.port; + server.close(() => resolve(port)); + } else { + server.close(() => reject(new Error("could not determine port"))); + } + }); + server.on("error", reject); + }); +} + +/** + * Start a simple HTTP server to serve workspace files. + * Tries `npx serve` first, then `python3 -m http.server`. 
+ */ +async function startServer(workspacePath: string, port: number): Promise<ChildProcess> { + let serverProc: ChildProcess; + + // Try npx serve first + try { + execSync("npx serve --version", { stdio: "ignore", timeout: 5000 }); + serverProc = spawn("npx", ["serve", "-l", String(port), "-s", "--no-clipboard"], { + cwd: workspacePath, + stdio: "ignore", + }); + } catch { + // Fallback to python + serverProc = spawn("python3", ["-m", "http.server", String(port)], { + cwd: workspacePath, + stdio: "ignore", + }); + } + + // Wait for the server to be ready + const maxWait = 10000; + const start = Date.now(); + while (Date.now() - start < maxWait) { + try { + await new Promise<void>((resolve, reject) => { + const socket = net.createConnection({ port, host: "127.0.0.1" }, () => { + socket.destroy(); + resolve(); + }); + socket.on("error", reject); + socket.setTimeout(500, () => { + socket.destroy(); + reject(new Error("timeout")); + }); + }); + return serverProc; + } catch { + await new Promise((r) => setTimeout(r, 200)); + } + } + + throw new Error(`server did not start on port ${port} within ${maxWait}ms`); +} + +test.describe("Tetris Gameplay Bot", () => { + let serverProc: ChildProcess | null = null; + let serverUrl: string; + + test.beforeAll(async () => { + const workspacePath = + process.env.WORKSPACE_PATH || process.env.TETRIS_WORKSPACE || process.cwd(); + const port = await findFreePort(); + serverProc = await startServer(workspacePath, port); + serverUrl = `http://127.0.0.1:${port}`; + }); + + test.afterAll(async () => { + if (serverProc) { + serverProc.kill("SIGTERM"); + serverProc = null; + } + }); + + test("run gameplay bot", async ({ page }) => { + test.setTimeout(120_000); // 2-minute total timeout + + const { testResults, calibration, gameplay } = await runAllTests(page, serverUrl); + + const passed = testResults.filter((t) => t.pass).length; + const failed = testResults.filter((t) => !t.pass).length; + const total = testResults.length; + + const 
report: BotReport = { + implementation: { + renderer: calibration.renderer, + grid_detected: calibration.gridDetected, + grid_bounds: calibration.gridBounds, + controls: calibration.controls as unknown as Record<string, string>, + start_mechanism: calibration.startMechanism, + score_element_found: calibration.scoreElementSelector !== null, + }, + tests: testResults.map((t) => ({ name: t.name, pass: t.pass, detail: t.detail })), + summary: { + total, + passed, + failed, + score: total > 0 ? Math.round((passed / total) * 100) / 100 : 0, + }, + gameplay, + }; + + // Write report to file + const reportPath = + process.env.REPORT_OUTPUT_PATH || + path.join(process.cwd(), "gameplay-bot-report.json"); + + // Ensure output directory exists + const reportDir = path.dirname(reportPath); + if (!fs.existsSync(reportDir)) { + fs.mkdirSync(reportDir, { recursive: true }); + } + + fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), "utf-8"); + + // Log summary to console for visibility + console.log("\n=== Gameplay Bot Report ==="); + console.log(`Renderer: ${calibration.renderer}`); + console.log(`Grid detected: ${calibration.gridDetected}`); + console.log(`Start mechanism: ${calibration.startMechanism}`); + console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`); + console.log(`\nTests: ${passed}/${total} passed`); + for (const t of testResults) { + console.log(` ${t.pass ? 
"PASS" : "FAIL"} ${t.name}: ${t.detail}`); + } + console.log(`\nGameplay: ${gameplay.pieces_placed} pieces, ${gameplay.lines_cleared} lines`); + console.log(`Report written to: ${reportPath}`); + console.log("===========================\n"); + + // Always pass the Playwright test -- results are in the report + }); +}); diff --git a/tasks/tetris/eval/gameplay-bot/player.ts b/tasks/tetris/eval/gameplay-bot/player.ts @@ -0,0 +1,417 @@ +import type { Page } from "@playwright/test"; +import type { Grid, CalibrationResult } from "./types"; +import { readGrid } from "./grid-reader"; + +// Heuristic weights from the spec +const W_HEIGHT = -0.510066; +const W_LINES = 0.760666; +const W_HOLES = -0.35663; +const W_BUMPINESS = -0.184483; + +const GRID_ROWS = 20; +const GRID_COLS = 10; + +/** The moves needed to place a piece. */ +interface Placement { + rotations: number; + column: number; // target column for leftmost cell of piece + score: number; +} + +/** + * Play the game for a specified duration or number of pieces using the + * 4-heuristic algorithm. Falls back to random input if grid reading fails. + * + * Returns the number of pieces placed and lines cleared. + */ +export async function playGame( + page: Page, + cal: CalibrationResult, + options: { maxPieces?: number; maxDurationMs?: number } +): Promise<{ piecesPlaced: number; linesCleared: number; errors: number }> { + const maxPieces = options.maxPieces ?? 100; + const maxDuration = options.maxDurationMs ?? 
30000; + const start = Date.now(); + let piecesPlaced = 0; + let linesCleared = 0; + let errors = 0; + let consecutiveFailures = 0; + + while (piecesPlaced < maxPieces && Date.now() - start < maxDuration) { + try { + const grid = await readGrid(page, cal); + + if (!grid) { + // Fallback: random inputs + await playRandomMove(page, cal); + piecesPlaced++; + consecutiveFailures++; + if (consecutiveFailures > 5) { + // Grid reading is not working, just do random play for remaining time + await playRandomForDuration(page, cal, maxDuration - (Date.now() - start)); + piecesPlaced += 5; + break; + } + continue; + } + consecutiveFailures = 0; + + // Count filled cells before the move + const filledBefore = countTotalFilled(grid); + + // Find the best placement + const placement = findBestPlacement(grid); + + if (placement) { + await executePlacement(page, cal, placement); + linesCleared += placement.linesCleared ?? 0; + } else { + // Can't find a good placement, just hard drop + await page.keyboard.press(cal.controls.drop); + } + + piecesPlaced++; + + // Brief wait for the game to settle + await page.waitForTimeout(150); + + // Check if lines were cleared by comparing filled cells + const gridAfter = await readGrid(page, cal); + if (gridAfter) { + const filledAfter = countTotalFilled(gridAfter); + // If we placed a piece (added ~4 cells) but total filled went down, + // some lines were cleared + if (filledAfter < filledBefore) { + const possibleClears = Math.round((filledBefore + 4 - filledAfter) / GRID_COLS); + if (possibleClears > 0) { + linesCleared += possibleClears; + } + } + } + } catch { + errors++; + // Don't crash -- try to keep playing + await playRandomMove(page, cal); + piecesPlaced++; + } + } + + return { piecesPlaced, linesCleared, errors }; +} + +/** + * Execute a single hard drop (for tests that just need to drop a piece). 
/**
 * Press the hard-drop key once, then pause 200ms.
 * Meant for tests that only need a piece dropped.
 */
export async function hardDrop(page: Page, cal: CalibrationResult): Promise<void> {
  const { drop } = cal.controls;
  await page.keyboard.press(drop);
  await page.waitForTimeout(200);
}

/**
 * Carry out a placement: apply the rotations, shift horizontally to the
 * target column, then hard drop.
 */
async function executePlacement(
  page: Page,
  cal: CalibrationResult,
  placement: Placement
): Promise<void> {
  // Rotations first, 50ms apart.
  let turnsLeft = placement.rotations;
  while (turnsLeft > 0) {
    await page.keyboard.press(cal.controls.rotate);
    await page.waitForTimeout(50);
    turnsLeft--;
  }

  // Horizontal shift. The spawn column is an assumption (pieces are taken to
  // appear roughly in the centre, column 3-5).
  const spawnCol = 4;
  const offset = placement.column - spawnCol;
  const shiftKey = offset < 0 ? cal.controls.left : cal.controls.right;
  for (let stepsLeft = Math.abs(offset); stepsLeft > 0; stepsLeft--) {
    await page.keyboard.press(shiftKey);
    await page.waitForTimeout(30);
  }

  // Lock the piece in.
  await page.keyboard.press(cal.controls.drop);
  await page.waitForTimeout(100);
}

/**
 * Fallback used when the grid cannot be read: press one to four random
 * movement/rotation keys, then hard drop.
 */
async function playRandomMove(page: Page, cal: CalibrationResult): Promise<void> {
  const { left, right, rotate, down, drop } = cal.controls;
  const candidates = [left, right, rotate, down];
  const pressCount = Math.floor(Math.random() * 4) + 1;

  for (let remaining = pressCount; remaining > 0; remaining--) {
    const choice = candidates[Math.floor(Math.random() * candidates.length)];
    await page.keyboard.press(choice);
    await page.waitForTimeout(50);
  }

  await page.keyboard.press(drop);
  await page.waitForTimeout(100);
}
/**
 * Press random control keys (including hard drop) every 100ms until
 * `durationMs` has elapsed. Used when grid reading is broken.
 */
async function playRandomForDuration(
  page: Page,
  cal: CalibrationResult,
  durationMs: number
): Promise<void> {
  const { left, right, rotate, down, drop } = cal.controls;
  const pool = [left, right, rotate, down, drop];
  const deadline = Date.now() + durationMs;

  while (Date.now() < deadline) {
    const choice = pool[Math.floor(Math.random() * pool.length)];
    await page.keyboard.press(choice);
    await page.waitForTimeout(100);
  }
}

/**
 * Attempt to fill the bottom row by hard-dropping one piece per column,
 * left to right, for at most `maxAttempts` pieces (and at most ten columns).
 *
 * @returns `false` when the grid cannot be read afterwards; otherwise `true`
 *          when the bottom row holds fewer than 8 filled cells, which is
 *          taken as a sign that a line was cleared. This is a heuristic, not
 *          a guarantee.
 */
export async function tryFillRow(
  page: Page,
  cal: CalibrationResult,
  maxAttempts: number
): Promise<boolean> {
  for (let targetCol = 0; targetCol <= 9; targetCol++) {
    if (targetCol >= maxAttempts) break;

    // Push the piece against the left wall so the walk right starts from a
    // known position.
    let leftPresses = 6;
    while (leftPresses-- > 0) {
      await page.keyboard.press(cal.controls.left);
      await page.waitForTimeout(30);
    }

    // Walk right to the target column.
    for (let shifted = 0; shifted !== targetCol; shifted++) {
      await page.keyboard.press(cal.controls.right);
      await page.waitForTimeout(30);
    }

    await page.keyboard.press(cal.controls.drop);
    await page.waitForTimeout(200);
  }

  const grid = await readGrid(page, cal);
  if (!grid) {
    return false;
  }

  // A sparse bottom row after the drops is read as "lines probably cleared".
  const bottomFilled = grid[GRID_ROWS - 1].reduce(
    (filled, cell) => (cell ? filled + 1 : filled),
    0
  );
  return bottomFilled < 8;
}
/**
 * Hard drop `maxAttempts` pieces without moving them so they pile up, then
 * judge whether the game has ended.
 *
 * @returns `true` when a further hard drop leaves the screenshot unchanged,
 *          or when the page text contains a game-over style phrase.
 */
export async function stackToGameOver(
  page: Page,
  cal: CalibrationResult,
  maxAttempts: number
): Promise<boolean> {
  // Build a tower in the spawn column.
  for (let dropped = 0; dropped < maxAttempts; dropped += 1) {
    await page.keyboard.press(cal.controls.drop);
    await page.waitForTimeout(150);
  }

  // Probe: does the game still react to input?
  const before = await page.screenshot();
  await page.waitForTimeout(1000);
  await page.keyboard.press(cal.controls.drop);
  await page.waitForTimeout(500);
  const after = await page.screenshot();
  const screenshotsSame = before.equals(after);

  // Probe: does the page announce the end of the game?
  const hasGameOverText = await page.evaluate(() => {
    const pageText = document.body.innerText.toLowerCase();
    const phrases = ["game over", "gameover", "you lose", "try again", "restart", "play again"];
    return phrases.some((phrase) => pageText.includes(phrase));
  });

  if (screenshotsSame) {
    return true;
  }
  return hasGameOverText;
}

// --- Heuristic evaluation functions ---

/** A placement annotated with the number of lines the simulation cleared. */
interface PlacementWithLines extends Placement {
  linesCleared?: number;
}
/**
 * Pick the best column for the current piece using the 4-heuristic scoring
 * function (aggregate height, cleared lines, holes, bumpiness).
 *
 * The simulation drops a single cell and has no notion of piece shape, so a
 * rotation cannot change the score. Each column is therefore evaluated once
 * and the result always carries `rotations: 0`.
 *
 * @returns The highest-scoring placement (first column wins ties), or `null`
 *          when no column can take another cell.
 */
function findBestPlacement(grid: Grid): PlacementWithLines | null {
  let bestScore = -Infinity;
  let bestPlacement: PlacementWithLines | null = null;

  for (let col = 0; col < GRID_COLS; col++) {
    // Simulate placing a simple piece (we don't know the exact piece,
    // so we simulate a 1-wide vertical drop at this column)
    const simGrid = simulateDrop(grid, col);
    if (!simGrid) continue;

    const { cleared, board } = clearLines(simGrid);
    const score =
      W_HEIGHT * aggregateHeight(board) +
      W_LINES * cleared +
      W_HOLES * countHoles(board) +
      W_BUMPINESS * bumpiness(board);

    if (score > bestScore) {
      bestScore = score;
      bestPlacement = { rotations: 0, column: col, score, linesCleared: cleared };
    }
  }

  return bestPlacement;
}

/**
 * Simulate dropping a single cell at the given column (simplified --
 * we don't know the actual piece shape without more complex detection).
 *
 * The column is scanned from the bottom up and the cell is placed in the
 * lowest empty position, even if filled cells sit above it.
 * NOTE(review): presumably this keeps a still-falling piece near the top of
 * the grid from being treated as the floor -- confirm against readGrid.
 *
 * @returns A copy of `grid` with the cell added, or `null` when `col` is out
 *          of range or the column has no empty cell. `grid` is not mutated.
 */
function simulateDrop(grid: Grid, col: number): Grid | null {
  if (col < 0 || col >= GRID_COLS) return null;

  // Find the lowest empty row in this column
  let landRow = -1;
  for (let r = GRID_ROWS - 1; r >= 0; r--) {
    if (!grid[r][col]) {
      landRow = r;
      break;
    }
  }
  if (landRow < 0) return null;

  // Clone the grid and place the piece
  const newGrid: Grid = grid.map((row) => [...row]);
  newGrid[landRow][col] = true;

  return newGrid;
}
/**
 * Remove every completely filled row and pad the top with empty rows until
 * the board has GRID_ROWS rows again.
 *
 * @returns The number of removed rows and the resulting board. The input
 *          grid is not mutated; surviving rows are copied.
 */
function clearLines(grid: Grid): { cleared: number; board: Grid } {
  const remaining: boolean[][] = [];
  let cleared = 0;

  for (const row of grid) {
    if (row.every(Boolean)) {
      cleared++;
    } else {
      remaining.push([...row]);
    }
  }

  // Add empty rows at the top
  while (remaining.length < GRID_ROWS) {
    remaining.unshift(new Array(GRID_COLS).fill(false));
  }

  return { cleared, board: remaining };
}

/**
 * Height of every column, measured from the floor: a column whose highest
 * filled cell is in row `r` (row 0 = top) has height `GRID_ROWS - r`; an
 * empty column has height 0.
 */
function columnHeights(grid: Grid): number[] {
  const heights: number[] = [];
  for (let col = 0; col < GRID_COLS; col++) {
    let height = 0;
    for (let row = 0; row < GRID_ROWS; row++) {
      if (grid[row][col]) {
        height = GRID_ROWS - row;
        break;
      }
    }
    heights.push(height);
  }
  return heights;
}

/**
 * Sum of all column heights (each measured from the floor up to the
 * column's highest filled cell).
 */
function aggregateHeight(grid: Grid): number {
  let total = 0;
  for (const height of columnHeights(grid)) {
    total += height;
  }
  return total;
}

/**
 * Count holes (empty cells with a filled cell above them in the same column).
 */
function countHoles(grid: Grid): number {
  let holes = 0;
  for (let col = 0; col < GRID_COLS; col++) {
    let blockFound = false;
    for (let row = 0; row < GRID_ROWS; row++) {
      if (grid[row][col]) {
        blockFound = true;
      } else if (blockFound) {
        holes++;
      }
    }
  }
  return holes;
}

/**
 * Sum of absolute height differences between adjacent columns.
 */
function bumpiness(grid: Grid): number {
  const heights = columnHeights(grid);

  let bump = 0;
  for (let i = 0; i < heights.length - 1; i++) {
    bump += Math.abs(heights[i] - heights[i + 1]);
  }
  return bump;
}
/**
 * Count total filled cells in the grid.
 */
function countTotalFilled(grid: Grid): number {
  let count = 0;
  for (const row of grid) {
    for (const cell of row) {
      if (cell) count++;
    }
  }
  return count;
}

/** Format a thrown value the way every test result reports exceptions. */
function describeException(err: unknown): string {
  return `exception: ${err instanceof Error ? err.message : String(err)}`;
}

/**
 * Run one test and append its result to `results`. If the test throws, a
 * failing result with the given `name` is appended instead, so one failure
 * never stops the others.
 *
 * @returns The test's own result, or `null` when it threw.
 */
async function runIsolated(
  results: TestResult[],
  name: string,
  test: () => Promise<TestResult>
): Promise<TestResult | null> {
  try {
    const result = await test();
    results.push(result);
    return result;
  } catch (err) {
    results.push({ name, pass: false, detail: describeException(err) });
    return null;
  }
}

/**
 * Reload the game page and calibrate again to get a fresh game. If either
 * step throws, the previous calibration is kept and testing continues with
 * the existing state.
 */
async function reloadAndCalibrate(
  page: Page,
  serverUrl: string,
  current: CalibrationResult
): Promise<CalibrationResult> {
  try {
    await loadGamePage(page, serverUrl);
    return await calibrate(page);
  } catch {
    return current;
  }
}

/**
 * Run all 15 tests sequentially. Each test is isolated so one failure never
 * stops the others.
 *
 * If the page does not load, the remaining 14 tests are reported as skipped.
 * The page is reloaded and re-calibrated before the line-clear, game-over
 * and 30-second tests so each starts from a fresh game.
 *
 * @returns The test results, the calibration result (which may have been
 *          replaced during testing) and the gameplay statistics.
 */
export async function runAllTests(
  page: Page,
  serverUrl: string
): Promise<{
  testResults: TestResult[];
  calibration: CalibrationResult;
  gameplay: GameplayStats;
}> {
  const testResults: TestResult[] = [];
  const gameplay: GameplayStats = {
    pieces_placed: 0,
    lines_cleared: 0,
    max_score_observed: 0,
    play_duration_seconds: 0,
    errors_during_play: 0,
  };

  // Collect page errors (uncaught exceptions reported by the "pageerror"
  // event) across the entire session
  const consoleErrors: string[] = [];
  page.on("pageerror", (err) => consoleErrors.push(err.message));

  const run = (name: string, test: () => Promise<TestResult>) =>
    runIsolated(testResults, name, test);

  // ---- Test 1: Game loads ----
  const loadResult = await run("game_loads", () =>
    testGameLoads(page, serverUrl, consoleErrors)
  );
  const pageLoaded = loadResult !== null && loadResult.pass;

  // If the page didn't load at all, fail everything and return
  if (!pageLoaded) {
    const remainingTests = [
      "game_starts", "auto_drop", "move_left", "move_right", "move_down",
      "rotate", "hard_drop", "piece_locks", "new_piece_spawns",
      "multiple_pieces", "line_clear", "score_changes", "game_over",
      "playable_30s",
    ];
    for (const name of remainingTests) {
      testResults.push({ name, pass: false, detail: "skipped: page did not load" });
    }
    return {
      testResults,
      calibration: emptyCalibration(consoleErrors),
      gameplay,
    };
  }

  // ---- Test 2: Game starts ----
  // Handled inline because it also produces the calibration used by every
  // later test.
  let cal: CalibrationResult;
  try {
    cal = await calibrate(page);
    const started = cal.startMechanism !== "unknown";
    testResults.push({
      name: "game_starts",
      pass: started,
      detail: started
        ? `started via ${cal.startMechanism}`
        : "could not start game with any mechanism",
    });
  } catch (err) {
    cal = emptyCalibration(consoleErrors);
    testResults.push({
      name: "game_starts",
      pass: false,
      detail: describeException(err),
    });
  }

  // Merge console errors from calibration
  for (const e of cal.consoleErrors) {
    if (!consoleErrors.includes(e)) consoleErrors.push(e);
  }

  // ---- Tests 3-11: basic controls and piece lifecycle ----
  await run("auto_drop", () => testAutoDrop(page, cal));
  await run("move_left", () => testMoveDirection(page, cal, "left"));
  await run("move_right", () => testMoveDirection(page, cal, "right"));
  await run("move_down", () => testMoveDirection(page, cal, "down"));
  await run("rotate", () => testRotate(page, cal));
  await run("hard_drop", () => testHardDrop(page, cal));
  await run("piece_locks", () => testPieceLocks(page, cal));
  await run("new_piece_spawns", () => testNewPieceSpawns(page, cal));
  await run("multiple_pieces", () => testMultiplePieces(page, cal, gameplay));

  // We need a fresh game for line clear and game over tests
  cal = await reloadAndCalibrate(page, serverUrl, cal);

  // ---- Tests 12-13: line clear and scoring ----
  await run("line_clear", () => testLineClear(page, cal, gameplay));
  await run("score_changes", () => testScoreChanges(page, cal));

  // Reload for game over test
  cal = await reloadAndCalibrate(page, serverUrl, cal);

  // ---- Test 14: Game over ----
  await run("game_over", () => testGameOver(page, cal));

  // Reload for 30s play test
  cal = await reloadAndCalibrate(page, serverUrl, cal);

  // ---- Test 15: Playable for 30 seconds ----
  await run("playable_30s", () => testPlayable30s(page, cal, gameplay, consoleErrors));

  // Read final score: every digit in the score element's text is kept and
  // parsed as one base-10 number.
  try {
    if (cal.scoreElementSelector) {
      const scoreText = await page.textContent(cal.scoreElementSelector);
      const score = parseInt(scoreText?.replace(/\D/g, "") || "0", 10);
      if (score > gameplay.max_score_observed) {
        gameplay.max_score_observed = score;
      }
    }
  } catch { /* ignore */ }

  return { testResults, calibration: cal, gameplay };
}

// ---- Individual test implementations ----

/**
 * Load the game page, wait 3 seconds, and pass only if no new entries were
 * added to `consoleErrors` in the meantime.
 */
async function testGameLoads(
  page: Page,
  serverUrl: string,
  consoleErrors: string[]
): Promise<TestResult> {
  const errorsBefore = consoleErrors.length;

  await loadGamePage(page, serverUrl);
  await page.waitForTimeout(3000);

  const newErrors = consoleErrors.slice(errorsBefore);
  if (newErrors.length === 0) {
    return { name: "game_loads", pass: true, detail: "no console errors" };
  }
  return {
    name: "game_loads",
    pass: false,
    detail: `${newErrors.length} console error(s): ${newErrors[0]}`,
  };
}
/**
 * Auto-drop check: leave the game alone for 5 seconds and pass if either the
 * parsed grid or the raw screenshot changed.
 */
async function testAutoDrop(page: Page, cal: CalibrationResult): Promise<TestResult> {
  const name = "auto_drop";

  const gridAtStart = await readGrid(page, cal);
  const frameAtStart = await page.screenshot();
  await page.waitForTimeout(5000);
  const gridAtEnd = await readGrid(page, cal);
  const frameAtEnd = await page.screenshot();

  // Grid comparison is preferred; the screenshot diff is the fallback.
  const gridMoved = gridAtStart && gridAtEnd && gridsAreDifferent(gridAtStart, gridAtEnd);
  if (gridMoved) {
    return { name, pass: true, detail: "grid state changed after 5s with no input" };
  }

  const framesIdentical = Buffer.from(frameAtStart).equals(Buffer.from(frameAtEnd));
  if (framesIdentical) {
    return { name, pass: false, detail: "piece did not move in 5 seconds" };
  }
  return { name, pass: true, detail: "pixels changed after 5s with no input" };
}

/**
 * Press `key`, wait 300ms, and report whether the parsed grid or the raw
 * screenshot differs from the state captured just before the press.
 */
async function keyPressChangesBoard(
  page: Page,
  cal: CalibrationResult,
  key: string
): Promise<boolean> {
  const frameBefore = await page.screenshot();
  const gridBefore = await readGrid(page, cal);

  await page.keyboard.press(key);
  await page.waitForTimeout(300);

  const frameAfter = await page.screenshot();
  const gridAfter = await readGrid(page, cal);

  const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter);
  const pixelsChanged = !Buffer.from(frameBefore).equals(Buffer.from(frameAfter));
  return Boolean(gridChanged || pixelsChanged);
}

/**
 * Movement check for one direction: passes when pressing the calibrated key
 * for `direction` produces any grid or pixel change.
 */
async function testMoveDirection(
  page: Page,
  cal: CalibrationResult,
  direction: "left" | "right" | "down"
): Promise<TestResult> {
  const changed = await keyPressChangesBoard(page, cal, cal.controls[direction]);
  return changed
    ? { name: `move_${direction}`, pass: true, detail: "grid state changed after key press" }
    : { name: `move_${direction}`, pass: false, detail: "no change detected after key press" };
}

/**
 * Rotation check: passes when pressing the calibrated rotate key produces
 * any grid or pixel change.
 */
async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResult> {
  const changed = await keyPressChangesBoard(page, cal, cal.controls.rotate);
  return changed
    ? { name: "rotate", pass: true, detail: "piece shape changed after rotate key" }
    : { name: "rotate", pass: false, detail: "no change detected after rotate key" };
}
function testHardDrop(page: Page, cal: CalibrationResult): Promise<TestResult> { + const gridBefore = await readGrid(page, cal); + const shotBefore = await page.screenshot(); + + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(500); + + const gridAfter = await readGrid(page, cal); + const shotAfter = await page.screenshot(); + + // After hard drop, there should be filled cells at the bottom + // and the grid should have changed + const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter); + const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter)); + const hasBottomCells = gridAfter ? countFilledInBottomRows(gridAfter, 5) > 0 : false; + + if ((gridChanged || pixelsChanged) && (hasBottomCells || !gridAfter)) { + return { name: "hard_drop", pass: true, detail: "piece immediately dropped and new piece appeared" }; + } + if (pixelsChanged) { + return { name: "hard_drop", pass: true, detail: "visual change detected after hard drop" }; + } + return { name: "hard_drop", pass: false, detail: "no change detected after hard drop key" }; +} + +async function testPieceLocks(page: Page, cal: CalibrationResult): Promise<TestResult> { + // Wait for auto-drop to bring a piece to the bottom (~15 seconds) + // First, hard drop to establish a baseline + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(500); + + const gridAfterDrop = await readGrid(page, cal); + if (gridAfterDrop) { + const bottomFilled = countFilledInBottomRows(gridAfterDrop, 4); + if (bottomFilled > 0) { + // Verify persistence: wait and check again + await page.waitForTimeout(2000); + const gridLater = await readGrid(page, cal); + if (gridLater) { + const bottomFilledLater = countFilledInBottomRows(gridLater, 4); + if (bottomFilledLater >= bottomFilled) { + return { name: "piece_locks", pass: true, detail: "filled cells persist at bottom" }; + } + } + return { name: "piece_locks", pass: true, detail: "filled cells 
detected at bottom after drop" }; + } + } + + // Fallback: wait for auto-drop + await page.waitForTimeout(15000); + const gridAfterWait = await readGrid(page, cal); + if (gridAfterWait) { + const bottomFilled = countFilledInBottomRows(gridAfterWait, 4); + if (bottomFilled > 0) { + return { name: "piece_locks", pass: true, detail: "piece locked at bottom via auto-drop" }; + } + } + + // Screenshot-based fallback + const shot1 = await page.screenshot(); + await page.waitForTimeout(2000); + const shot2 = await page.screenshot(); + // If screenshots are stable, something probably locked + return { + name: "piece_locks", + pass: false, + detail: "could not verify piece locking at bottom", + }; +} + +async function testNewPieceSpawns(page: Page, cal: CalibrationResult): Promise<TestResult> { + // After a piece locks (previous test did a hard drop), check for a piece at the top + const grid = await readGrid(page, cal); + if (grid) { + const topHasFilled = hasFilledInTopRows(grid, 4); + if (topHasFilled) { + return { name: "new_piece_spawns", pass: true, detail: "new piece detected at top of grid" }; + } + + // Wait a moment for the new piece to appear + await page.waitForTimeout(1000); + const grid2 = await readGrid(page, cal); + if (grid2 && hasFilledInTopRows(grid2, 4)) { + return { name: "new_piece_spawns", pass: true, detail: "new piece appeared at top after delay" }; + } + } + + // Drop another piece and check + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(500); + const gridAfter = await readGrid(page, cal); + if (gridAfter && hasFilledInTopRows(gridAfter, 4)) { + return { name: "new_piece_spawns", pass: true, detail: "new piece detected after drop" }; + } + + // Screenshot fallback + const shot1 = await page.screenshot(); + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(500); + const shot2 = await page.screenshot(); + if (!Buffer.from(shot1).equals(Buffer.from(shot2))) { + return { name: "new_piece_spawns", 
pass: true, detail: "visual change suggests new piece spawned" }; + } + + return { name: "new_piece_spawns", pass: false, detail: "could not detect new piece at top" }; +} + +async function testMultiplePieces( + page: Page, + cal: CalibrationResult, + gameplay: GameplayStats +): Promise<TestResult> { + const gridBefore = await readGrid(page, cal); + const filledBefore = gridBefore ? countFilled(gridBefore) : 0; + + // Hard drop 10 pieces + for (let i = 0; i < 10; i++) { + await hardDrop(page, cal); + await page.waitForTimeout(300); + } + gameplay.pieces_placed += 10; + + const gridAfter = await readGrid(page, cal); + if (gridAfter) { + const filledAfter = countFilled(gridAfter); + if (filledAfter > filledBefore) { + return { + name: "multiple_pieces", + pass: true, + detail: `grid accumulated cells: ${filledBefore} -> ${filledAfter}`, + }; + } + } + + // Screenshot fallback: if the game is still responding to drops, it's working + const shotA = await page.screenshot(); + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(300); + const shotB = await page.screenshot(); + if (!Buffer.from(shotA).equals(Buffer.from(shotB))) { + return { name: "multiple_pieces", pass: true, detail: "game still responding after 10 piece drops" }; + } + + return { name: "multiple_pieces", pass: false, detail: "grid did not accumulate filled cells" }; +} + +async function testLineClear( + page: Page, + cal: CalibrationResult, + gameplay: GameplayStats +): Promise<TestResult> { + // Strategy: fill a row by placing pieces across the bottom + const gridBefore = await readGrid(page, cal); + const filledBefore = gridBefore ? 
countFilled(gridBefore) : 0; + + // Play strategically using the AI to try to clear lines + const result = await playGame(page, cal, { maxPieces: 30, maxDurationMs: 20000 }); + gameplay.pieces_placed += result.piecesPlaced; + gameplay.errors_during_play += result.errors; + + if (result.linesCleared > 0) { + gameplay.lines_cleared += result.linesCleared; + return { + name: "line_clear", + pass: true, + detail: `${result.linesCleared} line(s) cleared during AI play`, + }; + } + + // Try the brute-force row-fill approach + const cleared = await tryFillRow(page, cal, 10); + gameplay.pieces_placed += 10; + if (cleared) { + gameplay.lines_cleared += 1; + return { name: "line_clear", pass: true, detail: "line cleared via strategic placement" }; + } + + // Check if total filled decreased (which would indicate clearing happened) + const gridAfter = await readGrid(page, cal); + const filledAfter = gridAfter ? countFilled(gridAfter) : 0; + if (filledAfter < filledBefore && filledBefore > 0) { + return { name: "line_clear", pass: true, detail: "total filled cells decreased, indicating line clear" }; + } + + return { name: "line_clear", pass: false, detail: "could not trigger or detect a line clear" }; +} + +async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<TestResult> { + if (!cal.scoreElementSelector) { + // Try to find any number on the page that changes + const textBefore = await page.evaluate(() => document.body.innerText); + const numbersBefore = (textBefore.match(/\d+/g) || []).map(Number); + + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(500); + + const textAfter = await page.evaluate(() => document.body.innerText); + const numbersAfter = (textAfter.match(/\d+/g) || []).map(Number); + + // Check if any number increased + for (let i = 0; i < Math.min(numbersBefore.length, numbersAfter.length); i++) { + if (numbersAfter[i] > numbersBefore[i]) { + return { name: "score_changes", pass: true, detail: "a number on the 
page increased after play" }; + } + } + + return { name: "score_changes", pass: false, detail: "no score element found and no number changed" }; + } + + try { + const scoreBefore = await page.textContent(cal.scoreElementSelector); + const numBefore = parseInt(scoreBefore?.replace(/\D/g, "") || "0", 10); + + // Play a bit to change the score + for (let i = 0; i < 5; i++) { + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(300); + } + + const scoreAfter = await page.textContent(cal.scoreElementSelector); + const numAfter = parseInt(scoreAfter?.replace(/\D/g, "") || "0", 10); + + if (numAfter > numBefore) { + return { + name: "score_changes", + pass: true, + detail: `score changed from ${numBefore} to ${numAfter}`, + }; + } + return { + name: "score_changes", + pass: false, + detail: `score did not increase: ${numBefore} -> ${numAfter}`, + }; + } catch { + return { name: "score_changes", pass: false, detail: "could not read score element" }; + } +} + +async function testGameOver(page: Page, cal: CalibrationResult): Promise<TestResult> { + const isOver = await stackToGameOver(page, cal, 40); + if (isOver) { + return { name: "game_over", pass: true, detail: "game stopped after stacking to top" }; + } + return { name: "game_over", pass: false, detail: "could not trigger or detect game over" }; +} + +async function testPlayable30s( + page: Page, + cal: CalibrationResult, + gameplay: GameplayStats, + consoleErrors: string[] +): Promise<TestResult> { + const errorsBefore = consoleErrors.length; + const start = Date.now(); + + const result = await playGame(page, cal, { maxDurationMs: 30000 }); + + const elapsed = Math.round((Date.now() - start) / 1000); + gameplay.pieces_placed += result.piecesPlaced; + gameplay.lines_cleared += result.linesCleared; + gameplay.play_duration_seconds += elapsed; + gameplay.errors_during_play += result.errors; + + const newErrors = consoleErrors.slice(errorsBefore); + const crashed = newErrors.length > 0 || 
result.errors > 3; + + if (!crashed) { + return { + name: "playable_30s", + pass: true, + detail: `played for ${elapsed}s, placed ${result.piecesPlaced} pieces, no crashes`, + }; + } + return { + name: "playable_30s", + pass: false, + detail: `${newErrors.length} console errors, ${result.errors} play errors during ${elapsed}s`, + }; +} + +// ---- Helpers ---- + +async function loadGamePage(page: Page, serverUrl: string): Promise<void> { + const candidates = [ + "index.html", + "dist/index.html", + "public/index.html", + "build/index.html", + ]; + + for (const candidate of candidates) { + try { + const response = await page.goto(`${serverUrl}/${candidate}`, { + timeout: 5000, + waitUntil: "domcontentloaded", + }); + if (response && response.ok()) return; + } catch { + continue; + } + } + + // Last resort: try root + await page.goto(`${serverUrl}/`, { timeout: 5000, waitUntil: "domcontentloaded" }); +} + +function emptyCalibration(consoleErrors: string[]): CalibrationResult { + return { + renderer: "unknown", + gridDetected: false, + gridBounds: null, + cellWidth: 0, + cellHeight: 0, + controls: { + left: "ArrowLeft", + right: "ArrowRight", + down: "ArrowDown", + rotate: "ArrowUp", + drop: "Space", + }, + startMechanism: "unknown", + scoreElementSelector: null, + backgroundColor: null, + consoleErrors, + }; +} diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts @@ -0,0 +1,92 @@ +import type { Page } from "@playwright/test"; + +/** A 10x20 boolean grid: true = filled cell, false = empty. Row 0 is the top. */ +export type Grid = boolean[][]; + +/** Pixel bounds of the game grid on the page. */ +export interface GridBounds { + x: number; + y: number; + width: number; + height: number; +} + +/** How the game renders its grid. */ +export type RendererType = "canvas" | "dom" | "svg" | "unknown"; + +/** Key mappings for game controls. 
*/ +export interface Controls { + left: string; + right: string; + down: string; + rotate: string; + drop: string; +} + +/** How the game was started. */ +export type StartMechanism = + | "auto" + | "click_canvas" + | "enter" + | "space" + | "button" + | "anykey" + | "unknown"; + +/** Result of the calibration phase. */ +export interface CalibrationResult { + renderer: RendererType; + gridDetected: boolean; + gridBounds: GridBounds | null; + cellWidth: number; + cellHeight: number; + controls: Controls; + startMechanism: StartMechanism; + scoreElementSelector: string | null; + backgroundColor: [number, number, number] | null; + consoleErrors: string[]; +} + +/** Result of an individual test. */ +export interface TestResult { + name: string; + pass: boolean; + detail: string; +} + +/** Gameplay statistics gathered during the play phase. */ +export interface GameplayStats { + pieces_placed: number; + lines_cleared: number; + max_score_observed: number; + play_duration_seconds: number; + errors_during_play: number; +} + +/** The full JSON report written at the end. */ +export interface BotReport { + implementation: { + renderer: string; + grid_detected: boolean; + grid_bounds: GridBounds | null; + controls: Record<string, string>; + start_mechanism: string; + score_element_found: boolean; + }; + tests: Array<{ name: string; pass: boolean; detail: string }>; + summary: { + total: number; + passed: number; + failed: number; + score: number; + }; + gameplay: GameplayStats; +} + +/** Context passed through calibration, play, and reporting phases. 
*/ +export interface BotContext { + page: Page; + calibration: CalibrationResult; + gameplay: GameplayStats; + testResults: TestResult[]; +} diff --git a/tasks/tetris/eval/playwright.config.ts b/tasks/tetris/eval/playwright.config.ts @@ -0,0 +1,16 @@ +import { defineConfig } from "@playwright/test"; + +export default defineConfig({ + testDir: "./gameplay-bot", + testMatch: "index.ts", + timeout: 120_000, // 2 minutes per test + retries: 0, + workers: 1, // sequential -- only one game at a time + reporter: [["list"]], + use: { + headless: true, + viewport: { width: 1280, height: 720 }, + actionTimeout: 10_000, + navigationTimeout: 10_000, + }, +}); diff --git a/test-results/.last-run.json b/test-results/.last-run.json @@ -0,0 +1,6 @@ +{ + "status": "failed", + "failedTests": [ + "a123abce010ea7d96fa7-488977003e1cf02fbef6" + ] +} +\ No newline at end of file diff --git a/test-results/index.ts-Tetris-Gameplay-Bot-run-gameplay-bot/error-context.md b/test-results/index.ts-Tetris-Gameplay-Bot-run-gameplay-bot/error-context.md @@ -0,0 +1,16 @@ +# Instructions + +- Following Playwright test failed. +- Explain why, be concise, respect Playwright best practices. +- Provide a snippet of code with the fix, if possible. + +# Test info + +- Name: index.ts >> Tetris Gameplay Bot >> run gameplay bot +- Location: tasks/tetris/eval/gameplay-bot/index.ts:94:7 + +# Error details + +``` +Test timeout of 120000ms exceeded. +``` +\ No newline at end of file

Impressum · Datenschutz