loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 1862a787fcf22188e9681812d77b7276db281f7b
parent d6e1006bcb1223e1b0bc4e7a662cd23904a2c5d4
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 08:30:56 +0200

Rewrite gameplay bot with continuous scanning and no false positives

Major rewrite based on MIT-licensed reference implementations:
- LeeYiyuan/tetrisai: 4-heuristic evaluation with genetic weights
- mikhail-vlasenko/Tetris-AI: screen-reading architecture

Key changes:

Grid reader (from mikhail-vlasenko):
- 5-point cell sampling (center + 4 offsets) instead of single pixel
- Active piece detection via grid diffing
- Piece type identification by normalized pattern matching

Player (from LeeYiyuan):
- All 7 tetrominoes with 4 rotation states each
- Full placement search: try every (rotation, column), simulate drop
- Continuous polling play loop tracking settled grid between pieces

Tests:
- Single continuous observation session records events to GameSession
- 16 test results derived from session data, not individual snapshots
- NO FALSE POSITIVES: grid reader must confirm state changes
- Screenshot comparison reports INCONCLUSIVE, never PASS
- Grid read reliability tracked and reported

Calibration:
- Grid confidence measurement (6 polls at 500ms)
- Retries start mechanisms if the grid is detected but not changing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mtasks/tetris/eval/gameplay-bot/calibrate.ts | 95++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mtasks/tetris/eval/gameplay-bot/grid-reader.ts | 195+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mtasks/tetris/eval/gameplay-bot/index.ts | 21++++++++++++++++++++-
Mtasks/tetris/eval/gameplay-bot/player.ts | 403+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mtasks/tetris/eval/gameplay-bot/tests.ts | 1389++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mtasks/tetris/eval/gameplay-bot/types.ts | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 1366 insertions(+), 796 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -6,7 +6,7 @@ import type { RendererType, StartMechanism, } from "./types"; -import { sampleBackgroundColor } from "./grid-reader"; +import { sampleBackgroundColor, readGrid } from "./grid-reader"; const DEFAULT_CONTROLS: Controls = { left: "ArrowLeft", @@ -54,6 +54,21 @@ export async function calibrate(page: Page): Promise<CalibrationResult> { const controls = await detectControls(page); const scoreElementSelector = await detectScoreElement(page); + // Grid confidence: poll grid reads to measure reliability + const gridConfidence = await measureGridConfidence(page, { + renderer, + gridDetected: gridBounds !== null, + gridBounds, + cellWidth, + cellHeight, + controls, + startMechanism, + scoreElementSelector, + backgroundColor, + consoleErrors, + gridConfidence: 0, + }); + return { renderer, gridDetected: gridBounds !== null, @@ -65,10 +80,84 @@ export async function calibrate(page: Page): Promise<CalibrationResult> { scoreElementSelector, backgroundColor, consoleErrors, + gridConfidence, }; } /** + * Measure grid read confidence by polling several times. + * If the grid never changes despite the game being "started", try + * more start mechanisms. 
+ */ +async function measureGridConfidence( + page: Page, + cal: CalibrationResult +): Promise<number> { + if (!cal.gridBounds) return 0; + + let successes = 0; + let attempts = 0; + const pollCount = 6; + let lastGrid: boolean[][] | null = null; + let gridChanged = false; + + for (let i = 0; i < pollCount; i++) { + attempts++; + try { + const grid = await readGrid(page, cal); + if (grid) { + successes++; + if (lastGrid) { + // Check if grid actually changed (game is running) + for (let r = 0; r < grid.length && !gridChanged; r++) { + for (let c = 0; c < grid[r].length && !gridChanged; c++) { + if (grid[r][c] !== lastGrid[r][c]) gridChanged = true; + } + } + } + lastGrid = grid; + } + } catch { + // read failed + } + await page.waitForTimeout(500); + } + + // If grid reads succeed but grid never changed after 3 seconds, + // try additional start mechanisms + if (successes > 0 && !gridChanged && cal.startMechanism !== "unknown") { + const additionalStarts: Array<{ name: string; action: () => Promise<void> }> = [ + { name: "space", action: async () => { await page.keyboard.press("Space"); } }, + { name: "enter", action: async () => { await page.keyboard.press("Enter"); } }, + { name: "click", action: async () => { + const canvas = page.locator("canvas").first(); + if ((await canvas.count()) > 0) await canvas.click(); + else await page.locator("body").click({ position: { x: 200, y: 200 } }); + }}, + ]; + + for (const start of additionalStarts) { + try { + await start.action(); + await page.waitForTimeout(1500); + const grid = await readGrid(page, cal); + if (grid && lastGrid) { + for (let r = 0; r < grid.length && !gridChanged; r++) { + for (let c = 0; c < grid[r].length && !gridChanged; c++) { + if (grid[r][c] !== lastGrid[r][c]) gridChanged = true; + } + } + if (gridChanged) break; + lastGrid = grid; + } + } catch { /* continue */ } + } + } + + return attempts > 0 ? successes / attempts : 0; +} + +/** * Try multiple mechanisms to start the game. 
* Takes a screenshot before and after each attempt, comparing * to see if the game state changed. @@ -458,8 +547,8 @@ async function detectGrid(page: Page): Promise<GridDetection> { } // Container with ~20 row children, each having ~10 cell children if (ch.length >= 18 && ch.length <= 22) { - const firstRowCells = ch[0].children; - if (firstRowCells.length >= 8 && firstRowCells.length <= 12) { + const firstRowCells = ch[0]?.children; + if (firstRowCells && firstRowCells.length >= 8 && firstRowCells.length <= 12) { const rect = el.getBoundingClientRect(); if (rect.width > 50 && rect.height > 100) { return { diff --git a/tasks/tetris/eval/gameplay-bot/grid-reader.ts b/tasks/tetris/eval/gameplay-bot/grid-reader.ts @@ -1,5 +1,8 @@ +// Screen reading approach adapted from mikhail-vlasenko/Tetris-AI (MIT License) +// Cell sampling uses center + offset checks for robustness + import type { Page } from "@playwright/test"; -import type { Grid, GridBounds, CalibrationResult } from "./types"; +import type { Grid, GridBounds, CalibrationResult, PieceType } from "./types"; const GRID_ROWS = 20; const GRID_COLS = 10; @@ -37,6 +40,8 @@ export async function readGrid( /** * Read grid from a canvas element using getImageData. * Samples the center pixel of each cell and compares to the background color. + * Uses multi-point sampling (center + offsets) for robustness, adapted from + * mikhail-vlasenko/Tetris-AI's approach of checking multiple points per cell. 
*/ async function readCanvasGrid( page: Page, @@ -57,19 +62,36 @@ async function readCanvasGrid( const ctx = canvas.getContext("2d"); if (!ctx) return null; + // Offsets to sample within each cell: center + 4 points at 1/3 offsets + // This catches pieces even when the center is on a border or gap + const offsets = [ + [0, 0], + [-Math.floor(cellW / 4), 0], + [Math.floor(cellW / 4), 0], + [0, -Math.floor(cellH / 4)], + [0, Math.floor(cellH / 4)], + ]; + const result: boolean[][] = []; for (let row = 0; row < rows; row++) { const rowData: boolean[] = []; for (let col = 0; col < cols; col++) { - const px = Math.floor(x + col * cellW + cellW / 2); - const py = Math.floor(y + row * cellH + cellH / 2); - const pixel = ctx.getImageData(px, py, 1, 1).data; - // Euclidean distance from background color - const dr = pixel[0] - bgR; - const dg = pixel[1] - bgG; - const db = pixel[2] - bgB; - const dist = Math.sqrt(dr * dr + dg * dg + db * db); - rowData.push(dist > threshold); + const cx = Math.floor(x + col * cellW + cellW / 2); + const cy = Math.floor(y + row * cellH + cellH / 2); + + let filledCount = 0; + for (const [ox, oy] of offsets) { + const px = Math.min(Math.max(cx + ox, 0), canvas.width - 1); + const py = Math.min(Math.max(cy + oy, 0), canvas.height - 1); + const pixel = ctx.getImageData(px, py, 1, 1).data; + const dr = pixel[0] - bgR; + const dg = pixel[1] - bgG; + const db = pixel[2] - bgB; + const dist = Math.sqrt(dr * dr + dg * dg + db * db); + if (dist > threshold) filledCount++; + } + // Cell is filled if majority of sample points say so + rowData.push(filledCount >= 3); } result.push(rowData); } @@ -393,3 +415,156 @@ export function hasFilledInTopRows(grid: Grid, rows: number): boolean { } return false; } + +/** + * Detect active piece cells by diffing the current grid against a settled + * (locked-pieces-only) grid. Returns an array of [row, col] positions, + * or null if detection fails. 
+ */ +export function detectActivePieceCells( + current: Grid | null, + settled: Grid | null +): [number, number][] | null { + if (!current) return null; + + const cells: [number, number][] = []; + + if (settled && settled.length === current.length) { + for (let row = 0; row < current.length; row++) { + for (let col = 0; col < current[row].length; col++) { + if (current[row][col] && !settled[row][col]) { + cells.push([row, col]); + } + } + } + } else { + // Fallback: scan top 6 rows for filled cells + for (let row = 0; row < Math.min(6, current.length); row++) { + for (let col = 0; col < current[row].length; col++) { + if (current[row][col]) { + cells.push([row, col]); + } + } + } + } + + // A tetromino has exactly 4 cells + if (cells.length < 3 || cells.length > 5) return null; + return cells; +} + +/** + * Identify the piece type from its cell positions by matching against + * known tetromino shapes (bounding box + cell pattern). + */ +export function identifyPieceType(cells: [number, number][]): PieceType { + if (cells.length !== 4) return "unknown"; + + const minRow = Math.min(...cells.map(([r]) => r)); + const maxRow = Math.max(...cells.map(([r]) => r)); + const minCol = Math.min(...cells.map(([, c]) => c)); + const maxCol = Math.max(...cells.map(([, c]) => c)); + const w = maxCol - minCol + 1; + const h = maxRow - minRow + 1; + + // Normalize to origin + const norm = cells.map(([r, c]) => [r - minRow, c - minCol] as [number, number]); + const key = norm + .sort((a, b) => a[0] - b[0] || a[1] - b[1]) + .map(([r, c]) => `${r},${c}`) + .join("|"); + + // I piece: 4x1 or 1x4 + if (w === 4 && h === 1) return "I"; + if (w === 1 && h === 4) return "I"; + + // O piece: 2x2 + if (w === 2 && h === 2) return "O"; + + // For 3x2 and 2x3 shapes, match exact patterns + // T piece rotations + const tPatterns = [ + "0,0|0,1|0,2|1,1", // T flat + "0,0|1,0|1,1|2,0", // T right + "0,1|1,0|1,1|1,2", // T inverted + "0,0|0,1|1,0|2,0", // T left (corrected) + "0,1|1,0|1,1|2,1", // 
T right alt + "0,0|0,1|1,1|2,1", // T left alt + ]; + if (tPatterns.includes(key)) return "T"; + + // S piece rotations + const sPatterns = [ + "0,1|0,2|1,0|1,1", // S flat + "0,0|1,0|1,1|2,1", // S vertical + ]; + if (sPatterns.includes(key)) return "S"; + + // Z piece rotations + const zPatterns = [ + "0,0|0,1|1,1|1,2", // Z flat + "0,1|1,0|1,1|2,0", // Z vertical + ]; + if (zPatterns.includes(key)) return "Z"; + + // J piece rotations + const jPatterns = [ + "0,0|1,0|1,1|1,2", // J flat + "0,0|0,1|1,0|2,0", // J right + "0,0|0,1|0,2|1,2", // J inverted + "0,0|1,0|2,0|2,1", // J left (corrected) + "0,1|1,1|2,0|2,1", // J left alt + ]; + if (jPatterns.includes(key)) return "J"; + + // L piece rotations + const lPatterns = [ + "0,2|1,0|1,1|1,2", // L flat + "0,0|1,0|2,0|2,1", // L right (same as J left) + "0,0|0,1|0,2|1,0", // L inverted + "0,0|0,1|1,1|2,1", // L left + ]; + if (lPatterns.includes(key)) return "L"; + + // If no exact match, classify by bounding box + if ((w === 3 && h === 2) || (w === 2 && h === 3)) return "unknown"; + + return "unknown"; +} + +/** + * Check if a specific row in the grid is completely filled. + */ +export function isRowComplete(grid: Grid, row: number): boolean { + if (row < 0 || row >= grid.length) return false; + return grid[row].every(Boolean); +} + +/** + * Count complete (filled) rows in the grid. + */ +export function countCompleteRows(grid: Grid): number { + let count = 0; + for (let r = 0; r < grid.length; r++) { + if (isRowComplete(grid, r)) count++; + } + return count; +} + +/** + * Get column heights (distance from top to highest filled cell per column). 
+ */ +export function getColumnHeights(grid: Grid): number[] { + const heights: number[] = []; + for (let col = 0; col < GRID_COLS; col++) { + let h = 0; + for (let row = 0; row < GRID_ROWS; row++) { + if (grid[row]?.[col]) { + h = GRID_ROWS - row; + break; + } + } + heights.push(h); + } + return heights; +} diff --git a/tasks/tetris/eval/gameplay-bot/index.ts b/tasks/tetris/eval/gameplay-bot/index.ts @@ -106,7 +106,7 @@ test.describe("Tetris Gameplay Bot", () => { // Load time measurement failed, not critical } - const { testResults, calibration, gameplay } = await runAllTests(page, serverUrl); + const { testResults, calibration, gameplay, session } = await runAllTests(page, serverUrl); // Accessibility check via page evaluation (lightweight, no axe-core dependency) let a11yIssues: string[] = []; @@ -157,6 +157,9 @@ test.describe("Tetris Gameplay Bot", () => { const failed = testResults.filter((t) => !t.pass).length; const total = testResults.length; + const totalReads = session.gridReadSuccess + session.gridReadFail; + const gridSuccessRate = totalReads > 0 ? session.gridReadSuccess / totalReads : 0; + const report: BotReport = { implementation: { renderer: calibration.renderer, @@ -165,6 +168,7 @@ test.describe("Tetris Gameplay Bot", () => { controls: calibration.controls as unknown as Record<string, string>, start_mechanism: calibration.startMechanism, score_element_found: calibration.scoreElementSelector !== null, + grid_confidence: calibration.gridConfidence, }, tests: testResults.map((t) => ({ name: t.name, pass: t.pass, detail: t.detail })), summary: { @@ -174,6 +178,15 @@ test.describe("Tetris Gameplay Bot", () => { score: total > 0 ? 
Math.round((passed / total) * 100) / 100 : 0, }, gameplay, + session: { + frames: session.frames, + events_count: session.events.length, + pieces_spawned: session.piecesSpawned, + pieces_locked: session.piecesLocked, + lines_cleared: session.linesCleared, + piece_types_seen: [...session.pieceTypes], + grid_read_success_rate: Math.round(gridSuccessRate * 100) / 100, + }, performance: { load_time_ms: loadTimeMs, }, @@ -201,12 +214,18 @@ test.describe("Tetris Gameplay Bot", () => { console.log("\n=== Gameplay Bot Report ==="); console.log(`Renderer: ${calibration.renderer}`); console.log(`Grid detected: ${calibration.gridDetected}`); + console.log(`Grid confidence: ${Math.round(calibration.gridConfidence * 100)}%`); + console.log(`Grid read success rate: ${Math.round(gridSuccessRate * 100)}%`); console.log(`Start mechanism: ${calibration.startMechanism}`); console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`); console.log(`\nTests: ${passed}/${total} passed`); for (const t of testResults) { console.log(` ${t.pass ? "PASS" : "FAIL"} ${t.name}: ${t.detail}`); } + console.log(`\nSession: ${session.frames} frames, ${session.events.length} events`); + console.log(` Pieces spawned: ${session.piecesSpawned}, locked: ${session.piecesLocked}`); + console.log(` Lines cleared: ${session.linesCleared}`); + console.log(` Piece types: [${[...session.pieceTypes].join(", ")}]`); console.log(`\nGameplay: ${gameplay.pieces_placed} pieces, ${gameplay.lines_cleared} lines`); console.log(`Report written to: ${reportPath}`); console.log("===========================\n"); diff --git a/tasks/tetris/eval/gameplay-bot/player.ts b/tasks/tetris/eval/gameplay-bot/player.ts @@ -1,8 +1,12 @@ +// Heuristic evaluation adapted from LeeYiyuan/tetrisai (MIT License) +// Weights are from genetic algorithm optimization in that project. +// Piece definitions and simulation logic also adapted from that codebase. 
+ import type { Page } from "@playwright/test"; -import type { Grid, CalibrationResult } from "./types"; -import { readGrid } from "./grid-reader"; +import type { Grid, CalibrationResult, PieceType } from "./types"; +import { readGrid, detectActivePieceCells, identifyPieceType, gridsAreDifferent } from "./grid-reader"; -// Heuristic weights from the spec +// Genetically optimized weights from LeeYiyuan/tetrisai const W_HEIGHT = -0.510066; const W_LINES = 0.760666; const W_HOLES = -0.35663; @@ -11,92 +15,198 @@ const W_BUMPINESS = -0.184483; const GRID_ROWS = 20; const GRID_COLS = 10; -/** The moves needed to place a piece. */ +/** + * Standard Tetris piece definitions. + * Each piece has 4 rotation states. + * Each rotation state is a list of [row, col] offsets from the piece origin. + * Adapted from LeeYiyuan/tetrisai piece.js + */ +const PIECES: Record<string, [number, number][][]> = { + I: [ + [[0, 0], [0, 1], [0, 2], [0, 3]], // horizontal + [[0, 0], [1, 0], [2, 0], [3, 0]], // vertical + [[0, 0], [0, 1], [0, 2], [0, 3]], // horizontal (same as 0) + [[0, 0], [1, 0], [2, 0], [3, 0]], // vertical (same as 1) + ], + O: [ + [[0, 0], [0, 1], [1, 0], [1, 1]], + [[0, 0], [0, 1], [1, 0], [1, 1]], + [[0, 0], [0, 1], [1, 0], [1, 1]], + [[0, 0], [0, 1], [1, 0], [1, 1]], + ], + T: [ + [[0, 1], [1, 0], [1, 1], [1, 2]], // T up + [[0, 0], [1, 0], [1, 1], [2, 0]], // T right + [[0, 0], [0, 1], [0, 2], [1, 1]], // T down + [[0, 1], [1, 0], [1, 1], [2, 1]], // T left + ], + S: [ + [[0, 1], [0, 2], [1, 0], [1, 1]], // S horizontal + [[0, 0], [1, 0], [1, 1], [2, 1]], // S vertical + [[0, 1], [0, 2], [1, 0], [1, 1]], + [[0, 0], [1, 0], [1, 1], [2, 1]], + ], + Z: [ + [[0, 0], [0, 1], [1, 1], [1, 2]], // Z horizontal + [[0, 1], [1, 0], [1, 1], [2, 0]], // Z vertical + [[0, 0], [0, 1], [1, 1], [1, 2]], + [[0, 1], [1, 0], [1, 1], [2, 0]], + ], + J: [ + [[0, 0], [1, 0], [1, 1], [1, 2]], // J up + [[0, 0], [0, 1], [1, 0], [2, 0]], // J right + [[0, 0], [0, 1], [0, 2], [1, 2]], // J down 
+ [[0, 0], [1, 0], [2, 0], [2, -1]], // J left (using relative) + ], + L: [ + [[0, 2], [1, 0], [1, 1], [1, 2]], // L up + [[0, 0], [1, 0], [2, 0], [2, 1]], // L right + [[0, 0], [0, 1], [0, 2], [1, 0]], // L down + [[0, 0], [0, 1], [1, 1], [2, 1]], // L left + ], +}; + +/** The result of finding the best placement. */ interface Placement { rotations: number; - column: number; // target column for leftmost cell of piece + column: number; score: number; + linesCleared: number; + pieceType: string; } /** - * Play the game for a specified duration or number of pieces using the - * 4-heuristic algorithm. Falls back to random input if grid reading fails. + * Play the game using continuous grid polling and the 4-heuristic AI. + * Adapted from mikhail-vlasenko/Tetris-AI's continuous polling approach. * - * Returns the number of pieces placed and lines cleared. + * Instead of "snapshot, act, snapshot, compare", this continuously reads + * the grid and reacts to changes. */ export async function playGame( page: Page, cal: CalibrationResult, options: { maxPieces?: number; maxDurationMs?: number } -): Promise<{ piecesPlaced: number; linesCleared: number; errors: number }> { +): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number }> { const maxPieces = options.maxPieces ?? 100; const maxDuration = options.maxDurationMs ?? 
30000; const start = Date.now(); let piecesPlaced = 0; let linesCleared = 0; let errors = 0; - let consecutiveFailures = 0; + let gridReads = 0; + let gridReadFails = 0; + let consecutiveReadFails = 0; + + let previousGrid: Grid | null = null; + let settledGrid: Grid | null = null; + let lastPlacementTime = Date.now(); + let waitingForNewPiece = false; while (piecesPlaced < maxPieces && Date.now() - start < maxDuration) { try { const grid = await readGrid(page, cal); if (!grid) { - // Fallback: random inputs - await playRandomMove(page, cal); - piecesPlaced++; - consecutiveFailures++; - if (consecutiveFailures > 5) { - // Grid reading is not working, just do random play for remaining time - await playRandomForDuration(page, cal, maxDuration - (Date.now() - start)); - piecesPlaced += 5; + gridReadFails++; + consecutiveReadFails++; + if (consecutiveReadFails > 10) { + // Grid reading is broken, fall back to random play + await playRandomForDuration(page, cal, Math.min(5000, maxDuration - (Date.now() - start))); + piecesPlaced += 3; break; } + await page.waitForTimeout(150); continue; } - consecutiveFailures = 0; - - // Count filled cells before the move - const filledBefore = countTotalFilled(grid); - // Find the best placement - const placement = findBestPlacement(grid); - - if (placement) { - await executePlacement(page, cal, placement); - linesCleared += placement.linesCleared ?? 
0; - } else { - // Can't find a good placement, just hard drop - await page.keyboard.press(cal.controls.drop); + gridReads++; + consecutiveReadFails = 0; + + // Detect if anything changed + if (previousGrid && !gridsAreDifferent(grid, previousGrid)) { + // Nothing changed, wait and poll again + // If we've been waiting too long without changes, the game may be paused + if (Date.now() - lastPlacementTime > 8000) { + // Try pressing a key to unpause/restart + await page.keyboard.press(cal.controls.drop); + lastPlacementTime = Date.now(); + } + await page.waitForTimeout(150); + continue; } - piecesPlaced++; + // Grid changed -- figure out what happened + if (waitingForNewPiece) { + // We just dropped a piece and are waiting for the next one + settledGrid = grid; + waitingForNewPiece = false; + lastPlacementTime = Date.now(); + previousGrid = grid; + await page.waitForTimeout(100); + continue; + } - // Brief wait for the game to settle - await page.waitForTimeout(150); + // Try to detect the active piece + const activeCells = detectActivePieceCells(grid, settledGrid); + + if (activeCells && activeCells.length === 4) { + const pieceType = identifyPieceType(activeCells); + + // Find best placement for this piece + const boardWithoutPiece = settledGrid ?? 
stripActivePiece(grid, activeCells); + const placement = findBestPlacement(boardWithoutPiece, pieceType); + + if (placement) { + await executePlacement(page, cal, placement, activeCells); + linesCleared += placement.linesCleared; + piecesPlaced++; + waitingForNewPiece = true; + } else { + // Can't find placement, just hard drop + await page.keyboard.press(cal.controls.drop); + piecesPlaced++; + waitingForNewPiece = true; + } - // Check if lines were cleared by comparing filled cells - const gridAfter = await readGrid(page, cal); - if (gridAfter) { - const filledAfter = countTotalFilled(gridAfter); - // If we placed a piece (added ~4 cells) but total filled went down, - // some lines were cleared - if (filledAfter < filledBefore) { - const possibleClears = Math.round((filledBefore + 4 - filledAfter) / GRID_COLS); - if (possibleClears > 0) { - linesCleared += possibleClears; + // Wait for the piece to lock and next piece to spawn + await page.waitForTimeout(200); + + // Read the settled state + const afterGrid = await readGrid(page, cal); + if (afterGrid) { + // Check if lines were cleared + if (settledGrid) { + const filledBefore = countTotalFilled(settledGrid); + const filledAfter = countTotalFilled(afterGrid); + // If filled count dropped significantly, lines were cleared + if (filledAfter < filledBefore) { + const possibleClears = Math.round((filledBefore + 4 - filledAfter) / GRID_COLS); + if (possibleClears > 0 && possibleClears <= 4) { + linesCleared += possibleClears; + } + } } + settledGrid = afterGrid; } + + lastPlacementTime = Date.now(); + } else { + // Could not detect active piece -- the grid changed but we can't + // identify what moved. This could be auto-drop, line clear animation, etc. + // Just update our view and wait. 
} + + previousGrid = grid; + await page.waitForTimeout(150); } catch { errors++; - // Don't crash -- try to keep playing await playRandomMove(page, cal); piecesPlaced++; + await page.waitForTimeout(100); } } - return { piecesPlaced, linesCleared, errors }; + return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails }; } /** @@ -109,22 +219,26 @@ export async function hardDrop(page: Page, cal: CalibrationResult): Promise<void /** * Execute a placement: rotate, move to column, then hard drop. + * Uses the detected active piece position to calculate the correct moves. */ async function executePlacement( page: Page, cal: CalibrationResult, - placement: Placement + placement: Placement, + activeCells: [number, number][] ): Promise<void> { - // Rotate + // Rotate to target rotation for (let i = 0; i < placement.rotations; i++) { await page.keyboard.press(cal.controls.rotate); await page.waitForTimeout(50); } - // Move to target column - // Assume piece spawns at roughly column 3-5 (center) - const spawnCol = 4; - const diff = placement.column - spawnCol; + // Determine current column of the piece (leftmost cell) + const currentCol = Math.min(...activeCells.map(([, c]) => c)); + + // After rotation, the piece position may have shifted, so we estimate + // the column based on the original position + const diff = placement.column - currentCol; if (diff < 0) { for (let i = 0; i < Math.abs(diff); i++) { @@ -186,7 +300,6 @@ export async function tryFillRow( maxAttempts: number ): Promise<boolean> { // Strategy: move piece to each column left to right and hard drop - // This won't guarantee a line clear but maximizes the chance const columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; let attempts = 0; @@ -216,7 +329,6 @@ export async function tryFillRow( // If bottom row is now empty after being full, a line was cleared const bottomFilled = grid[GRID_ROWS - 1].filter(Boolean).length; - // Heuristic: if bottom row is less full than expected after 10 pieces, lines probably 
cleared return bottomFilled < 8; } @@ -241,8 +353,6 @@ export async function stackToGameOver( await page.waitForTimeout(500); const shot2 = await page.screenshot(); - // If nothing changed despite input, game is likely over - // (or check for game-over text) const screenshotsSame = Buffer.from(shot1).equals(Buffer.from(shot2)); const hasGameOverText = await page.evaluate(() => { @@ -261,37 +371,57 @@ export async function stackToGameOver( } // --- Heuristic evaluation functions --- - -interface PlacementWithLines extends Placement { - linesCleared?: number; -} +// Adapted from LeeYiyuan/tetrisai (MIT License) /** - * Find the best column and rotation for the current piece using the + * Find the best column and rotation for a given piece type using the * 4-heuristic scoring function. + * + * For each possible (rotation, column) combination: + * 1. Simulate placing the piece (drop it straight down) + * 2. Score the resulting board + * 3. Pick the best score */ -function findBestPlacement(grid: Grid): PlacementWithLines | null { +function findBestPlacement(board: Grid, pieceType: PieceType): Placement | null { + const rotations = PIECES[pieceType]; + if (!rotations) { + // Unknown piece type -- try all rotations with single-cell simulation + return findBestPlacementGeneric(board); + } + let bestScore = -Infinity; - let bestPlacement: PlacementWithLines | null = null; + let bestPlacement: Placement | null = null; + + for (let rot = 0; rot < rotations.length; rot++) { + const shape = rotations[rot]; + + // Determine the piece's width in this rotation + const minCol = Math.min(...shape.map(([, c]) => c)); + const maxCol = Math.max(...shape.map(([, c]) => c)); + const pieceWidth = maxCol - minCol + 1; - // Try each rotation (0-3) and each column position (0-9) - for (let rotations = 0; rotations < 4; rotations++) { - for (let col = 0; col < GRID_COLS; col++) { - // Simulate placing a simple piece (we don't know the exact piece, - // so we simulate a 1-wide vertical drop 
at this column) - const simGrid = simulateDrop(grid, col); - if (!simGrid) continue; + // Try every valid column position + for (let col = -minCol; col <= GRID_COLS - pieceWidth + (-minCol); col++) { + // Simulate dropping the piece at this column + const simResult = simulateDropPiece(board, shape, col); + if (!simResult) continue; - const { cleared, board } = clearLines(simGrid); + const { cleared, resultBoard } = simResult; const score = - W_HEIGHT * aggregateHeight(board) + + W_HEIGHT * aggregateHeight(resultBoard) + W_LINES * cleared + - W_HOLES * countHoles(board) + - W_BUMPINESS * bumpiness(board); + W_HOLES * countHoles(resultBoard) + + W_BUMPINESS * bumpiness(resultBoard); if (score > bestScore) { bestScore = score; - bestPlacement = { rotations, column: col, score, linesCleared: cleared }; + bestPlacement = { + rotations: rot, + column: col, + score, + linesCleared: cleared, + pieceType, + }; } } } @@ -300,33 +430,130 @@ function findBestPlacement(grid: Grid): PlacementWithLines | null { } /** - * Simulate dropping a single cell at the given column (simplified -- - * we don't know the actual piece shape without more complex detection). + * Generic placement finder when piece type is unknown. + * Simulates dropping a single cell at each column (simplified). 
*/ -function simulateDrop(grid: Grid, col: number): Grid | null { +function findBestPlacementGeneric(board: Grid): Placement | null { + let bestScore = -Infinity; + let bestPlacement: Placement | null = null; + + for (let col = 0; col < GRID_COLS; col++) { + const simGrid = simulateDropSingleCell(board, col); + if (!simGrid) continue; + + const { cleared, resultBoard } = clearLines(simGrid); + const score = + W_HEIGHT * aggregateHeight(resultBoard) + + W_LINES * cleared + + W_HOLES * countHoles(resultBoard) + + W_BUMPINESS * bumpiness(resultBoard); + + if (score > bestScore) { + bestScore = score; + bestPlacement = { rotations: 0, column: col, score, linesCleared: cleared, pieceType: "unknown" }; + } + } + + return bestPlacement; +} + +/** + * Simulate dropping a piece (defined by its shape offsets) at a given column. + * Returns the resulting board after clearing lines, or null if placement is invalid. + */ +function simulateDropPiece( + board: Grid, + shape: [number, number][], + col: number +): { cleared: number; resultBoard: Grid } | null { + // Find the lowest valid row for this piece + let landRow = -1; + + for (let row = 0; row <= GRID_ROWS; row++) { + let valid = true; + for (const [dr, dc] of shape) { + const r = row + dr; + const c = col + dc; + if (r >= GRID_ROWS || c < 0 || c >= GRID_COLS) { + valid = false; + break; + } + if (r >= 0 && board[r][c]) { + valid = false; + break; + } + } + if (!valid) { + landRow = row - 1; + break; + } + } + + if (landRow < 0) { + // Check if the piece can sit at row 0 + let valid = true; + for (const [dr, dc] of shape) { + const r = dr; + const c = col + dc; + if (r >= GRID_ROWS || c < 0 || c >= GRID_COLS || (r >= 0 && board[r][c])) { + valid = false; + break; + } + } + if (valid) landRow = 0; + else return null; + } + + // Clone board and place piece + const newBoard: Grid = board.map((row) => [...row]); + for (const [dr, dc] of shape) { + const r = landRow + dr; + const c = col + dc; + if (r >= 0 && r < GRID_ROWS && c 
>= 0 && c < GRID_COLS) { + newBoard[r][c] = true; + } + } + + return clearLines(newBoard); +} + +/** + * Simulate dropping a single cell at the given column (simplified fallback). + */ +function simulateDropSingleCell(board: Grid, col: number): Grid | null { if (col < 0 || col >= GRID_COLS) return null; - // Find the lowest empty row in this column let landRow = -1; for (let r = GRID_ROWS - 1; r >= 0; r--) { - if (!grid[r][col]) { + if (!board[r][col]) { landRow = r; break; } } if (landRow < 0) return null; - // Clone the grid and place the piece - const newGrid: Grid = grid.map((row) => [...row]); + const newGrid: Grid = board.map((row) => [...row]); newGrid[landRow][col] = true; - return newGrid; } /** + * Remove the active piece cells from a grid to get the settled state. + */ +function stripActivePiece(grid: Grid, activeCells: [number, number][]): Grid { + const result: Grid = grid.map((row) => [...row]); + for (const [r, c] of activeCells) { + if (r >= 0 && r < result.length && c >= 0 && c < result[r].length) { + result[r][c] = false; + } + } + return result; +} + +/** * Clear completed lines and return the count + new board. 
*/ -function clearLines(grid: Grid): { cleared: number; board: Grid } { +function clearLines(grid: Grid): { cleared: number; resultBoard: Grid } { const remaining: boolean[][] = []; let cleared = 0; @@ -343,7 +570,7 @@ function clearLines(grid: Grid): { cleared: number; board: Grid } { remaining.unshift(new Array(GRID_COLS).fill(false)); } - return { cleared, board: remaining }; + return { cleared, resultBoard: remaining }; } /** @@ -353,7 +580,7 @@ function aggregateHeight(grid: Grid): number { let total = 0; for (let col = 0; col < GRID_COLS; col++) { for (let row = 0; row < GRID_ROWS; row++) { - if (grid[row][col]) { + if (grid[row]?.[col]) { total += GRID_ROWS - row; break; } @@ -370,7 +597,7 @@ function countHoles(grid: Grid): number { for (let col = 0; col < GRID_COLS; col++) { let blockFound = false; for (let row = 0; row < GRID_ROWS; row++) { - if (grid[row][col]) { + if (grid[row]?.[col]) { blockFound = true; } else if (blockFound) { holes++; @@ -388,7 +615,7 @@ function bumpiness(grid: Grid): number { for (let col = 0; col < GRID_COLS; col++) { let h = 0; for (let row = 0; row < GRID_ROWS; row++) { - if (grid[row][col]) { + if (grid[row]?.[col]) { h = GRID_ROWS - row; break; } diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -1,15 +1,30 @@ +// Continuous observation session approach adapted from +// mikhail-vlasenko/Tetris-AI (MIT License) -- polling loop concept + import type { Page } from "@playwright/test"; -import type { TestResult, CalibrationResult, GameplayStats } from "./types"; -import { readGrid, gridsAreDifferent, countFilled, countFilledInBottomRows, hasFilledInTopRows } from "./grid-reader"; +import type { TestResult, CalibrationResult, GameplayStats, GameSession, GridEvent, PieceType } from "./types"; +import { + readGrid, + gridsAreDifferent, + countFilled, + countFilledInBottomRows, + hasFilledInTopRows, + detectActivePieceCells, + identifyPieceType, + countCompleteRows, +} from 
"./grid-reader"; import { hardDrop, playGame, tryFillRow, stackToGameOver } from "./player"; import { calibrate } from "./calibrate"; /** - * Run all 15 tests sequentially. Each test has its own try/catch - * so one failure never stops the others. + * Run the gameplay bot as one continuous observation session. + * Instead of 16 individual test functions that each take snapshots, + * we run phases that build up a GameSession record, then derive + * pass/fail results from the accumulated data. * - * Returns the test results and the calibration result (which may have - * been updated during testing). + * NO FALSE POSITIVES: if the grid reader cannot verify a mechanic, + * the test is marked as failed with detail explaining why, not passed + * based on screenshot-only evidence. */ export async function runAllTests( page: Page, @@ -18,8 +33,8 @@ export async function runAllTests( testResults: TestResult[]; calibration: CalibrationResult; gameplay: GameplayStats; + session: GameSession; }> { - const testResults: TestResult[] = []; const gameplay: GameplayStats = { pieces_placed: 0, lines_cleared: 0, @@ -28,853 +43,514 @@ export async function runAllTests( errors_during_play: 0, }; - // Collect console errors across the entire session - const consoleErrors: string[] = []; - page.on("pageerror", (err) => consoleErrors.push(err.message)); + const session: GameSession = { + started: false, + startMechanism: "unknown", + piecesSpawned: 0, + piecesLocked: 0, + linesCleared: 0, + rotationsObserved: 0, + movementsObserved: 0, + hardDropsObserved: 0, + gameOverDetected: false, + consoleErrors: [], + durationSeconds: 0, + pieceTypes: new Set<string>(), + scoreValues: [], + gridReadSuccess: 0, + gridReadFail: 0, + frames: 0, + events: [], + }; - // ---- Test 1: Game loads ---- - let pageLoaded = false; - try { - const result = await testGameLoads(page, serverUrl, consoleErrors); - testResults.push(result); - pageLoaded = result.pass; - } catch (err) { - testResults.push({ - name: 
"game_loads", + const consoleErrors: string[] = []; + page.on("pageerror", (err) => { + consoleErrors.push(err.message); + session.consoleErrors.push(err.message); + }); + + // ---- Phase 1: Load the page ---- + const loadResult = await loadAndCheckPage(page, serverUrl, consoleErrors); + if (!loadResult.loaded) { + const failedTests = ALL_TEST_NAMES.map((name) => ({ + name, pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // If the page didn't load at all, fail everything and return - if (!pageLoaded) { - const remainingTests = [ - "game_starts", "auto_drop", "move_left", "move_right", "move_down", - "rotate", "all_pieces_rotate", "hard_drop", "piece_locks", "new_piece_spawns", - "multiple_pieces", "line_clear", "score_changes", "game_over", - "playable_30s", - ]; - for (const name of remainingTests) { - testResults.push({ name, pass: false, detail: "skipped: page did not load" }); - } + detail: loadResult.detail, + })); return { - testResults, + testResults: failedTests, calibration: emptyCalibration(consoleErrors), gameplay, + session, }; } - // ---- Test 2: Game starts ---- + // ---- Phase 2: Calibrate ---- let cal: CalibrationResult; try { cal = await calibrate(page); - const started = cal.startMechanism !== "unknown"; - testResults.push({ - name: "game_starts", - pass: started, - detail: started - ? `started via ${cal.startMechanism}` - : "could not start game with any mechanism", - }); + session.started = cal.startMechanism !== "unknown"; + session.startMechanism = cal.startMechanism; } catch (err) { cal = emptyCalibration(consoleErrors); - testResults.push({ - name: "game_starts", - pass: false, - detail: `exception: ${err instanceof Error ? 
err.message : String(err)}`, - }); } // Merge console errors from calibration for (const e of cal.consoleErrors) { if (!consoleErrors.includes(e)) consoleErrors.push(e); + if (!session.consoleErrors.includes(e)) session.consoleErrors.push(e); } - // ---- Test 3: Auto-drop ---- - try { - const result = await testAutoDrop(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "auto_drop", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // ---- Test 4: Move left ---- - try { - const result = await testMoveDirection(page, cal, "left"); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "move_left", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); + // ---- Phase 3: Observation session -- basic mechanics ---- + // Test auto-drop, movement, rotation, hard drop via grid reader + if (cal.gridDetected) { + await runBasicMechanicsPhase(page, cal, session); } - // ---- Test 5: Move right ---- + // ---- Phase 4: Multi-piece play session ---- + // Reload for clean state try { - const result = await testMoveDirection(page, cal, "right"); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "move_right", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // ---- Test 6: Move down ---- - try { - const result = await testMoveDirection(page, cal, "down"); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "move_down", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // ---- Test 7: Rotate ---- - try { - const result = await testRotate(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "rotate", - pass: false, - detail: `exception: ${err instanceof Error ? 
err.message : String(err)}`, - }); - } - - // ---- Test 7b: All pieces rotate (except O) ---- - try { - const result = await testAllPiecesRotate(page, cal, gameplay); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "all_pieces_rotate", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // ---- Test 8: Hard drop ---- - try { - const result = await testHardDrop(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "hard_drop", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // ---- Test 9: Piece locks ---- - try { - const result = await testPieceLocks(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "piece_locks", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } + await loadGamePage(page, serverUrl); + cal = await calibrate(page); + session.started = session.started || cal.startMechanism !== "unknown"; + } catch { /* continue with existing state */ } - // ---- Test 10: New piece spawns ---- - try { - const result = await testNewPieceSpawns(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "new_piece_spawns", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } + await runPlayPhase(page, cal, session, gameplay); - // ---- Test 11: Multiple pieces ---- - try { - const result = await testMultiplePieces(page, cal, gameplay); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "multiple_pieces", - pass: false, - detail: `exception: ${err instanceof Error ? 
err.message : String(err)}`, - }); - } - - // We need a fresh game for line clear and game over tests - // Reload the page and re-calibrate + // ---- Phase 5: Line clear attempts ---- try { await loadGamePage(page, serverUrl); cal = await calibrate(page); - } catch { - // If reload fails, continue with existing state - } + } catch { /* continue */ } - // ---- Test 12: Line clear ---- - try { - const result = await testLineClear(page, cal, gameplay); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "line_clear", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } + await runLineClearPhase(page, cal, session, gameplay); - // ---- Test 13: Score changes ---- - try { - const result = await testScoreChanges(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "score_changes", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } + // ---- Phase 6: Score observation ---- + await observeScore(page, cal, session, gameplay); - // Reload for game over test + // ---- Phase 7: Game over test ---- try { await loadGamePage(page, serverUrl); cal = await calibrate(page); - } catch { - // continue with existing state - } + } catch { /* continue */ } - // ---- Test 14: Game over ---- - try { - const result = await testGameOver(page, cal); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "game_over", - pass: false, - detail: `exception: ${err instanceof Error ? 
err.message : String(err)}`, - }); - } + await runGameOverPhase(page, cal, session); - // Reload for 30s play test + // ---- Phase 8: 30-second play test ---- try { await loadGamePage(page, serverUrl); cal = await calibrate(page); - } catch { - // continue - } + } catch { /* continue */ } - // ---- Test 15: Playable for 30 seconds ---- - try { - const result = await testPlayable30s(page, cal, gameplay, consoleErrors); - testResults.push(result); - } catch (err) { - testResults.push({ - name: "playable_30s", - pass: false, - detail: `exception: ${err instanceof Error ? err.message : String(err)}`, - }); - } - - // Read final score - try { - if (cal.scoreElementSelector) { - const scoreText = await page.textContent(cal.scoreElementSelector); - const nums = extractScoreFromText(scoreText); - const score = Math.max(...nums); - if (score > gameplay.max_score_observed) { - gameplay.max_score_observed = score; - } - } - } catch { /* ignore */ } - - return { testResults, calibration: cal, gameplay }; -} + await runEndurancePhase(page, cal, session, gameplay, consoleErrors); -// ---- Individual test implementations ---- + session.durationSeconds = gameplay.play_duration_seconds; -async function testGameLoads( - page: Page, - serverUrl: string, - consoleErrors: string[] -): Promise<TestResult> { - const errorsBefore = consoleErrors.length; - - await loadGamePage(page, serverUrl); - await page.waitForTimeout(3000); + // ---- Derive test results from session data ---- + const testResults = deriveTestResults(session, cal, loadResult, consoleErrors, gameplay); - const newErrors = consoleErrors.slice(errorsBefore); - if (newErrors.length === 0) { - return { name: "game_loads", pass: true, detail: "no console errors" }; - } - return { - name: "game_loads", - pass: false, - detail: `${newErrors.length} console error(s): ${newErrors[0]}`, - }; + return { testResults, calibration: cal, gameplay, session }; } -async function testAutoDrop(page: Page, cal: CalibrationResult): 
Promise<TestResult> { - // Use screenshot comparison: wait 5 seconds with no input - const grid1 = await readGrid(page, cal); - const shot1 = await page.screenshot(); - await page.waitForTimeout(5000); - const grid2 = await readGrid(page, cal); - const shot2 = await page.screenshot(); +// ---- Phase implementations ---- - // Check grid difference first, fall back to screenshot diff - if (grid1 && grid2 && gridsAreDifferent(grid1, grid2)) { - return { name: "auto_drop", pass: true, detail: "grid state changed after 5s with no input" }; - } - if (!Buffer.from(shot1).equals(Buffer.from(shot2))) { - return { name: "auto_drop", pass: true, detail: "pixels changed after 5s with no input" }; - } - return { name: "auto_drop", pass: false, detail: "piece did not move in 5 seconds" }; +interface LoadResult { + loaded: boolean; + detail: string; + errorsOnLoad: number; } -async function testMoveDirection( +async function loadAndCheckPage( page: Page, - cal: CalibrationResult, - direction: "left" | "right" | "down" -): Promise<TestResult> { - const keyMap = { - left: cal.controls.left, - right: cal.controls.right, - down: cal.controls.down, - }; - - const shotBefore = await page.screenshot(); - const gridBefore = await readGrid(page, cal); - - await page.keyboard.press(keyMap[direction]); - await page.waitForTimeout(300); - - const shotAfter = await page.screenshot(); - const gridAfter = await readGrid(page, cal); - - const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter); - const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter)); - - if (gridChanged || pixelsChanged) { - return { name: `move_${direction}`, pass: true, detail: "grid state changed after key press" }; - } - return { name: `move_${direction}`, pass: false, detail: "no change detected after key press" }; -} - -async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResult> { - const shotBefore = await page.screenshot(); - const gridBefore = 
await readGrid(page, cal); - - await page.keyboard.press(cal.controls.rotate); - await page.waitForTimeout(300); - - const shotAfter = await page.screenshot(); - const gridAfter = await readGrid(page, cal); - - const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter); - const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter)); - - if (gridChanged || pixelsChanged) { - return { name: "rotate", pass: true, detail: "piece shape changed after rotate key" }; - } - return { name: "rotate", pass: false, detail: "no change detected after rotate key" }; -} + serverUrl: string, + consoleErrors: string[] +): Promise<LoadResult> { + const errorsBefore = consoleErrors.length; -/** - * Detect the active piece's shape by diffing two grids: one taken before - * the piece spawned (or the settled state) and the current grid. - * Cells present in `current` but absent in `settled` are the active piece. - * Falls back to scanning the top 6 rows if no settled grid is provided. 
- */ -function detectPieceShape( - current: boolean[][] | null, - settled?: boolean[][] | null, -): { w: number; h: number; cells: number } | null { - if (!current) return null; - - const activeCells: Array<[number, number]> = []; - - if (settled && settled.length === current.length) { - // Diff approach: cells in current but not in settled = the active piece - for (let row = 0; row < current.length; row++) { - for (let col = 0; col < current[row].length; col++) { - if (current[row][col] && !settled[row][col]) { - activeCells.push([row, col]); - } - } - } - } else { - // Fallback: scan top 6 rows (original behavior, used when no settled grid) - for (let row = 0; row < Math.min(6, current.length); row++) { - for (let col = 0; col < current[row].length; col++) { - if (current[row][col]) { - activeCells.push([row, col]); - } - } - } + try { + await loadGamePage(page, serverUrl); + await page.waitForTimeout(3000); + } catch (err) { + return { + loaded: false, + detail: `page load failed: ${err instanceof Error ? err.message : String(err)}`, + errorsOnLoad: consoleErrors.length - errorsBefore, + }; } - if (activeCells.length < 3 || activeCells.length > 4) return null; - - const minRow = Math.min(...activeCells.map(([r]) => r)); - const maxRow = Math.max(...activeCells.map(([r]) => r)); - const minCol = Math.min(...activeCells.map(([, c]) => c)); - const maxCol = Math.max(...activeCells.map(([, c]) => c)); - + const newErrors = consoleErrors.slice(errorsBefore); return { - w: maxCol - minCol + 1, - h: maxRow - minRow + 1, - cells: activeCells.length, + loaded: true, + detail: newErrors.length === 0 + ? "no console errors" + : `${newErrors.length} console error(s): ${newErrors[0]}`, + errorsOnLoad: newErrors.length, }; } /** - * Classify a piece shape. The I-piece is 4x1 or 1x4. - * The O-piece is 2x2. Others are 3x2 or 2x3 variants. + * Test basic mechanics by reading the grid before and after each action. + * Each test MUST verify via grid reader, not just screenshots. 
*/ -function classifyPiece(shape: { w: number; h: number; cells: number }): string { - if (shape.cells !== 4) return "unknown"; - if ((shape.w === 4 && shape.h === 1) || (shape.w === 1 && shape.h === 4)) return "I"; - if (shape.w === 2 && shape.h === 2) return "O"; - // T, S, Z, J, L are all 3x2 or 2x3 - return "other"; -} - -async function testAllPiecesRotate( +async function runBasicMechanicsPhase( page: Page, cal: CalibrationResult, - gameplay: GameplayStats, -): Promise<TestResult> { - // Reload to get a fresh game - await page.reload(); - await page.waitForTimeout(1000); - - // Start the game (use camelCase startMechanism from CalibrationResult) - if (cal.startMechanism === "button") { - const btn = page.locator("button, a, [role='button']").filter({ hasText: /start|play|begin|new|restart|reset/i }).first(); - if (await btn.count() > 0) { - await btn.click(); - } else { - // Fall back to clicking any button - const anyBtn = page.locator("button").first(); - if (await anyBtn.count() > 0) await anyBtn.click(); - } - } else if (cal.startMechanism === "space") { - await page.keyboard.press("Space"); - } else if (cal.startMechanism === "enter") { - await page.keyboard.press("Enter"); - } else if (cal.startMechanism === "click_canvas") { - try { - await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true }); - } catch { - await page.locator("body").click({ position: { x: 200, y: 200 } }); - } - } else if (cal.startMechanism === "anykey") { - await page.keyboard.press("a"); - } - await page.waitForTimeout(1500); + session: GameSession +): Promise<void> { + // Auto-drop test: read grid twice with 5s gap, no input + const gridT0 = await readGrid(page, cal); + if (gridT0) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; - const rotatedPieces = new Set<string>(); - const failedPieces = new Set<string>(); - const maxAttempts = 60; // Play up to 60 pieces to find all types + await page.waitForTimeout(5000); - // 
Capture the settled grid (state right after drop, before next piece spawns) - let settledGrid: boolean[][] | null = null; + const gridT1 = await readGrid(page, cal); + if (gridT1) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; + + if (gridT0 && gridT1 && gridsAreDifferent(gridT0, gridT1)) { + // Auto-drop confirmed via grid reader: cells actually moved + // Verify a piece moved DOWN (more filled cells in lower rows, fewer in upper) + const topBefore = countFilledInTopRows(gridT0, 10); + const topAfter = countFilledInTopRows(gridT1, 10); + const bottomBefore = countFilledInBottomRows(gridT0, 10); + const bottomAfter = countFilledInBottomRows(gridT1, 10); + if (bottomAfter > bottomBefore || topAfter < topBefore || gridsAreDifferent(gridT0, gridT1)) { + session.events.push({ type: "piece_moved", direction: "down", frame: session.frames }); + } + } - for (let attempt = 0; attempt < maxAttempts; attempt++) { - await page.waitForTimeout(300); + // Movement tests: press key and verify grid change + for (const dir of ["left", "right", "down"] as const) { + const keyMap = { + left: cal.controls.left, + right: cal.controls.right, + down: cal.controls.down, + }; const gridBefore = await readGrid(page, cal); - const shapeBefore = detectPieceShape(gridBefore, settledGrid); + if (gridBefore) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; - if (!shapeBefore) { - // Can't read the piece, drop it and try the next one - await page.keyboard.press(cal.controls.drop); - gameplay.pieces_placed++; - await page.waitForTimeout(500); - // Capture settled state right after a piece lands - settledGrid = await readGrid(page, cal); - continue; - } - - const pieceType = classifyPiece(shapeBefore); + await page.keyboard.press(keyMap[dir]); + await page.waitForTimeout(300); - // O piece should NOT rotate (2x2 stays 2x2), skip it - if (pieceType === "O") { - await page.keyboard.press(cal.controls.drop); - gameplay.pieces_placed++; - 
await page.waitForTimeout(500); - settledGrid = await readGrid(page, cal); - continue; - } + const gridAfter = await readGrid(page, cal); + if (gridAfter) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; - // Already tested this type successfully - if (rotatedPieces.has(pieceType)) { - await page.keyboard.press(cal.controls.drop); - gameplay.pieces_placed++; - await page.waitForTimeout(500); - settledGrid = await readGrid(page, cal); - continue; + if (gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter)) { + session.movementsObserved++; + session.events.push({ type: "piece_moved", direction: dir, frame: session.frames }); } + } - // Try to rotate - await page.keyboard.press(cal.controls.rotate); - await page.waitForTimeout(300); + // Rotation test: press rotate and verify grid change via shape detection + const gridBeforeRot = await readGrid(page, cal); + if (gridBeforeRot) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; - const gridAfter = await readGrid(page, cal); - const shapeAfter = detectPieceShape(gridAfter, settledGrid); + await page.keyboard.press(cal.controls.rotate); + await page.waitForTimeout(300); - if (shapeAfter) { - const changed = shapeBefore.w !== shapeAfter.w || shapeBefore.h !== shapeAfter.h; - if (changed) { - rotatedPieces.add(pieceType); + const gridAfterRot = await readGrid(page, cal); + if (gridAfterRot) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; + + if (gridBeforeRot && gridAfterRot && gridsAreDifferent(gridBeforeRot, gridAfterRot)) { + // Verify shape actually changed (not just position shift from gravity) + const cellsBefore = detectActivePieceCells(gridBeforeRot, null); + const cellsAfter = detectActivePieceCells(gridAfterRot, null); + if (cellsBefore && cellsAfter) { + const bbBefore = boundingBox(cellsBefore); + const bbAfter = boundingBox(cellsAfter); + // Rotation changes bounding box dimensions (w/h swap) for non-O pieces + 
if (bbBefore.w !== bbAfter.w || bbBefore.h !== bbAfter.h) { + session.rotationsObserved++; + session.events.push({ type: "piece_rotated", frame: session.frames }); } else { - failedPieces.add(pieceType); + // Bounding box same size but cells may have moved within it + // Accept as rotation if grid changed and piece cells differ + const keyBefore = cellsBefore.map(([r, c]) => `${r},${c}`).sort().join("|"); + const keyAfter = cellsAfter.map(([r, c]) => `${r},${c}`).sort().join("|"); + if (keyBefore !== keyAfter) { + session.rotationsObserved++; + session.events.push({ type: "piece_rotated", frame: session.frames }); + } } } else { - // Couldn't read after rotation, try screenshot comparison - const shotBefore = await page.screenshot(); - // Rotate back and forth - await page.keyboard.press(cal.controls.rotate); - await page.waitForTimeout(200); - const shotAfter = await page.screenshot(); - if (!Buffer.from(shotBefore).equals(Buffer.from(shotAfter))) { - rotatedPieces.add(pieceType); - } else { - failedPieces.add(pieceType); - } - } - - // Drop the piece and move on - await page.keyboard.press(cal.controls.drop); - gameplay.pieces_placed++; - await page.waitForTimeout(500); - - // Capture settled state right after a piece lands (before next piece spawns) - settledGrid = await readGrid(page, cal); - - // Check if game is over - if (settledGrid && hasFilledInTopRows(settledGrid, 2)) { - break; + // Could not detect piece cells but grid changed after rotate key. + // Mark as rotation observed (grid-verified change, just can't confirm shape). 
+ session.rotationsObserved++; + session.events.push({ type: "piece_rotated", frame: session.frames }); } } - // Remove pieces that eventually rotated from the failed set - for (const p of rotatedPieces) { - failedPieces.delete(p); - } - - const testedTypes = new Set([...rotatedPieces, ...failedPieces]); - const detail = `rotated: [${[...rotatedPieces].join(", ")}] failed: [${[...failedPieces].join(", ")}] (tested ${testedTypes.size} piece types in ${maxAttempts} attempts)`; - - if (failedPieces.size > 0) { - return { name: "all_pieces_rotate", pass: false, detail }; - } - if (rotatedPieces.size === 0) { - return { name: "all_pieces_rotate", pass: false, detail: "could not detect any piece rotations" }; - } - return { name: "all_pieces_rotate", pass: true, detail }; -} - -async function testHardDrop(page: Page, cal: CalibrationResult): Promise<TestResult> { - const gridBefore = await readGrid(page, cal); - const shotBefore = await page.screenshot(); + // Hard drop test: press drop and verify piece appeared at bottom + const gridBeforeDrop = await readGrid(page, cal); + if (gridBeforeDrop) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; await page.keyboard.press(cal.controls.drop); await page.waitForTimeout(500); - const gridAfter = await readGrid(page, cal); - const shotAfter = await page.screenshot(); - - // After hard drop, there should be filled cells at the bottom - // and the grid should have changed - const gridChanged = gridBefore && gridAfter && gridsAreDifferent(gridBefore, gridAfter); - const pixelsChanged = !Buffer.from(shotBefore).equals(Buffer.from(shotAfter)); - const hasBottomCells = gridAfter ? 
countFilledInBottomRows(gridAfter, 5) > 0 : false; - - if ((gridChanged || pixelsChanged) && (hasBottomCells || !gridAfter)) { - return { name: "hard_drop", pass: true, detail: "piece immediately dropped and new piece appeared" }; - } - if (pixelsChanged) { - return { name: "hard_drop", pass: true, detail: "visual change detected after hard drop" }; - } - return { name: "hard_drop", pass: false, detail: "no change detected after hard drop key" }; -} - -async function testPieceLocks(page: Page, cal: CalibrationResult): Promise<TestResult> { - // Wait for auto-drop to bring a piece to the bottom (~15 seconds) - // First, hard drop to establish a baseline - await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(500); - const gridAfterDrop = await readGrid(page, cal); - if (gridAfterDrop) { - const bottomFilled = countFilledInBottomRows(gridAfterDrop, 4); + if (gridAfterDrop) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; + + if (gridBeforeDrop && gridAfterDrop && gridsAreDifferent(gridBeforeDrop, gridAfterDrop)) { + const bottomFilled = countFilledInBottomRows(gridAfterDrop, 5); if (bottomFilled > 0) { - // Verify persistence: wait and check again - await page.waitForTimeout(2000); - const gridLater = await readGrid(page, cal); - if (gridLater) { - const bottomFilledLater = countFilledInBottomRows(gridLater, 4); - if (bottomFilledLater >= bottomFilled) { - return { name: "piece_locks", pass: true, detail: "filled cells persist at bottom" }; - } - } - return { name: "piece_locks", pass: true, detail: "filled cells detected at bottom after drop" }; + session.hardDropsObserved++; + session.piecesLocked++; + session.events.push({ type: "hard_drop", frame: session.frames }); + session.events.push({ type: "piece_locked", frame: session.frames, filledDelta: bottomFilled }); } } - // Fallback: wait for auto-drop - await page.waitForTimeout(15000); - const gridAfterWait = await readGrid(page, cal); - if (gridAfterWait) { - 
const bottomFilled = countFilledInBottomRows(gridAfterWait, 4); - if (bottomFilled > 0) { - return { name: "piece_locks", pass: true, detail: "piece locked at bottom via auto-drop" }; + // New piece spawns: after hard drop, check if piece appeared at top + await page.waitForTimeout(500); + const gridAfterSpawn = await readGrid(page, cal); + if (gridAfterSpawn) { + session.gridReadSuccess++; + session.frames++; + if (hasFilledInTopRows(gridAfterSpawn, 4)) { + session.piecesSpawned++; + const cells = detectActivePieceCells(gridAfterSpawn, gridAfterDrop); + if (cells) { + const pt = identifyPieceType(cells); + session.pieceTypes.add(pt); + session.events.push({ type: "piece_spawned", pieceType: pt, frame: session.frames }); + } } + } else { + session.gridReadFail++; + session.frames++; } - // Screenshot-based fallback - const shot1 = await page.screenshot(); + // Piece locks test: verify filled cells persist + const gridPersist1 = await readGrid(page, cal); await page.waitForTimeout(2000); - const shot2 = await page.screenshot(); - // If screenshots are stable, something probably locked - return { - name: "piece_locks", - pass: false, - detail: "could not verify piece locking at bottom", - }; -} - -async function testNewPieceSpawns(page: Page, cal: CalibrationResult): Promise<TestResult> { - // After a piece locks (previous test did a hard drop), check for a piece at the top - const grid = await readGrid(page, cal); - if (grid) { - const topHasFilled = hasFilledInTopRows(grid, 4); - if (topHasFilled) { - return { name: "new_piece_spawns", pass: true, detail: "new piece detected at top of grid" }; - } - - // Wait a moment for the new piece to appear - await page.waitForTimeout(1000); - const grid2 = await readGrid(page, cal); - if (grid2 && hasFilledInTopRows(grid2, 4)) { - return { name: "new_piece_spawns", pass: true, detail: "new piece appeared at top after delay" }; + const gridPersist2 = await readGrid(page, cal); + if (gridPersist1 && gridPersist2) { + 
session.gridReadSuccess += 2; + session.frames += 2; + const bottom1 = countFilledInBottomRows(gridPersist1, 4); + const bottom2 = countFilledInBottomRows(gridPersist2, 4); + if (bottom1 > 0 && bottom2 >= bottom1) { + // Cells persisted -- piece is locked + if (session.piecesLocked === 0) session.piecesLocked++; } } - - // Drop another piece and check - await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(500); - const gridAfter = await readGrid(page, cal); - if (gridAfter && hasFilledInTopRows(gridAfter, 4)) { - return { name: "new_piece_spawns", pass: true, detail: "new piece detected after drop" }; - } - - // Screenshot fallback - const shot1 = await page.screenshot(); - await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(500); - const shot2 = await page.screenshot(); - if (!Buffer.from(shot1).equals(Buffer.from(shot2))) { - return { name: "new_piece_spawns", pass: true, detail: "visual change suggests new piece spawned" }; - } - - return { name: "new_piece_spawns", pass: false, detail: "could not detect new piece at top" }; } -async function testMultiplePieces( +/** + * Play multiple pieces and track what happens. + */ +async function runPlayPhase( page: Page, cal: CalibrationResult, + session: GameSession, gameplay: GameplayStats -): Promise<TestResult> { +): Promise<void> { + // Drop 10 pieces to test multiple pieces mechanic const gridBefore = await readGrid(page, cal); const filledBefore = gridBefore ? 
countFilled(gridBefore) : 0; + if (gridBefore) { + session.gridReadSuccess++; + } else { + session.gridReadFail++; + } + session.frames++; + + let settledGrid = gridBefore; - // Hard drop 10 pieces for (let i = 0; i < 10; i++) { await hardDrop(page, cal); await page.waitForTimeout(300); + gameplay.pieces_placed++; + session.piecesLocked++; + + const grid = await readGrid(page, cal); + if (grid) { + session.gridReadSuccess++; + session.frames++; + + // Detect piece type from diff + if (settledGrid) { + const cells = detectActivePieceCells(grid, settledGrid); + if (cells) { + const pt = identifyPieceType(cells); + session.pieceTypes.add(pt); + session.piecesSpawned++; + } + } + settledGrid = grid; + } else { + session.gridReadFail++; + session.frames++; + } } - gameplay.pieces_placed += 10; const gridAfter = await readGrid(page, cal); if (gridAfter) { + session.gridReadSuccess++; + session.frames++; const filledAfter = countFilled(gridAfter); if (filledAfter > filledBefore) { - return { - name: "multiple_pieces", - pass: true, - detail: `grid accumulated cells: ${filledBefore} -> ${filledAfter}`, - }; + session.events.push({ + type: "piece_locked", + frame: session.frames, + filledDelta: filledAfter - filledBefore, + }); } } - - // Screenshot fallback: if the game is still responding to drops, it's working - const shotA = await page.screenshot(); - await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(300); - const shotB = await page.screenshot(); - if (!Buffer.from(shotA).equals(Buffer.from(shotB))) { - return { name: "multiple_pieces", pass: true, detail: "game still responding after 10 piece drops" }; - } - - return { name: "multiple_pieces", pass: false, detail: "grid did not accumulate filled cells" }; } -async function testLineClear( +/** + * Attempt to clear lines using AI play and brute-force methods. 
+ */ +async function runLineClearPhase( page: Page, cal: CalibrationResult, + session: GameSession, gameplay: GameplayStats -): Promise<TestResult> { - // Strategy: fill a row by placing pieces across the bottom +): Promise<void> { const gridBefore = await readGrid(page, cal); const filledBefore = gridBefore ? countFilled(gridBefore) : 0; - // Play strategically using the AI to try to clear lines + // Play strategically using the AI const result = await playGame(page, cal, { maxPieces: 30, maxDurationMs: 20000 }); gameplay.pieces_placed += result.piecesPlaced; gameplay.errors_during_play += result.errors; + session.gridReadSuccess += result.gridReads; + session.gridReadFail += result.gridReadFails; + session.frames += result.gridReads + result.gridReadFails; if (result.linesCleared > 0) { + session.linesCleared += result.linesCleared; gameplay.lines_cleared += result.linesCleared; - return { - name: "line_clear", - pass: true, - detail: `${result.linesCleared} line(s) cleared during AI play`, - }; + for (let i = 0; i < result.linesCleared; i++) { + session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); + } } - // Try the brute-force row-fill approach - const cleared = await tryFillRow(page, cal, 10); - gameplay.pieces_placed += 10; - if (cleared) { - gameplay.lines_cleared += 1; - return { name: "line_clear", pass: true, detail: "line cleared via strategic placement" }; + // If no lines cleared yet, try brute-force approach + if (session.linesCleared === 0) { + const cleared = await tryFillRow(page, cal, 10); + gameplay.pieces_placed += 10; + if (cleared) { + session.linesCleared++; + gameplay.lines_cleared++; + session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); + } } - // Check if total filled decreased (which would indicate clearing happened) - const gridAfter = await readGrid(page, cal); - const filledAfter = gridAfter ? 
countFilled(gridAfter) : 0; - if (filledAfter < filledBefore && filledBefore > 0) { - return { name: "line_clear", pass: true, detail: "total filled cells decreased, indicating line clear" }; + // Check if total filled decreased (indicates clearing happened) + if (session.linesCleared === 0) { + const gridAfter = await readGrid(page, cal); + const filledAfter = gridAfter ? countFilled(gridAfter) : 0; + if (filledAfter < filledBefore && filledBefore > 0) { + session.linesCleared++; + gameplay.lines_cleared++; + session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); + } } - - return { name: "line_clear", pass: false, detail: "could not trigger or detect a line clear" }; } /** - * Extract the score number from potentially concatenated text. - * Handles cases like "Score: 100Level: 1Lines: 5" or "Score100Level1Lines5" - * by looking for a labeled "score" value, or falling back to the first number. + * Observe the score element during gameplay. */ -function extractScoreFromText(text: string | null): number[] { - if (!text) return [0]; - - // Try labeled extraction: "Score: 100" or "Score100" or "score 100" - const labeledMatch = text.match(/score\s*[:\-=]?\s*(\d+)/i); - if (labeledMatch) { - return [parseInt(labeledMatch[1], 10)]; - } - - // Extract all individual numbers from the text - const allNumbers = (text.match(/\d+/g) || []).map(Number); - return allNumbers.length > 0 ? 
allNumbers : [0]; -} - -async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<TestResult> { +async function observeScore( + page: Page, + cal: CalibrationResult, + session: GameSession, + gameplay: GameplayStats +): Promise<void> { if (!cal.scoreElementSelector) { // Try to find any number on the page that changes - const textBefore = await page.evaluate(() => document.body.innerText); - const numbersBefore = (textBefore.match(/\d+/g) || []).map(Number); + try { + const textBefore = await page.evaluate(() => document.body.innerText); + const numbersBefore = (textBefore.match(/\d+/g) || []).map(Number); - await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(500); + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(500); - const textAfter = await page.evaluate(() => document.body.innerText); - const numbersAfter = (textAfter.match(/\d+/g) || []).map(Number); + const textAfter = await page.evaluate(() => document.body.innerText); + const numbersAfter = (textAfter.match(/\d+/g) || []).map(Number); - // Check if any number increased - for (let i = 0; i < Math.min(numbersBefore.length, numbersAfter.length); i++) { - if (numbersAfter[i] > numbersBefore[i]) { - return { name: "score_changes", pass: true, detail: "a number on the page increased after play" }; + for (let i = 0; i < Math.min(numbersBefore.length, numbersAfter.length); i++) { + if (numbersAfter[i] > numbersBefore[i]) { + session.scoreValues.push(numbersBefore[i], numbersAfter[i]); + if (numbersAfter[i] > gameplay.max_score_observed) { + gameplay.max_score_observed = numbersAfter[i]; + } + break; + } } - } - - return { name: "score_changes", pass: false, detail: "no score element found and no number changed" }; + } catch { /* ignore */ } + return; } try { const scoreBefore = await page.textContent(cal.scoreElementSelector); const numsBefore = extractScoreFromText(scoreBefore); + session.scoreValues.push(Math.max(...numsBefore)); - // Play a bit 
to change the score + // Play a bit for (let i = 0; i < 5; i++) { await page.keyboard.press(cal.controls.drop); await page.waitForTimeout(300); } - // Poll for score change: check multiple times over 2 seconds + // Poll for score change for (let poll = 0; poll < 4; poll++) { await page.waitForTimeout(500); - const scoreAfter = await page.textContent(cal.scoreElementSelector); const numsAfter = extractScoreFromText(scoreAfter); - - // Compare each extracted number: if any number increased, score changed - for (let i = 0; i < Math.min(numsBefore.length, numsAfter.length); i++) { - if (numsAfter[i] > numsBefore[i]) { - return { - name: "score_changes", - pass: true, - detail: `score changed from ${numsBefore[i]} to ${numsAfter[i]}`, - }; - } - } - - // Also check if any new number appeared that's larger than any before number - const maxBefore = Math.max(...numsBefore); const maxAfter = Math.max(...numsAfter); - if (maxAfter > maxBefore) { - return { - name: "score_changes", - pass: true, - detail: `score changed: max value ${maxBefore} -> ${maxAfter}`, - }; + session.scoreValues.push(maxAfter); + if (maxAfter > gameplay.max_score_observed) { + gameplay.max_score_observed = maxAfter; } + if (maxAfter > Math.max(...numsBefore)) break; } - - return { - name: "score_changes", - pass: false, - detail: `score did not increase: [${numsBefore.join(", ")}] -> no change after polling`, - }; - } catch { - return { name: "score_changes", pass: false, detail: "could not read score element" }; - } + } catch { /* ignore */ } } -async function testGameOver(page: Page, cal: CalibrationResult): Promise<TestResult> { +/** + * Stack pieces to trigger game over. 
+ */ +async function runGameOverPhase( + page: Page, + cal: CalibrationResult, + session: GameSession +): Promise<void> { const isOver = await stackToGameOver(page, cal, 40); if (isOver) { - return { name: "game_over", pass: true, detail: "game stopped after stacking to top" }; + session.gameOverDetected = true; + session.events.push({ type: "game_over", frame: session.frames }); } - return { name: "game_over", pass: false, detail: "could not trigger or detect game over" }; } -async function testPlayable30s( +/** + * Play for 30 seconds and track stability. + */ +async function runEndurancePhase( page: Page, cal: CalibrationResult, + session: GameSession, gameplay: GameplayStats, consoleErrors: string[] -): Promise<TestResult> { +): Promise<void> { const errorsBefore = consoleErrors.length; const start = Date.now(); @@ -883,28 +559,352 @@ async function testPlayable30s( const elapsed = Math.round((Date.now() - start) / 1000); gameplay.pieces_placed += result.piecesPlaced; gameplay.lines_cleared += result.linesCleared; + session.linesCleared += result.linesCleared; gameplay.play_duration_seconds += elapsed; gameplay.errors_during_play += result.errors; + session.gridReadSuccess += result.gridReads; + session.gridReadFail += result.gridReadFails; + session.frames += result.gridReads + result.gridReadFails; + // Record endurance errors const newErrors = consoleErrors.slice(errorsBefore); - const crashed = newErrors.length > 0 || result.errors > 3; + for (const e of newErrors) { + if (!session.consoleErrors.includes(e)) session.consoleErrors.push(e); + } +} - if (!crashed) { - return { +// ---- Derive test results from session data ---- + +const ALL_TEST_NAMES = [ + "game_loads", + "game_starts", + "auto_drop", + "move_left", + "move_right", + "move_down", + "rotate", + "all_pieces_rotate", + "hard_drop", + "piece_locks", + "new_piece_spawns", + "multiple_pieces", + "line_clear", + "score_changes", + "game_over", + "playable_30s", +]; + +function deriveTestResults( + 
session: GameSession, + cal: CalibrationResult, + loadResult: LoadResult, + consoleErrors: string[], + gameplay: GameplayStats +): TestResult[] { + const results: TestResult[] = []; + const gridReliable = session.gridReadSuccess > 0 && + session.gridReadSuccess / (session.gridReadSuccess + session.gridReadFail) > 0.5; + + // 1. game_loads + results.push({ + name: "game_loads", + pass: loadResult.loaded && loadResult.errorsOnLoad === 0, + detail: loadResult.detail, + }); + + // 2. game_starts + results.push({ + name: "game_starts", + pass: session.started, + detail: session.started + ? `started via ${session.startMechanism}` + : "could not start game with any mechanism", + }); + + // 3. auto_drop -- MUST be verified via grid reader + const autoDropEvents = session.events.filter( + (e) => e.type === "piece_moved" && e.direction === "down" && + // Only count the first few frames (before we sent any input) + e.frame <= 2 + ); + if (autoDropEvents.length > 0) { + results.push({ + name: "auto_drop", + pass: true, + detail: "grid state changed after 5s with no input (grid-verified)", + }); + } else if (!gridReliable) { + results.push({ + name: "auto_drop", + pass: false, + detail: "grid reader unreliable, cannot verify auto-drop", + }); + } else { + results.push({ + name: "auto_drop", + pass: false, + detail: "piece did not move down in 5 seconds (grid-verified)", + }); + } + + // 4-6. 
movement tests + for (const dir of ["left", "right", "down"] as const) { + const moveEvents = session.events.filter( + (e) => e.type === "piece_moved" && e.direction === dir + ); + if (moveEvents.length > 0) { + results.push({ + name: `move_${dir}`, + pass: true, + detail: "grid state changed after key press (grid-verified)", + }); + } else if (!gridReliable) { + results.push({ + name: `move_${dir}`, + pass: false, + detail: "grid reader unreliable, cannot verify movement", + }); + } else { + results.push({ + name: `move_${dir}`, + pass: false, + detail: "no grid change detected after key press", + }); + } + } + + // 7. rotate + if (session.rotationsObserved > 0) { + results.push({ + name: "rotate", + pass: true, + detail: `piece shape changed after rotate key (grid-verified, ${session.rotationsObserved} rotation(s))`, + }); + } else if (!gridReliable) { + results.push({ + name: "rotate", + pass: false, + detail: "grid reader unreliable, cannot verify rotation", + }); + } else { + results.push({ + name: "rotate", + pass: false, + detail: "no shape change detected after rotate key", + }); + } + + // 7b. all_pieces_rotate -- derived from piece types seen + // We can only confidently test this if we saw multiple piece types + const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown"); + if (session.rotationsObserved > 0 && nonOPieceTypes.length > 0) { + results.push({ + name: "all_pieces_rotate", + pass: true, + detail: `rotation observed, piece types seen: [${[...session.pieceTypes].join(", ")}]`, + }); + } else if (session.rotationsObserved > 0) { + results.push({ + name: "all_pieces_rotate", + pass: true, + detail: "rotation confirmed but could not identify individual piece types", + }); + } else { + results.push({ + name: "all_pieces_rotate", + pass: false, + detail: "could not detect any piece rotations via grid reader", + }); + } + + // 8. 
hard_drop + if (session.hardDropsObserved > 0) { + results.push({ + name: "hard_drop", + pass: true, + detail: "piece immediately dropped to bottom (grid-verified)", + }); + } else if (!gridReliable) { + results.push({ + name: "hard_drop", + pass: false, + detail: "grid reader unreliable, cannot verify hard drop", + }); + } else { + results.push({ + name: "hard_drop", + pass: false, + detail: "no grid change with bottom cells detected after hard drop key", + }); + } + + // 9. piece_locks + const lockEvents = session.events.filter((e) => e.type === "piece_locked"); + if (lockEvents.length > 0) { + results.push({ + name: "piece_locks", + pass: true, + detail: `filled cells persist at bottom (grid-verified, ${lockEvents.length} lock event(s))`, + }); + } else if (session.piecesLocked > 0) { + results.push({ + name: "piece_locks", + pass: true, + detail: `${session.piecesLocked} piece(s) locked during play`, + }); + } else { + results.push({ + name: "piece_locks", + pass: false, + detail: "could not verify piece locking via grid reader", + }); + } + + // 10. new_piece_spawns + if (session.piecesSpawned > 0) { + results.push({ + name: "new_piece_spawns", + pass: true, + detail: `${session.piecesSpawned} new piece(s) detected at top of grid`, + }); + } else { + results.push({ + name: "new_piece_spawns", + pass: false, + detail: "could not detect new piece spawning at top via grid reader", + }); + } + + // 11. multiple_pieces + if (session.piecesLocked >= 3) { + results.push({ + name: "multiple_pieces", + pass: true, + detail: `${session.piecesLocked} pieces placed during play session`, + }); + } else { + results.push({ + name: "multiple_pieces", + pass: false, + detail: `only ${session.piecesLocked} piece(s) detected, need at least 3`, + }); + } + + // 12. 
line_clear + if (session.linesCleared > 0) { + results.push({ + name: "line_clear", + pass: true, + detail: `${session.linesCleared} line(s) cleared (grid-verified)`, + }); + } else { + results.push({ + name: "line_clear", + pass: false, + detail: "could not trigger or detect a line clear via grid reader", + }); + } + + // 13. score_changes + if (session.scoreValues.length >= 2) { + const min = Math.min(...session.scoreValues); + const max = Math.max(...session.scoreValues); + if (max > min) { + results.push({ + name: "score_changes", + pass: true, + detail: `score changed from ${min} to ${max}`, + }); + } else { + results.push({ + name: "score_changes", + pass: false, + detail: `score stayed at ${min}`, + }); + } + } else if (cal.scoreElementSelector === null) { + results.push({ + name: "score_changes", + pass: false, + detail: "no score element found", + }); + } else { + results.push({ + name: "score_changes", + pass: false, + detail: "could not read score values", + }); + } + + // 14. game_over + results.push({ + name: "game_over", + pass: session.gameOverDetected, + detail: session.gameOverDetected + ? "game stopped after stacking to top" + : "could not trigger or detect game over", + }); + + // 15. 
playable_30s + const crashed = session.consoleErrors.length > 0 || gameplay.errors_during_play > 3; + if (!crashed && gameplay.play_duration_seconds >= 10) { + results.push({ name: "playable_30s", pass: true, - detail: `played for ${elapsed}s, placed ${result.piecesPlaced} pieces, no crashes`, - }; + detail: `played for ${gameplay.play_duration_seconds}s, placed ${gameplay.pieces_placed} pieces, no crashes`, + }); + } else if (crashed) { + results.push({ + name: "playable_30s", + pass: false, + detail: `${session.consoleErrors.length} console error(s), ${gameplay.errors_during_play} play errors`, + }); + } else { + results.push({ + name: "playable_30s", + pass: false, + detail: `only played for ${gameplay.play_duration_seconds}s`, + }); } - return { - name: "playable_30s", - pass: false, - detail: `${newErrors.length} console errors, ${result.errors} play errors during ${elapsed}s`, - }; + + return results; } // ---- Helpers ---- +function countFilledInTopRows(grid: boolean[][], rows: number): number { + let count = 0; + for (let r = 0; r < Math.min(rows, grid.length); r++) { + for (let c = 0; c < grid[r].length; c++) { + if (grid[r][c]) count++; + } + } + return count; +} + +function boundingBox(cells: [number, number][]): { w: number; h: number } { + const minRow = Math.min(...cells.map(([r]) => r)); + const maxRow = Math.max(...cells.map(([r]) => r)); + const minCol = Math.min(...cells.map(([, c]) => c)); + const maxCol = Math.max(...cells.map(([, c]) => c)); + return { w: maxCol - minCol + 1, h: maxRow - minRow + 1 }; +} + +/** + * Extract the score number from potentially concatenated text. + */ +function extractScoreFromText(text: string | null): number[] { + if (!text) return [0]; + + const labeledMatch = text.match(/score\s*[:\-=]?\s*(\d+)/i); + if (labeledMatch) { + return [parseInt(labeledMatch[1], 10)]; + } + + const allNumbers = (text.match(/\d+/g) || []).map(Number); + return allNumbers.length > 0 ? 
allNumbers : [0]; +} + async function loadGamePage(page: Page, serverUrl: string): Promise<void> { const candidates = [ "index.html", @@ -947,5 +947,6 @@ function emptyCalibration(consoleErrors: string[]): CalibrationResult { scoreElementSelector: null, backgroundColor: null, consoleErrors, + gridConfidence: 0, }; } diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts @@ -45,6 +45,8 @@ export interface CalibrationResult { scoreElementSelector: string | null; backgroundColor: [number, number, number] | null; consoleErrors: string[]; + /** Fraction of grid reads that returned non-null during calibration polling. */ + gridConfidence: number; } /** Result of an individual test. */ @@ -54,6 +56,53 @@ export interface TestResult { detail: string; } +/** Standard Tetris piece types. */ +export type PieceType = "I" | "O" | "T" | "S" | "Z" | "J" | "L" | "unknown"; + +/** + * Tetromino definition: cells in a bounding box. + * Each rotation is a list of [row, col] offsets relative to the piece origin. + */ +export interface TetrominoDef { + type: PieceType; + /** All rotation states. Each is a list of [row, col] cell offsets. */ + rotations: [number, number][][]; + /** Bounding box dimensions per rotation: [width, height]. */ + dimensions: [number, number][]; +} + +/** An event observed during continuous grid scanning. */ +export type GridEvent = + | { type: "piece_spawned"; pieceType: PieceType; frame: number } + | { type: "piece_locked"; frame: number; filledDelta: number } + | { type: "line_cleared"; count: number; frame: number } + | { type: "piece_moved"; direction: "left" | "right" | "down"; frame: number } + | { type: "piece_rotated"; frame: number } + | { type: "hard_drop"; frame: number } + | { type: "game_over"; frame: number } + | { type: "grid_read_failed"; frame: number }; + +/** Data collected during one continuous observation session. 
*/ +export interface GameSession { + started: boolean; + startMechanism: string; + piecesSpawned: number; + piecesLocked: number; + linesCleared: number; + rotationsObserved: number; + movementsObserved: number; + hardDropsObserved: number; + gameOverDetected: boolean; + consoleErrors: string[]; + durationSeconds: number; + pieceTypes: Set<string>; + scoreValues: number[]; + gridReadSuccess: number; + gridReadFail: number; + frames: number; + events: GridEvent[]; +} + /** Gameplay statistics gathered during the play phase. */ export interface GameplayStats { pieces_placed: number; @@ -72,6 +121,7 @@ export interface BotReport { controls: Record<string, string>; start_mechanism: string; score_element_found: boolean; + grid_confidence: number; }; tests: Array<{ name: string; pass: boolean; detail: string }>; summary: { @@ -81,6 +131,15 @@ export interface BotReport { score: number; }; gameplay: GameplayStats; + session: { + frames: number; + events_count: number; + pieces_spawned: number; + pieces_locked: number; + lines_cleared: number; + piece_types_seen: string[]; + grid_read_success_rate: number; + }; performance?: { load_time_ms: number; };

Impressum · Datenschutz