loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit d3de069d27a61685d29370f5ede9d3d99486d6b3
parent 17a4bada036386de83204177a3af6db3546666c3
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 07:37:20 +0200

Rewrite bot start detection: falling piece detector, conditional phases

Start detection: detects a falling piece (downward-moving cluster of
colored pixels) instead of screenshot comparison. Eliminates false
positives from title screens, hover effects, canvas-rendered buttons.

Conditional phases: mechanics requires start, gameplay requires mechanics,
game over requires gameplay. Skipped phases report "skipped: prerequisite
not met" instead of false positives.

Game over: stacks pieces via hard drops + grid reader verification of
filled top rows. Removes screenshot comparison approach.

piece_locks: requires grid reader reliability, no longer passes on
static screens with gridDetected=false.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/calibrate.ts | 451 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M tasks/tetris/eval/gameplay-bot/tests.ts | 383 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
2 files changed, 584 insertions(+), 250 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -158,136 +158,344 @@ async function measureGridConfidence( } /** - * Try multiple mechanisms to start the game. - * Takes a screenshot before and after each attempt, comparing - * to see if the game state changed. + * Detect a falling piece by taking 3 screenshots ~800ms apart and looking + * for a cluster of colored pixels that moved downward between frames. + * + * This works for canvas, DOM, SVG, WebGL -- any rendering approach. + * It does NOT require the grid reader or calibrated grid bounds. + * + * Implementation: divide the visible page into a grid of sample points + * (~20 columns x ~40 rows). Read pixel colors at each point via screenshot + * buffer. Between consecutive frames, look for a group of colored + * (non-background) points that disappeared from one position and appeared + * lower -- a "falling cluster" of roughly 4 cells (2x2 to 4x1 bounding box). */ -async function detectStartMechanism(page: Page): Promise<StartMechanism> { - // Take initial screenshot - let prevShot = await page.screenshot(); - - // 1. 
Wait 3 seconds (auto-start) - await page.waitForTimeout(3000); - let newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "auto"; +async function detectFallingPiece(page: Page): Promise<boolean> { + const SAMPLE_COLS = 20; + const SAMPLE_ROWS = 40; + const SCREENSHOTS = 3; + const INTERVAL_MS = 800; + // Minimum downward shift in sample-grid rows to count as "falling" + const MIN_DOWN_SHIFT = 1; + // Cluster size bounds (roughly a tetromino: 3-6 sample points) + const MIN_CLUSTER = 3; + const MAX_CLUSTER = 12; + // Color distance threshold to distinguish filled from background + const COLOR_THRESHOLD = 40; + + // Take screenshots + const shots: Buffer[] = []; + for (let i = 0; i < SCREENSHOTS; i++) { + shots.push(await page.screenshot()); + if (i < SCREENSHOTS - 1) await page.waitForTimeout(INTERVAL_MS); } - prevShot = newShot; - // 2. Click the canvas or game container - try { - const canvas = page.locator("canvas").first(); - if ((await canvas.count()) > 0) { - await canvas.click(); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "click_canvas"; - } - prevShot = newShot; - } - } catch { /* continue */ } + // Parse each screenshot into a grid of "colored" (true) / "background" (false) + // by sampling pixel colors at evenly spaced points. + // We use page.evaluate to decode the PNG in the browser via canvas. 
+ const grids: boolean[][][] = []; + + for (const shot of shots) { + const base64 = shot.toString("base64"); + const grid = await page.evaluate( + async ({ base64, sampleCols, sampleRows, colorThreshold }) => { + // Decode the screenshot PNG into pixel data + const img = new Image(); + const loaded = new Promise<void>((resolve, reject) => { + img.onload = () => resolve(); + img.onerror = () => reject(new Error("image decode failed")); + }); + img.src = `data:image/png;base64,${base64}`; + await loaded; + + const canvas = document.createElement("canvas"); + canvas.width = img.width; + canvas.height = img.height; + const ctx = canvas.getContext("2d")!; + ctx.drawImage(img, 0, 0); + + const stepX = img.width / sampleCols; + const stepY = img.height / sampleRows; + + // First pass: sample all pixel colors + const colors: number[][] = []; + for (let r = 0; r < sampleRows; r++) { + const row: number[] = []; + for (let c = 0; c < sampleCols; c++) { + const px = Math.floor(c * stepX + stepX / 2); + const py = Math.floor(r * stepY + stepY / 2); + const pixel = ctx.getImageData(px, py, 1, 1).data; + // Store as a single luminance-like value for quick background detection + // and the full RGB for distance checks + row.push(pixel[0] * 1000000 + pixel[1] * 1000 + pixel[2]); + } + colors.push(row); + } - // Try clicking any game-like container - try { - const container = page.locator( - '[class*="game"], [class*="board"], [id*="game"], [id*="board"]' - ).first(); - if ((await container.count()) > 0) { - await container.click(); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "click_canvas"; + // Determine background color: the most common color in the sample grid + const colorCounts = new Map<number, number>(); + for (const row of colors) { + for (const c of row) { + colorCounts.set(c, (colorCounts.get(c) || 0) + 1); + } + } + let bgColor = 0; + let bgCount = 0; + for (const [color, 
count] of colorCounts) { + if (count > bgCount) { + bgCount = count; + bgColor = color; + } + } + const bgR = Math.floor(bgColor / 1000000); + const bgG = Math.floor((bgColor % 1000000) / 1000); + const bgB = bgColor % 1000; + + // Second pass: mark cells as "colored" if they differ from background + const result: boolean[][] = []; + for (let r = 0; r < sampleRows; r++) { + const row: boolean[] = []; + for (let c = 0; c < sampleCols; c++) { + const v = colors[r][c]; + const pR = Math.floor(v / 1000000); + const pG = Math.floor((v % 1000000) / 1000); + const pB = v % 1000; + const dist = Math.sqrt( + (pR - bgR) ** 2 + (pG - bgG) ** 2 + (pB - bgB) ** 2 + ); + row.push(dist > colorThreshold); + } + result.push(row); + } + return result; + }, + { base64, sampleCols: SAMPLE_COLS, sampleRows: SAMPLE_ROWS, colorThreshold: COLOR_THRESHOLD } + ); + grids.push(grid); + } + + // Compare consecutive frame pairs to find downward-moving clusters + for (let f = 0; f < grids.length - 1; f++) { + const prev = grids[f]; + const curr = grids[f + 1]; + + // Find cells that were colored in prev but not in curr ("disappeared") + const disappeared: [number, number][] = []; + // Find cells that are colored in curr but not in prev ("appeared") + const appeared: [number, number][] = []; + + for (let r = 0; r < SAMPLE_ROWS; r++) { + for (let c = 0; c < SAMPLE_COLS; c++) { + if (prev[r][c] && !curr[r][c]) disappeared.push([r, c]); + if (!prev[r][c] && curr[r][c]) appeared.push([r, c]); } - prevShot = newShot; } - } catch { /* continue */ } - // 3. Press Enter - await page.keyboard.press("Enter"); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "enter"; - } - prevShot = newShot; - - // 4. 
Press Space - await page.keyboard.press("Space"); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "space"; - } - prevShot = newShot; + // If nothing changed, no falling piece in this frame pair + if (disappeared.length === 0 || appeared.length === 0) continue; - // 5. Click the body/document (some games start on any click) - try { - await page.locator("body").click({ position: { x: 100, y: 100 } }); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "click_canvas"; - } - prevShot = newShot; - } catch { /* continue */ } + // Cluster the disappeared points using simple flood fill + const disappearedClusters = clusterPoints(disappeared); + const appearedClusters = clusterPoints(appeared); - // 6. Look for a start/play/restart button - try { - const button = page.locator("button, a, [role='button']").filter({ - hasText: /start|play|begin|new game|restart|reset|new/i, - }).first(); - if ((await button.count()) > 0) { - await button.click(); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "button"; + // For each disappeared cluster, look for a matching appeared cluster + // that is shifted downward (same rough column range, higher row numbers) + for (const dCluster of disappearedClusters) { + if (dCluster.length < MIN_CLUSTER || dCluster.length > MAX_CLUSTER) continue; + + const dMinCol = Math.min(...dCluster.map(([, c]) => c)); + const dMaxCol = Math.max(...dCluster.map(([, c]) => c)); + const dMinRow = Math.min(...dCluster.map(([r]) => r)); + const dCenterCol = (dMinCol + dMaxCol) / 2; + + for (const aCluster of appearedClusters) { + if (aCluster.length < MIN_CLUSTER || aCluster.length > MAX_CLUSTER) continue; + + const aMinCol = Math.min(...aCluster.map(([, c]) => c)); + const aMaxCol = 
Math.max(...aCluster.map(([, c]) => c)); + const aMinRow = Math.min(...aCluster.map(([r]) => r)); + const aCenterCol = (aMinCol + aMaxCol) / 2; + + // Check: appeared cluster is below disappeared cluster + // and in roughly the same column range + const colOverlap = Math.abs(dCenterCol - aCenterCol) <= 3; + const movedDown = aMinRow > dMinRow && (aMinRow - dMinRow) >= MIN_DOWN_SHIFT; + + if (colOverlap && movedDown) { + return true; + } } - prevShot = newShot; } - } catch { /* continue */ } - // Also try elements that aren't buttons but have matching text - try { - const textMatch = page.locator( - ':text-matches("start|play|begin|new.game|restart|reset", "i")' - ).first(); - if ((await textMatch.count()) > 0) { - await textMatch.click(); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "button"; + // Also check if the overall set of colored points shifted down + // (handles cases where clusters partially overlap between frames) + if (disappeared.length >= MIN_CLUSTER && appeared.length >= MIN_CLUSTER) { + const dAvgRow = disappeared.reduce((s, [r]) => s + r, 0) / disappeared.length; + const aAvgRow = appeared.reduce((s, [r]) => s + r, 0) / appeared.length; + const dAvgCol = disappeared.reduce((s, [, c]) => s + c, 0) / disappeared.length; + const aAvgCol = appeared.reduce((s, [, c]) => s + c, 0) / appeared.length; + + if ( + aAvgRow > dAvgRow + MIN_DOWN_SHIFT && + Math.abs(aAvgCol - dAvgCol) <= 3 && + Math.abs(disappeared.length - appeared.length) <= 4 + ) { + return true; } - prevShot = newShot; } - } catch { /* continue */ } + } - // 7. 
Try clicking any <button> element regardless of text - try { - const anyButton = page.locator("button").first(); - if ((await anyButton.count()) > 0) { - await anyButton.click(); - await page.waitForTimeout(500); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "button"; + return false; +} + +/** + * Cluster adjacent points using flood fill. + * Two points are adjacent if they differ by at most 1 in both row and column. + */ +function clusterPoints(points: [number, number][]): [number, number][][] { + const clusters: [number, number][][] = []; + const visited = new Set<string>(); + + for (const [r, c] of points) { + const key = `${r},${c}`; + if (visited.has(key)) continue; + + const cluster: [number, number][] = []; + const stack: [number, number][] = [[r, c]]; + visited.add(key); + + while (stack.length > 0) { + const [cr, cc] = stack.pop()!; + cluster.push([cr, cc]); + + // Check all 8 neighbors + for (let dr = -1; dr <= 1; dr++) { + for (let dc = -1; dc <= 1; dc++) { + if (dr === 0 && dc === 0) continue; + const nr = cr + dr; + const nc = cc + dc; + const nk = `${nr},${nc}`; + if (!visited.has(nk) && points.some(([pr, pc]) => pr === nr && pc === nc)) { + visited.add(nk); + stack.push([nr, nc]); + } + } } - prevShot = newShot; } - } catch { /* continue */ } - // 8. Press any key (try a few -- catches games that start on any keydown) - for (const key of ["a", "p", "s", "n", "Escape"]) { - await page.keyboard.press(key); - await page.waitForTimeout(300); - newShot = await page.screenshot(); - if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { - return "anykey"; - } - prevShot = newShot; + clusters.push(cluster); + } + + return clusters; +} + +/** + * Try multiple mechanisms to start the game. + * After each trigger, runs the falling piece detector to confirm + * the game actually started (not just a title screen animation). + * + * The ONLY way to confirm start is detecting a falling piece. 
+ */ +async function detectStartMechanism(page: Page): Promise<StartMechanism> { + // Ordered list of triggers to try + const triggers: Array<{ name: StartMechanism; action: () => Promise<void> }> = [ + // 1. Wait 3s (auto-start games) + { + name: "auto", + action: async () => { + await page.waitForTimeout(3000); + }, + }, + // 2. Click the canvas + { + name: "click_canvas", + action: async () => { + const canvas = page.locator("canvas").first(); + if ((await canvas.count()) > 0) await canvas.click(); + }, + }, + // 3. Click any game-like container + { + name: "click_canvas", + action: async () => { + const container = page.locator( + '[class*="game"], [class*="board"], [id*="game"], [id*="board"]' + ).first(); + if ((await container.count()) > 0) await container.click(); + }, + }, + // 4. Press Enter + { + name: "enter", + action: async () => { await page.keyboard.press("Enter"); }, + }, + // 5. Press Space + { + name: "space", + action: async () => { await page.keyboard.press("Space"); }, + }, + // 6. Click body + { + name: "click_canvas", + action: async () => { + await page.locator("body").click({ position: { x: 100, y: 100 } }); + }, + }, + // 7. Click a start/play button + { + name: "button", + action: async () => { + const button = page.locator("button, a, [role='button']").filter({ + hasText: /start|play|begin|new game|restart|reset|new/i, + }).first(); + if ((await button.count()) > 0) await button.click(); + }, + }, + // 8. Click text that looks like a start prompt + { + name: "button", + action: async () => { + const textMatch = page.locator( + ':text-matches("start|play|begin|new.game|restart|reset", "i")' + ).first(); + if ((await textMatch.count()) > 0) await textMatch.click(); + }, + }, + // 9. Click any button regardless of text + { + name: "button", + action: async () => { + const anyButton = page.locator("button").first(); + if ((await anyButton.count()) > 0) await anyButton.click(); + }, + }, + // 10. 
Press various keys (catches games that start on any keydown) + { + name: "anykey", + action: async () => { + for (const key of ["a", "p", "s", "n", "Escape"]) { + await page.keyboard.press(key); + await page.waitForTimeout(100); + } + }, + }, + // 11. Press ArrowDown (some games start on directional input) + { + name: "anykey", + action: async () => { await page.keyboard.press("ArrowDown"); }, + }, + ]; + + for (const trigger of triggers) { + try { + await trigger.action(); + // Give the game a moment to react before checking for a falling piece + await page.waitForTimeout(300); + + if (await detectFallingPiece(page)) { + return trigger.name; + } + } catch { /* continue to next trigger */ } } return "unknown"; @@ -297,6 +505,9 @@ async function detectStartMechanism(page: Page): Promise<StartMechanism> { * Re-calibration fallback: try ALL start mechanisms again with longer waits, * re-scanning for the grid after each attempt. Used when the first pass * failed to detect the start mechanism or the grid. + * + * Uses the falling piece detector (not screenshot comparison) to confirm + * the game actually started. 
*/ async function recalibrateWithRetry( page: Page, @@ -360,18 +571,16 @@ async function recalibrateWithRetry( }, ]; - let prevShot = await page.screenshot(); - for (const attempt of attempts) { try { await attempt.action(); - await page.waitForTimeout(1500); - - const newShot = await page.screenshot(); - const changed = !Buffer.from(prevShot).equals(Buffer.from(newShot)); + await page.waitForTimeout(500); - if (changed && startMechanism === "unknown") { - startMechanism = attempt.name; + // Use falling piece detector instead of screenshot comparison + if (startMechanism === "unknown") { + if (await detectFallingPiece(page)) { + startMechanism = attempt.name; + } } // Re-scan for grid after each attempt @@ -386,8 +595,6 @@ async function recalibrateWithRetry( if (startMechanism !== "unknown" && gridResult.gridBounds) { break; } - - prevShot = newShot; } catch { /* continue */ } } diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -13,7 +13,7 @@ import { identifyPieceType, countCompleteRows, } from "./grid-reader"; -import { hardDrop, playGame, tryFillRow, stackToGameOver } from "./player"; +import { hardDrop, playGame, tryFillRow } from "./player"; import { calibrate } from "./calibrate"; /** @@ -85,7 +85,7 @@ export async function runAllTests( }; } - // ---- Phase 2: Calibrate ---- + // ---- Phase 2: Calibrate + detect start (always runs) ---- let cal: CalibrationResult; try { cal = await calibrate(page); @@ -101,42 +101,56 @@ export async function runAllTests( if (!session.consoleErrors.includes(e)) session.consoleErrors.push(e); } - // ---- Phase 3: Observation session -- basic mechanics ---- - // Test auto-drop, movement, rotation, hard drop via grid reader - if (cal.gridDetected) { + // ---- Phase 3: Basic mechanics -- ONLY if game started (falling piece detected) ---- + let mechanicsSucceeded = false; + if (session.started && cal.gridDetected) { await runBasicMechanicsPhase(page, cal, session); + // Mechanics 
succeeded if we observed at least 1 event + mechanicsSucceeded = + session.movementsObserved > 0 || + session.rotationsObserved > 0 || + session.hardDropsObserved > 0 || + session.events.some((e) => e.type === "piece_moved"); } - // ---- Phase 4: Reload + calibrate for gameplay ---- - try { - await loadGamePage(page, serverUrl); - cal = await calibrate(page); - session.started = session.started || cal.startMechanism !== "unknown"; - } catch { /* continue with existing state */ } + // ---- Phase 4: Gameplay (play to win) -- ONLY if mechanics had at least 1 success ---- + let gameplayPlacedPieces = false; + if (mechanicsSucceeded) { + try { + await loadGamePage(page, serverUrl); + cal = await calibrate(page); + session.started = session.started || cal.startMechanism !== "unknown"; + } catch { /* continue with existing state */ } - // ---- Phase 5: Extended gameplay with integrated score tracking ---- - await runGameplayPhase(page, cal, session, gameplay); + await runGameplayPhase(page, cal, session, gameplay); + gameplayPlacedPieces = gameplay.pieces_placed > 0; + } - // ---- Phase 6: Game over test ---- - try { - await loadGamePage(page, serverUrl); - cal = await calibrate(page); - } catch { /* continue */ } + // ---- Phase 5: Game over -- ONLY if gameplay placed pieces ---- + if (gameplayPlacedPieces) { + try { + await loadGamePage(page, serverUrl); + cal = await calibrate(page); + } catch { /* continue */ } - await runGameOverPhase(page, cal, session); + await runGameOverPhase(page, cal, session); + } - // ---- Phase 7: 30-second endurance play ---- - try { - await loadGamePage(page, serverUrl); - cal = await calibrate(page); - } catch { /* continue */ } + // ---- Phase 6: Endurance -- ONLY if gameplay worked ---- + if (gameplayPlacedPieces) { + try { + await loadGamePage(page, serverUrl); + cal = await calibrate(page); + } catch { /* continue */ } - await runEndurancePhase(page, cal, session, gameplay, consoleErrors); + await runEndurancePhase(page, cal, session, 
gameplay, consoleErrors); + } session.durationSeconds = gameplay.play_duration_seconds; // ---- Derive test results from session data ---- - const testResults = deriveTestResults(session, cal, loadResult, consoleErrors, gameplay); + const phaseState = { mechanicsSucceeded, gameplayPlacedPieces }; + const testResults = deriveTestResults(session, cal, loadResult, consoleErrors, gameplay, phaseState); return { testResults, calibration: cal, gameplay, session }; } @@ -470,18 +484,79 @@ async function runGameplayPhase( } /** - * Stack pieces to trigger game over. + * Stack pieces to trigger game over using grid reader verification. + * + * Instead of screenshot comparison (which false-positives on static screens), + * we: + * 1. Hard drop 30-40 pieces rapidly in the same column to build a tower + * 2. After each batch of 5 drops, check grid for filled cells in the top 4 rows + * 3. If top rows are filled AND new drops don't change the grid, game is over + * 4. Also check for "game over" text in DOM as a secondary signal */ async function runGameOverPhase( page: Page, cal: CalibrationResult, session: GameSession ): Promise<void> { - const isOver = await stackToGameOver(page, cal, 40); - if (isOver) { - session.gameOverDetected = true; - session.events.push({ type: "game_over", frame: session.frames }); + const MAX_DROPS = 40; + const BATCH_SIZE = 5; + + for (let i = 0; i < MAX_DROPS; i++) { + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(150); + + // Check after each batch of drops + if ((i + 1) % BATCH_SIZE === 0) { + const grid = await readGrid(page, cal); + if (grid) { + session.gridReadSuccess++; + session.frames++; + + if (hasFilledInTopRows(grid, 4)) { + // Top rows are filled -- check if new drops actually change the grid + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(300); + const gridAfter = await readGrid(page, cal); + if (gridAfter) { + session.gridReadSuccess++; + session.frames++; + if 
(!gridsAreDifferent(grid, gridAfter)) { + // Grid didn't change after a drop -- game is over + session.gameOverDetected = true; + session.events.push({ type: "game_over", frame: session.frames }); + return; + } + } + } + } else { + session.gridReadFail++; + session.frames++; + } + } } + + // Final check: look for game over text in DOM + try { + const hasGameOverText = await page.evaluate(() => { + const text = document.body.innerText.toLowerCase(); + return ( + text.includes("game over") || + text.includes("gameover") || + text.includes("you lose") || + text.includes("try again") || + text.includes("play again") + ); + }); + if (hasGameOverText) { + // Only trust DOM text if we also saw pieces in the grid (prevents false + // positives from static pages that happen to have "restart" text) + const finalGrid = await readGrid(page, cal); + if (finalGrid && countFilled(finalGrid) > 10) { + session.gameOverDetected = true; + session.events.push({ type: "game_over", frame: session.frames }); + } + } + } catch { /* ignore */ } } /** @@ -537,12 +612,18 @@ const ALL_TEST_NAMES = [ "playable_30s", ]; +interface PhaseState { + mechanicsSucceeded: boolean; + gameplayPlacedPieces: boolean; +} + function deriveTestResults( session: GameSession, cal: CalibrationResult, loadResult: LoadResult, consoleErrors: string[], - gameplay: GameplayStats + gameplay: GameplayStats, + phaseState: PhaseState ): TestResult[] { const results: TestResult[] = []; const gridReliable = session.gridReadSuccess > 0 && @@ -564,34 +645,49 @@ function deriveTestResults( : "could not start game with any mechanism", }); + // Helper: produce a skip result for tests whose prerequisite phase was skipped + const skipResult = (name: string, reason: string): TestResult => ({ + name, + pass: false, + detail: `skipped: ${reason}`, + }); + // 3. 
auto_drop -- MUST be verified via grid reader - const autoDropEvents = session.events.filter( - (e) => e.type === "piece_moved" && e.direction === "down" && - // Only count the first few frames (before we sent any input) - e.frame <= 2 - ); - if (autoDropEvents.length > 0) { - results.push({ - name: "auto_drop", - pass: true, - detail: "grid state changed after 5s with no input (grid-verified)", - }); - } else if (!gridReliable) { - results.push({ - name: "auto_drop", - pass: false, - detail: "grid reader unreliable, cannot verify auto-drop", - }); + if (!session.started) { + results.push(skipResult("auto_drop", "game did not start")); } else { - results.push({ - name: "auto_drop", - pass: false, - detail: "piece did not move down in 5 seconds (grid-verified)", - }); + const autoDropEvents = session.events.filter( + (e) => e.type === "piece_moved" && e.direction === "down" && + // Only count the first few frames (before we sent any input) + e.frame <= 2 + ); + if (autoDropEvents.length > 0) { + results.push({ + name: "auto_drop", + pass: true, + detail: "grid state changed after 5s with no input (grid-verified)", + }); + } else if (!gridReliable) { + results.push({ + name: "auto_drop", + pass: false, + detail: "grid reader unreliable, cannot verify auto-drop", + }); + } else { + results.push({ + name: "auto_drop", + pass: false, + detail: "piece did not move down in 5 seconds (grid-verified)", + }); + } } // 4-6. movement tests for (const dir of ["left", "right", "down"] as const) { + if (!session.started) { + results.push(skipResult(`move_${dir}`, "game did not start")); + continue; + } const moveEvents = session.events.filter( (e) => e.type === "piece_moved" && e.direction === dir ); @@ -617,7 +713,9 @@ function deriveTestResults( } // 7. 
rotate - if (session.rotationsObserved > 0) { + if (!session.started) { + results.push(skipResult("rotate", "game did not start")); + } else if (session.rotationsObserved > 0) { results.push({ name: "rotate", pass: true, @@ -638,30 +736,35 @@ function deriveTestResults( } // 7b. all_pieces_rotate -- derived from piece types seen - // We can only confidently test this if we saw multiple piece types - const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown"); - if (session.rotationsObserved > 0 && nonOPieceTypes.length > 0) { - results.push({ - name: "all_pieces_rotate", - pass: true, - detail: `rotation observed, piece types seen: [${[...session.pieceTypes].join(", ")}]`, - }); - } else if (session.rotationsObserved > 0) { - results.push({ - name: "all_pieces_rotate", - pass: true, - detail: "rotation confirmed but could not identify individual piece types", - }); + if (!session.started) { + results.push(skipResult("all_pieces_rotate", "game did not start")); } else { - results.push({ - name: "all_pieces_rotate", - pass: false, - detail: "could not detect any piece rotations via grid reader", - }); + const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown"); + if (session.rotationsObserved > 0 && nonOPieceTypes.length > 0) { + results.push({ + name: "all_pieces_rotate", + pass: true, + detail: `rotation observed, piece types seen: [${[...session.pieceTypes].join(", ")}]`, + }); + } else if (session.rotationsObserved > 0) { + results.push({ + name: "all_pieces_rotate", + pass: true, + detail: "rotation confirmed but could not identify individual piece types", + }); + } else { + results.push({ + name: "all_pieces_rotate", + pass: false, + detail: "could not detect any piece rotations via grid reader", + }); + } } // 8. 
hard_drop - if (session.hardDropsObserved > 0) { + if (!session.started) { + results.push(skipResult("hard_drop", "game did not start")); + } else if (session.hardDropsObserved > 0) { results.push({ name: "hard_drop", pass: true, @@ -681,38 +784,48 @@ function deriveTestResults( }); } - // 9. piece_locks - const lockEvents = session.events.filter((e) => e.type === "piece_locked"); - if (lockEvents.length > 0) { - results.push({ - name: "piece_locks", - pass: true, - detail: `filled cells persist at bottom (grid-verified, ${lockEvents.length} lock event(s))`, - }); - } else if (session.piecesLocked > 0 && session.piecesSpawned > 0) { - // Only trust locked count if we also detected spawns (prevents false positives - // from static UI being misread as game state) - results.push({ - name: "piece_locks", - pass: true, - detail: `${session.piecesLocked} piece(s) locked during play`, - }); - } else if (session.piecesLocked > 0 && session.piecesSpawned === 0) { + // 9. piece_locks -- only trust if grid is reliable + if (!session.started) { + results.push(skipResult("piece_locks", "game did not start")); + } else if (!gridReliable) { results.push({ name: "piece_locks", pass: false, - detail: `${session.piecesLocked} lock event(s) but 0 spawns detected - likely false positive from UI misread`, + detail: "grid reader unreliable, cannot verify piece locking", }); } else { - results.push({ - name: "piece_locks", - pass: false, - detail: "could not verify piece locking via grid reader", - }); + const lockEvents = session.events.filter((e) => e.type === "piece_locked"); + if (lockEvents.length > 0) { + results.push({ + name: "piece_locks", + pass: true, + detail: `filled cells persist at bottom (grid-verified, ${lockEvents.length} lock event(s))`, + }); + } else if (session.piecesLocked > 0 && session.piecesSpawned > 0) { + results.push({ + name: "piece_locks", + pass: true, + detail: `${session.piecesLocked} piece(s) locked during play`, + }); + } else if (session.piecesLocked 
> 0 && session.piecesSpawned === 0) { + results.push({ + name: "piece_locks", + pass: false, + detail: `${session.piecesLocked} lock event(s) but 0 spawns detected - likely false positive from UI misread`, + }); + } else { + results.push({ + name: "piece_locks", + pass: false, + detail: "could not verify piece locking via grid reader", + }); + } } // 10. new_piece_spawns - if (session.piecesSpawned > 0) { + if (!session.started) { + results.push(skipResult("new_piece_spawns", "game did not start")); + } else if (session.piecesSpawned > 0) { results.push({ name: "new_piece_spawns", pass: true, @@ -727,7 +840,9 @@ function deriveTestResults( } // 11. multiple_pieces - if (session.piecesLocked >= 3 && session.piecesSpawned > 0) { + if (!phaseState.mechanicsSucceeded) { + results.push(skipResult("multiple_pieces", "mechanics phase not met")); + } else if (session.piecesLocked >= 3 && session.piecesSpawned > 0) { results.push({ name: "multiple_pieces", pass: true, @@ -742,7 +857,9 @@ function deriveTestResults( } // 12. line_clear - if (session.linesCleared > 0) { + if (!phaseState.mechanicsSucceeded) { + results.push(skipResult("line_clear", "mechanics phase not met")); + } else if (session.linesCleared > 0) { results.push({ name: "line_clear", pass: true, @@ -757,7 +874,9 @@ function deriveTestResults( } // 13. score_changes - if (session.scoreValues.length >= 2) { + if (!phaseState.mechanicsSucceeded) { + results.push(skipResult("score_changes", "mechanics phase not met")); + } else if (session.scoreValues.length >= 2) { const min = Math.min(...session.scoreValues); const max = Math.max(...session.scoreValues); if (max > min) { @@ -787,37 +906,45 @@ function deriveTestResults( }); } - // 14. game_over - results.push({ - name: "game_over", - pass: session.gameOverDetected, - detail: session.gameOverDetected - ? "game stopped after stacking to top" - : "could not trigger or detect game over", - }); - - // 15. 
playable_30s - const crashed = session.consoleErrors.length > 0 || gameplay.errors_during_play > 3; - if (!crashed && gameplay.play_duration_seconds >= 10) { - results.push({ - name: "playable_30s", - pass: true, - detail: `played for ${gameplay.play_duration_seconds}s, placed ${gameplay.pieces_placed} pieces, no crashes`, - }); - } else if (crashed) { - results.push({ - name: "playable_30s", - pass: false, - detail: `${session.consoleErrors.length} console error(s), ${gameplay.errors_during_play} play errors`, - }); + // 14. game_over -- requires gameplay to have placed pieces + if (!phaseState.gameplayPlacedPieces) { + results.push(skipResult("game_over", "gameplay phase not met")); } else { results.push({ - name: "playable_30s", - pass: false, - detail: `only played for ${gameplay.play_duration_seconds}s`, + name: "game_over", + pass: session.gameOverDetected, + detail: session.gameOverDetected + ? "game stopped after stacking to top (grid-verified)" + : "could not trigger or detect game over via grid reader", }); } + // 15. playable_30s -- requires gameplay to have worked + if (!phaseState.gameplayPlacedPieces) { + results.push(skipResult("playable_30s", "gameplay phase not met")); + } else { + const crashed = session.consoleErrors.length > 0 || gameplay.errors_during_play > 3; + if (!crashed && gameplay.play_duration_seconds >= 10) { + results.push({ + name: "playable_30s", + pass: true, + detail: `played for ${gameplay.play_duration_seconds}s, placed ${gameplay.pieces_placed} pieces, no crashes`, + }); + } else if (crashed) { + results.push({ + name: "playable_30s", + pass: false, + detail: `${session.consoleErrors.length} console error(s), ${gameplay.errors_during_play} play errors`, + }); + } else { + results.push({ + name: "playable_30s", + pass: false, + detail: `only played for ${gameplay.play_duration_seconds}s`, + }); + } + } + return results; }

Impressum · Datenschutz