loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 43fb9fa4943a04511bffd96ccd1ba7e925d1ef15
parent 69173d2750e5cab2d6a94d1c152116be336341c2
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Wed,  8 Apr 2026 09:59:16 +0200

Rewrite start detection: 5-phase, language-agnostic, visual change

Phase 1: auto-start (10 frames at 100ms, no input)
Phase 2: DOM buttons by visual prominence (no text matching)
Phase 3: canvas click grid (center, upper, lower, 3x3)
Phase 4: keyboard triggers with combos
Phase 5: retry phases 2-4 (some games need two interactions)
detectVisualChange: Level 1 (any change) + Level 2 (gameplay pattern)
30-second total budget. Stateful button recording.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/calibrate.ts | 760 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M tasks/tetris/eval/gameplay-bot/tests.ts | 24 +++++++++++++++++-------
M tasks/tetris/eval/gameplay-bot/types.ts | 7 +++++++
3 files changed, 499 insertions(+), 292 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -27,7 +27,9 @@ export async function calibrate(page: Page): Promise<CalibrationResult> { // Wait for DOM to fully settle (scripts, animations, timers) await page.waitForTimeout(2000); - let startMechanism = await detectStartMechanism(page); + let startResult = await detectStartMechanism(page); + let startMechanism: StartMechanism = startResult.mechanism; + let startButton = startResult.startButton; let { renderer, gridBounds, cellWidth, cellHeight } = await detectGrid(page); let backgroundColor = renderer === "canvas" && gridBounds @@ -69,7 +71,7 @@ export async function calibrate(page: Page): Promise<CalibrationResult> { gridConfidence: 0, }); - return { + const result: CalibrationResult = { renderer, gridDetected: gridBounds !== null, gridBounds, @@ -82,6 +84,12 @@ export async function calibrate(page: Page): Promise<CalibrationResult> { consoleErrors, gridConfidence, }; + + if (startButton) { + result.startButton = startButton; + } + + return result; } /** @@ -158,194 +166,154 @@ async function measureGridConfidence( } /** - * Detect a falling piece by taking 5 screenshots ~200ms apart and looking - * for a cluster of colored pixels that moved downward between frames. + * Take a screenshot and sample it into a grid of "colored" (true) / "background" (false) + * values. Reusable building block for visual change detection. 
+ */ +async function sampleScreenshot( + page: Page, + sampleCols: number, + sampleRows: number, + colorThreshold: number = 40 +): Promise<boolean[][]> { + const shot = await page.screenshot(); + const base64 = shot.toString("base64"); + const grid = await page.evaluate( + async ({ base64, sampleCols, sampleRows, colorThreshold }) => { + const img = new Image(); + const loaded = new Promise<void>((resolve, reject) => { + img.onload = () => resolve(); + img.onerror = () => reject(new Error("image decode failed")); + }); + img.src = `data:image/png;base64,${base64}`; + await loaded; + + const canvas = document.createElement("canvas"); + canvas.width = img.width; + canvas.height = img.height; + const ctx = canvas.getContext("2d")!; + ctx.drawImage(img, 0, 0); + + const stepX = img.width / sampleCols; + const stepY = img.height / sampleRows; + + const colors: number[][] = []; + for (let r = 0; r < sampleRows; r++) { + const row: number[] = []; + for (let c = 0; c < sampleCols; c++) { + const px = Math.floor(c * stepX + stepX / 2); + const py = Math.floor(r * stepY + stepY / 2); + const pixel = ctx.getImageData(px, py, 1, 1).data; + row.push(pixel[0] * 1000000 + pixel[1] * 1000 + pixel[2]); + } + colors.push(row); + } + + const colorCounts = new Map<number, number>(); + for (const row of colors) { + for (const c of row) { + colorCounts.set(c, (colorCounts.get(c) || 0) + 1); + } + } + let bgColor = 0; + let bgCount = 0; + for (const [color, count] of colorCounts) { + if (count > bgCount) { + bgCount = count; + bgColor = color; + } + } + const bgR = Math.floor(bgColor / 1000000); + const bgG = Math.floor((bgColor % 1000000) / 1000); + const bgB = bgColor % 1000; + + const result: boolean[][] = []; + for (let r = 0; r < sampleRows; r++) { + const row: boolean[] = []; + for (let c = 0; c < sampleCols; c++) { + const v = colors[r][c]; + const pR = Math.floor(v / 1000000); + const pG = Math.floor((v % 1000000) / 1000); + const pB = v % 1000; + const dist = Math.sqrt( + (pR - 
bgR) ** 2 + (pG - bgG) ** 2 + (pB - bgB) ** 2 + ); + row.push(dist > colorThreshold); + } + result.push(row); + } + return result; + }, + { base64, sampleCols, sampleRows, colorThreshold } + ); + return grid; +} + +/** + * Detect visual change by taking multiple screenshots at fast intervals. * - * This works for canvas, DOM, SVG, WebGL -- any rendering approach. - * It does NOT require the grid reader or calibrated grid bounds. + * Two-level detection: + * Level 1: did >5% of sampled pixels change between any two consecutive frames? + * Level 2: did something move downward (gameplay pattern)? * - * Implementation: divide the visible page into a grid of sample points - * (~20 columns x ~40 rows). Read pixel colors at each point via screenshot - * buffer. Between consecutive frames, look for a group of colored - * (non-background) points that disappeared from one position and appeared - * lower -- a "falling cluster" of roughly 4 cells (2x2 to 4x1 bounding box). + * Level 1 alone is enough to confirm the game responded to input. + * Level 2 confirms actual gameplay (piece falling). */ -async function detectFallingPiece(page: Page): Promise<boolean> { +async function detectVisualChange( + page: Page, + options?: { frames?: number; intervalMs?: number } +): Promise<{ changed: boolean; gameplayDetected: boolean }> { + const FRAMES = options?.frames ?? 10; + const INTERVAL = options?.intervalMs ?? 
100; const SAMPLE_COLS = 20; const SAMPLE_ROWS = 40; - const SCREENSHOTS = 5; - const INTERVAL_MS = 200; - // Minimum downward shift in sample-grid rows to count as "falling" - const MIN_DOWN_SHIFT = 1; - // Cluster size bounds (roughly a tetromino: 3-6 sample points) - const MIN_CLUSTER = 3; - const MAX_CLUSTER = 12; - // Color distance threshold to distinguish filled from background - const COLOR_THRESHOLD = 40; - - // Take screenshots - const shots: Buffer[] = []; - for (let i = 0; i < SCREENSHOTS; i++) { - shots.push(await page.screenshot()); - if (i < SCREENSHOTS - 1) await page.waitForTimeout(INTERVAL_MS); - } + const CHANGE_THRESHOLD = 0.05; - // Parse each screenshot into a grid of "colored" (true) / "background" (false) - // by sampling pixel colors at evenly spaced points. - // We use page.evaluate to decode the PNG in the browser via canvas. const grids: boolean[][][] = []; - - for (const shot of shots) { - const base64 = shot.toString("base64"); - const grid = await page.evaluate( - async ({ base64, sampleCols, sampleRows, colorThreshold }) => { - // Decode the screenshot PNG into pixel data - const img = new Image(); - const loaded = new Promise<void>((resolve, reject) => { - img.onload = () => resolve(); - img.onerror = () => reject(new Error("image decode failed")); - }); - img.src = `data:image/png;base64,${base64}`; - await loaded; - - const canvas = document.createElement("canvas"); - canvas.width = img.width; - canvas.height = img.height; - const ctx = canvas.getContext("2d")!; - ctx.drawImage(img, 0, 0); - - const stepX = img.width / sampleCols; - const stepY = img.height / sampleRows; - - // First pass: sample all pixel colors - const colors: number[][] = []; - for (let r = 0; r < sampleRows; r++) { - const row: number[] = []; - for (let c = 0; c < sampleCols; c++) { - const px = Math.floor(c * stepX + stepX / 2); - const py = Math.floor(r * stepY + stepY / 2); - const pixel = ctx.getImageData(px, py, 1, 1).data; - // Store as a single 
luminance-like value for quick background detection - // and the full RGB for distance checks - row.push(pixel[0] * 1000000 + pixel[1] * 1000 + pixel[2]); - } - colors.push(row); - } - - // Determine background color: the most common color in the sample grid - const colorCounts = new Map<number, number>(); - for (const row of colors) { - for (const c of row) { - colorCounts.set(c, (colorCounts.get(c) || 0) + 1); - } - } - let bgColor = 0; - let bgCount = 0; - for (const [color, count] of colorCounts) { - if (count > bgCount) { - bgCount = count; - bgColor = color; - } - } - const bgR = Math.floor(bgColor / 1000000); - const bgG = Math.floor((bgColor % 1000000) / 1000); - const bgB = bgColor % 1000; - - // Second pass: mark cells as "colored" if they differ from background - const result: boolean[][] = []; - for (let r = 0; r < sampleRows; r++) { - const row: boolean[] = []; - for (let c = 0; c < sampleCols; c++) { - const v = colors[r][c]; - const pR = Math.floor(v / 1000000); - const pG = Math.floor((v % 1000000) / 1000); - const pB = v % 1000; - const dist = Math.sqrt( - (pR - bgR) ** 2 + (pG - bgG) ** 2 + (pB - bgB) ** 2 - ); - row.push(dist > colorThreshold); - } - result.push(row); - } - return result; - }, - { base64, sampleCols: SAMPLE_COLS, sampleRows: SAMPLE_ROWS, colorThreshold: COLOR_THRESHOLD } - ); + for (let i = 0; i < FRAMES; i++) { + const grid = await sampleScreenshot(page, SAMPLE_COLS, SAMPLE_ROWS); grids.push(grid); + if (i < FRAMES - 1) await page.waitForTimeout(INTERVAL); } - // Compare consecutive frame pairs to find downward-moving clusters + // Level 1: check for any significant change between consecutive frames + let changed = false; for (let f = 0; f < grids.length - 1; f++) { - const prev = grids[f]; - const curr = grids[f + 1]; - - // Find cells that were colored in prev but not in curr ("disappeared") - const disappeared: [number, number][] = []; - // Find cells that are colored in curr but not in prev ("appeared") - const appeared: 
[number, number][] = []; - + let diffs = 0; for (let r = 0; r < SAMPLE_ROWS; r++) { for (let c = 0; c < SAMPLE_COLS; c++) { - if (prev[r][c] && !curr[r][c]) disappeared.push([r, c]); - if (!prev[r][c] && curr[r][c]) appeared.push([r, c]); + if (grids[f][r][c] !== grids[f + 1][r][c]) diffs++; } } + if (diffs / (SAMPLE_ROWS * SAMPLE_COLS) > CHANGE_THRESHOLD) { + changed = true; + break; + } + } - // If nothing changed, no falling piece in this frame pair - if (disappeared.length === 0 || appeared.length === 0) continue; - - // Cluster the disappeared points using simple flood fill - const disappearedClusters = clusterPoints(disappeared); - const appearedClusters = clusterPoints(appeared); - - // For each disappeared cluster, look for a matching appeared cluster - // that is shifted downward (same rough column range, higher row numbers) - for (const dCluster of disappearedClusters) { - if (dCluster.length < MIN_CLUSTER || dCluster.length > MAX_CLUSTER) continue; - - const dMinCol = Math.min(...dCluster.map(([, c]) => c)); - const dMaxCol = Math.max(...dCluster.map(([, c]) => c)); - const dMinRow = Math.min(...dCluster.map(([r]) => r)); - const dCenterCol = (dMinCol + dMaxCol) / 2; - - for (const aCluster of appearedClusters) { - if (aCluster.length < MIN_CLUSTER || aCluster.length > MAX_CLUSTER) continue; - - const aMinCol = Math.min(...aCluster.map(([, c]) => c)); - const aMaxCol = Math.max(...aCluster.map(([, c]) => c)); - const aMinRow = Math.min(...aCluster.map(([r]) => r)); - const aCenterCol = (aMinCol + aMaxCol) / 2; - - // Check: appeared cluster is below disappeared cluster - // and in roughly the same column range - const colOverlap = Math.abs(dCenterCol - aCenterCol) <= 3; - const movedDown = aMinRow > dMinRow && (aMinRow - dMinRow) >= MIN_DOWN_SHIFT; + if (!changed) return { changed: false, gameplayDetected: false }; - if (colOverlap && movedDown) { - return true; - } + // Level 2: check for downward movement pattern + let gameplayDetected = false; + for 
(let f = 0; f < grids.length - 1; f++) { + const disappeared: [number, number][] = []; + const appeared: [number, number][] = []; + for (let r = 0; r < SAMPLE_ROWS; r++) { + for (let c = 0; c < SAMPLE_COLS; c++) { + if (grids[f][r][c] && !grids[f + 1][r][c]) disappeared.push([r, c]); + if (!grids[f][r][c] && grids[f + 1][r][c]) appeared.push([r, c]); } } - - // Also check if the overall set of colored points shifted down - // (handles cases where clusters partially overlap between frames) - if (disappeared.length >= MIN_CLUSTER && appeared.length >= MIN_CLUSTER) { - const dAvgRow = disappeared.reduce((s, [r]) => s + r, 0) / disappeared.length; - const aAvgRow = appeared.reduce((s, [r]) => s + r, 0) / appeared.length; - const dAvgCol = disappeared.reduce((s, [, c]) => s + c, 0) / disappeared.length; - const aAvgCol = appeared.reduce((s, [, c]) => s + c, 0) / appeared.length; - - if ( - aAvgRow > dAvgRow + MIN_DOWN_SHIFT && - Math.abs(aAvgCol - dAvgCol) <= 3 && - Math.abs(disappeared.length - appeared.length) <= 4 - ) { - return true; + if (disappeared.length >= 3 && appeared.length >= 3) { + const avgDisRow = disappeared.reduce((s, [r]) => s + r, 0) / disappeared.length; + const avgAppRow = appeared.reduce((s, [r]) => s + r, 0) / appeared.length; + if (avgAppRow > avgDisRow) { + gameplayDetected = true; + break; } } } - return false; + return { changed, gameplayDetected }; } /** @@ -389,125 +357,361 @@ function clusterPoints(points: [number, number][]): [number, number][][] { return clusters; } +/** Result of the 5-phase start detection. */ +interface StartDetectionResult { + mechanism: StartMechanism; + startButton?: CalibrationResult["startButton"]; +} + /** - * Try multiple mechanisms to start the game. - * After each trigger, runs the falling piece detector to confirm - * the game actually started (not just a title screen animation). + * 5-phase start detection. Language-agnostic, visual-first. * - * The ONLY way to confirm start is detecting a falling piece. 
+ * Phase 1: Auto-start (1s, no input) + * Phase 2: DOM button discovery (click all clickable elements by prominence) + * Phase 3: Canvas click grid (for canvas-rendered buttons) + * Phase 4: Keyboard triggers (Enter, Space, ArrowDown, Z, combos) + * Phase 5: Retry phases 2-4 (some games need two interactions) + * + * Total budget: 30 seconds. */ -async function detectStartMechanism(page: Page): Promise<StartMechanism> { - // Ordered list of triggers to try - const triggers: Array<{ name: StartMechanism; action: () => Promise<void> }> = [ - // 1. Wait 3s (auto-start games) - { - name: "auto", - action: async () => { - await page.waitForTimeout(3000); - }, - }, - // 2. Click the canvas - { - name: "click_canvas", - action: async () => { - const canvas = page.locator("canvas").first(); - if ((await canvas.count()) > 0) await canvas.click(); - }, - }, - // 3. Click any game-like container - { - name: "click_canvas", - action: async () => { - const container = page.locator( - '[class*="game"], [class*="board"], [id*="game"], [id*="board"]' - ).first(); - if ((await container.count()) > 0) await container.click(); - }, - }, - // 4. Press Enter - { - name: "enter", - action: async () => { await page.keyboard.press("Enter"); }, - }, - // 5. Press Space - { - name: "space", - action: async () => { await page.keyboard.press("Space"); }, - }, - // 6. Click body - { - name: "click_canvas", - action: async () => { - await page.locator("body").click({ position: { x: 100, y: 100 } }); - }, - }, - // 7. Click a start/play button - { - name: "button", - action: async () => { - const button = page.locator("button, a, [role='button']").filter({ - hasText: /start|play|begin|new game|restart|reset|new/i, - }).first(); - if ((await button.count()) > 0) await button.click(); - }, - }, - // 8. 
Click text that looks like a start prompt - { - name: "button", - action: async () => { - const textMatch = page.locator( - ':text-matches("start|play|begin|new.game|restart|reset", "i")' - ).first(); - if ((await textMatch.count()) > 0) await textMatch.click(); - }, - }, - // 9. Click any button regardless of text - { - name: "button", - action: async () => { - const anyButton = page.locator("button").first(); - if ((await anyButton.count()) > 0) await anyButton.click(); - }, - }, - // 10. Press various keys (catches games that start on any keydown) - { - name: "anykey", - action: async () => { - for (const key of ["a", "p", "s", "n", "Escape"]) { - await page.keyboard.press(key); - await page.waitForTimeout(100); +async function detectStartMechanism(page: Page): Promise<StartDetectionResult> { + const deadline = Date.now() + 30000; + + const budgetExceeded = () => Date.now() >= deadline; + + // ---- Phase 1: Auto-start (no input, 1 second) ---- + { + const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + if (result.changed) { + return { mechanism: "auto" }; + } + } + + // ---- Phase 2: DOM button discovery (language-agnostic) ---- + { + const phase2Result = await tryDomButtons(page, budgetExceeded); + if (phase2Result) return phase2Result; + } + + // ---- Phase 3: Canvas click grid ---- + if (!budgetExceeded()) { + const phase3Result = await tryCanvasClicks(page, budgetExceeded); + if (phase3Result) return phase3Result; + } + + // ---- Phase 4: Keyboard triggers ---- + if (!budgetExceeded()) { + const phase4Result = await tryKeyboardTriggers(page, budgetExceeded); + if (phase4Result) return phase4Result; + } + + // ---- Phase 5: Retry phases 2-4 (some games need two interactions) ---- + if (!budgetExceeded()) { + const phase2Retry = await tryDomButtons(page, budgetExceeded); + if (phase2Retry) return phase2Retry; + } + if (!budgetExceeded()) { + const phase3Retry = await tryCanvasClicks(page, budgetExceeded); + if (phase3Retry) return 
phase3Retry; + } + if (!budgetExceeded()) { + const phase4Retry = await tryKeyboardTriggers(page, budgetExceeded); + if (phase4Retry) return phase4Retry; + } + + return { mechanism: "unknown" }; +} + +/** + * Phase 2: Find all clickable DOM elements (language-agnostic, no text matching). + * Sort by visual prominence (size, centrality). Click each and observe. + */ +async function tryDomButtons( + page: Page, + budgetExceeded: () => boolean +): Promise<StartDetectionResult | null> { + try { + // Collect all clickable elements + const clickableSelector = + 'button, a, [role="button"], [onclick], input[type="button"], input[type="submit"]'; + const visualSelector = + '[class*="btn"], [class*="button"], [class*="start"], [class*="play"], ' + + '[id*="start"], [id*="play"], [id*="btn"]'; + + // Gather element info (position, size, text) for sorting + const elementInfos = await page.evaluate( + ({ clickableSelector, visualSelector }) => { + const seen = new Set<Element>(); + const results: Array<{ + index: number; + text: string; + x: number; + y: number; + width: number; + height: number; + area: number; + centerDist: number; + selector: string; + }> = []; + + const allElements: Element[] = []; + for (const el of document.querySelectorAll(clickableSelector)) { + if (!seen.has(el)) { + seen.add(el); + allElements.push(el); + } + } + for (const el of document.querySelectorAll(visualSelector)) { + if (!seen.has(el)) { + seen.add(el); + allElements.push(el); + } + } + + const pageW = window.innerWidth; + const pageH = window.innerHeight; + const pageCenterX = pageW / 2; + const pageCenterY = pageH / 2; + + for (let i = 0; i < allElements.length; i++) { + const el = allElements[i]; + const rect = el.getBoundingClientRect(); + if (rect.width < 5 || rect.height < 5) continue; + if (rect.top > pageH || rect.left > pageW) continue; + + const cx = rect.left + rect.width / 2; + const cy = rect.top + rect.height / 2; + const centerDist = Math.sqrt((cx - pageCenterX) ** 2 + (cy - 
pageCenterY) ** 2); + + let selector = ""; + if (el.id) { + selector = `#${el.id}`; + } else if ((el as HTMLElement).className) { + const cls = (el as HTMLElement).className.toString().split(" ")[0]; + if (cls) selector = `${el.tagName.toLowerCase()}.${cls}`; + } + if (!selector) selector = `${el.tagName.toLowerCase()}:nth-of-type(${i + 1})`; + + results.push({ + index: i, + text: (el.textContent || "").trim().slice(0, 50), + x: Math.round(cx), + y: Math.round(cy), + width: rect.width, + height: rect.height, + area: rect.width * rect.height, + centerDist, + selector, + }); } + + // Sort by visual prominence: larger elements first, then closer to center + results.sort((a, b) => b.area - a.area || a.centerDist - b.centerDist); + + return results; }, - }, - // 11. Press ArrowDown (some games start on directional input) - { - name: "anykey", - action: async () => { await page.keyboard.press("ArrowDown"); }, - }, + { clickableSelector, visualSelector } + ); + + // Click each element and observe for visual change + for (const info of elementInfos) { + if (budgetExceeded()) break; + + try { + // Check if element still exists before clicking + const wasVisible = await page.evaluate( + ({ x, y }) => { + const el = document.elementFromPoint(x, y); + return el !== null; + }, + { x: info.x, y: info.y } + ); + if (!wasVisible) continue; + + await page.mouse.click(info.x, info.y); + await page.waitForTimeout(200); + + const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + if (result.changed) { + // Check if the element disappeared after clicking + const disappeared = await page.evaluate( + ({ selector }) => { + if (!selector) return false; + try { + const el = document.querySelector(selector); + if (!el) return true; + const rect = el.getBoundingClientRect(); + return rect.width === 0 || rect.height === 0; + } catch { + return false; + } + }, + { selector: info.selector } + ); + + return { + mechanism: "button", + startButton: { + selector: 
info.selector, + text: info.text, + disappeared, + position: { x: info.x, y: info.y }, + }, + }; + } + + // No change -- try pressing Escape to undo any menu we opened + try { + await page.keyboard.press("Escape"); + await page.waitForTimeout(100); + } catch { /* ignore */ } + } catch { /* continue to next element */ } + } + } catch { /* phase 2 failed entirely */ } + + return null; +} + +/** + * Phase 3: Click the canvas at strategic positions. + * Center first, then upper-center, lower-center, then a 3x3 grid. + */ +async function tryCanvasClicks( + page: Page, + budgetExceeded: () => boolean +): Promise<StartDetectionResult | null> { + // Find the canvas or primary game container + let targetBox: { x: number; y: number; width: number; height: number } | null = null; + + try { + const canvas = page.locator("canvas").first(); + if ((await canvas.count()) > 0) { + targetBox = await canvas.boundingBox(); + } + } catch { /* no canvas */ } + + if (!targetBox) { + // Try the viewport itself + const viewport = page.viewportSize(); + if (viewport) { + targetBox = { x: 0, y: 0, width: viewport.width, height: viewport.height }; + } + } + + if (!targetBox) return null; + + const cx = targetBox.x + targetBox.width / 2; + const cy = targetBox.y + targetBox.height / 2; + + // Click positions: center, upper-center, lower-center, then 3x3 grid + const positions: Array<{ x: number; y: number; label: string }> = [ + { x: cx, y: cy, label: "center" }, + { x: cx, y: targetBox.y + targetBox.height * 0.25, label: "upper-center" }, + { x: cx, y: targetBox.y + targetBox.height * 0.75, label: "lower-center" }, ]; - for (const trigger of triggers) { + // Add 3x3 grid positions (skip center since we already have it) + for (let row = 0; row < 3; row++) { + for (let col = 0; col < 3; col++) { + if (row === 1 && col === 1) continue; // skip center duplicate + positions.push({ + x: targetBox.x + targetBox.width * (col + 0.5) / 3, + y: targetBox.y + targetBox.height * (row + 0.5) / 3, + label: 
`grid_${row}_${col}`, + }); + } + } + + for (const pos of positions) { + if (budgetExceeded()) break; + + try { + await page.mouse.click(pos.x, pos.y); + await page.waitForTimeout(200); + + const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + if (result.changed) { + return { + mechanism: "click_canvas", + startButton: { + selector: "canvas", + text: `canvas click at ${pos.label}`, + disappeared: false, + position: { x: Math.round(pos.x), y: Math.round(pos.y) }, + }, + }; + } + } catch { /* continue */ } + } + + return null; +} + +/** + * Phase 4: Keyboard triggers. + * Try Enter, Space, ArrowDown, Z individually, + * then click-then-Enter and click-then-Space combos. + */ +async function tryKeyboardTriggers( + page: Page, + budgetExceeded: () => boolean +): Promise<StartDetectionResult | null> { + const mechanismMap: Record<string, StartMechanism> = { + Enter: "enter", + Space: "space", + ArrowDown: "anykey", + z: "anykey", + }; + + // Single key presses + for (const key of ["Enter", "Space", "ArrowDown", "z"]) { + if (budgetExceeded()) break; + try { - await trigger.action(); - // Wait for overlay to dismiss and first piece to start falling - await page.waitForTimeout(1500); + await page.keyboard.press(key); + await page.waitForTimeout(200); + + const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + if (result.changed) { + return { mechanism: mechanismMap[key] }; + } + } catch { /* continue */ } + } + + // Combo: click canvas center, then Enter / Space + for (const key of ["Enter", "Space"]) { + if (budgetExceeded()) break; + + try { + const canvas = page.locator("canvas").first(); + if ((await canvas.count()) > 0) { + await canvas.click(); + } else { + const viewport = page.viewportSize(); + if (viewport) { + await page.mouse.click(viewport.width / 2, viewport.height / 2); + } + } + await page.waitForTimeout(100); + await page.keyboard.press(key); + await page.waitForTimeout(200); - if (await 
detectFallingPiece(page)) { - return trigger.name; + const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + if (result.changed) { + return { mechanism: mechanismMap[key] }; } - } catch { /* continue to next trigger */ } + } catch { /* continue */ } } - return "unknown"; + return null; } /** - * Re-calibration fallback: try ALL start mechanisms again with longer waits, + * Re-calibration fallback: try start mechanisms again with longer waits, * re-scanning for the grid after each attempt. Used when the first pass * failed to detect the start mechanism or the grid. * - * Uses the falling piece detector (not screenshot comparison) to confirm - * the game actually started. + * Uses detectVisualChange() to confirm the game responded. */ async function recalibrateWithRetry( page: Page, @@ -522,7 +726,6 @@ async function recalibrateWithRetry( cellHeight: 0, }; - // Ordered list of start attempts with longer waits between each const attempts: Array<{ name: StartMechanism; action: () => Promise<void> }> = [ { name: "click_canvas", @@ -553,19 +756,6 @@ async function recalibrateWithRetry( }, }, { - name: "button", - action: async () => { - const btn = page.locator("button, a, [role='button']").filter({ - hasText: /start|play|begin|restart|reset|new/i, - }).first(); - if ((await btn.count()) > 0) await btn.click(); - }, - }, - { - name: "anykey", - action: async () => { await page.keyboard.press("a"); }, - }, - { name: "anykey", action: async () => { await page.keyboard.press("ArrowDown"); }, }, @@ -574,11 +764,11 @@ async function recalibrateWithRetry( for (const attempt of attempts) { try { await attempt.action(); - await page.waitForTimeout(1500); + await page.waitForTimeout(200); - // Use falling piece detector instead of screenshot comparison if (startMechanism === "unknown") { - if (await detectFallingPiece(page)) { + const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + if (result.changed) { startMechanism = 
attempt.name; } } diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -637,13 +637,23 @@ function deriveTestResults( }); // 2. game_starts - results.push({ - name: "game_starts", - pass: session.started, - detail: session.started - ? `started via ${session.startMechanism}` - : "could not start game with any mechanism", - }); + { + let startDetail: string; + if (session.started) { + startDetail = `started via ${session.startMechanism}`; + if (cal.startButton) { + const btn = cal.startButton; + startDetail += ` (${btn.selector}, "${btn.text}"${btn.disappeared ? ", disappeared after click" : ""})`; + } + } else { + startDetail = "could not start game with any mechanism"; + } + results.push({ + name: "game_starts", + pass: session.started, + detail: startDetail, + }); + } // Helper: produce a skip result for tests whose prerequisite phase was skipped const skipResult = (name: string, reason: string): TestResult => ({ diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts @@ -47,6 +47,13 @@ export interface CalibrationResult { consoleErrors: string[]; /** Fraction of grid reads that returned non-null during calibration polling. */ gridConfidence: number; + /** Details about the button that started the game, if any. */ + startButton?: { + selector: string; + text: string; + disappeared: boolean; + position: { x: number; y: number }; + }; } /** Result of an individual test. */

Impressum · Datenschutz