loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit d90ff0c861644921c16ac1de3ee07a4cad53ed23
parent 9417cc444a07fdb588a0e10d992ca2dc3fe67c1e
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 23:34:40 +0200

Improve gameplay bot calibration with fallbacks and DOM grid detection

Calibration:
- 2s pre-wait for DOM to settle before detection
- Broadened button text: restart, reset, new (not just start/play)
- Click any button as fallback when named button not found
- Click body to catch games that start on any click
- Press 'a' key for games that start on any keydown
- recalibrateWithRetry(): when first calibration fails, tries all
  start mechanisms again with 1.5s waits, re-scanning grid each time

Grid detection:
- Heuristic DOM scan: finds containers with 180-220 uniformly-sized
  children (flat grid) or 18-22 rows of 8-12 cells (row-based grid)
- detectEmptyBg(): samples cells to find the most common background
  color as the "empty" reference
- Expanded filled-cell detection: checks occupied/locked classes,
  data-type attribute, background color comparison
- Relaxed count matching for non-standard grid sizes
- DOM fallback in readGrid() when renderer is unknown

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/calibrate.ts | 211 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M tasks/tetris/eval/gameplay-bot/grid-reader.ts | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M tasks/tetris/eval/gameplay-bot/tests.ts | 20 ++++++++++++++++----
3 files changed, 385 insertions(+), 50 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -24,12 +24,33 @@ export async function calibrate(page: Page): Promise<CalibrationResult> { const consoleErrors: string[] = []; page.on("pageerror", (err) => consoleErrors.push(err.message)); - const startMechanism = await detectStartMechanism(page); - const { renderer, gridBounds, cellWidth, cellHeight } = await detectGrid(page); - const backgroundColor = + // Wait for DOM to fully settle (scripts, animations, timers) + await page.waitForTimeout(2000); + + let startMechanism = await detectStartMechanism(page); + let { renderer, gridBounds, cellWidth, cellHeight } = await detectGrid(page); + let backgroundColor = renderer === "canvas" && gridBounds ? await sampleBackgroundColor(page, gridBounds, cellWidth, cellHeight) : null; + + // Re-calibration fallback: if start or grid detection failed, retry with + // longer waits and re-scan after each start attempt + if (startMechanism === "unknown" || gridBounds === null) { + const retry = await recalibrateWithRetry(page, startMechanism, gridBounds); + if (retry.startMechanism !== "unknown") startMechanism = retry.startMechanism; + if (retry.gridBounds) { + renderer = retry.renderer; + gridBounds = retry.gridBounds; + cellWidth = retry.cellWidth; + cellHeight = retry.cellHeight; + backgroundColor = + renderer === "canvas" && gridBounds + ? await sampleBackgroundColor(page, gridBounds, cellWidth, cellHeight) + : null; + } + } + const controls = await detectControls(page); const scoreElementSelector = await detectScoreElement(page); @@ -112,10 +133,21 @@ async function detectStartMechanism(page: Page): Promise<StartMechanism> { } prevShot = newShot; - // 5. Look for a start/play button + // 5. 
Click the body/document (some games start on any click) + try { + await page.locator("body").click({ position: { x: 100, y: 100 } }); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "click_canvas"; + } + prevShot = newShot; + } catch { /* continue */ } + + // 6. Look for a start/play/restart button try { const button = page.locator("button, a, [role='button']").filter({ - hasText: /start|play|begin|new game/i, + hasText: /start|play|begin|new game|restart|reset|new/i, }).first(); if ((await button.count()) > 0) { await button.click(); @@ -131,7 +163,7 @@ async function detectStartMechanism(page: Page): Promise<StartMechanism> { // Also try elements that aren't buttons but have matching text try { const textMatch = page.locator( - ':text-matches("start|play|begin|new.game", "i")' + ':text-matches("start|play|begin|new.game|restart|reset", "i")' ).first(); if ((await textMatch.count()) > 0) { await textMatch.click(); @@ -144,8 +176,22 @@ async function detectStartMechanism(page: Page): Promise<StartMechanism> { } } catch { /* continue */ } - // 6. Press any key (try a few) - for (const key of ["p", "s", "n", "Escape"]) { + // 7. Try clicking any <button> element regardless of text + try { + const anyButton = page.locator("button").first(); + if ((await anyButton.count()) > 0) { + await anyButton.click(); + await page.waitForTimeout(500); + newShot = await page.screenshot(); + if (!Buffer.from(prevShot).equals(Buffer.from(newShot))) { + return "button"; + } + prevShot = newShot; + } + } catch { /* continue */ } + + // 8. 
Press any key (try a few -- catches games that start on any keydown) + for (const key of ["a", "p", "s", "n", "Escape"]) { await page.keyboard.press(key); await page.waitForTimeout(300); newShot = await page.screenshot(); @@ -158,6 +204,107 @@ async function detectStartMechanism(page: Page): Promise<StartMechanism> { return "unknown"; } +/** + * Re-calibration fallback: try ALL start mechanisms again with longer waits, + * re-scanning for the grid after each attempt. Used when the first pass + * failed to detect the start mechanism or the grid. + */ +async function recalibrateWithRetry( + page: Page, + currentStart: StartMechanism, + currentGrid: GridBounds | null +): Promise<GridDetection & { startMechanism: StartMechanism }> { + let startMechanism: StartMechanism = currentStart; + let gridResult: GridDetection = { + renderer: "unknown", + gridBounds: currentGrid, + cellWidth: 0, + cellHeight: 0, + }; + + // Ordered list of start attempts with longer waits between each + const attempts: Array<{ name: StartMechanism; action: () => Promise<void> }> = [ + { + name: "click_canvas", + action: async () => { + const canvas = page.locator("canvas").first(); + if ((await canvas.count()) > 0) await canvas.click(); + }, + }, + { + name: "click_canvas", + action: async () => { + await page.locator("body").click({ position: { x: 200, y: 200 } }); + }, + }, + { + name: "enter", + action: async () => { await page.keyboard.press("Enter"); }, + }, + { + name: "space", + action: async () => { await page.keyboard.press("Space"); }, + }, + { + name: "button", + action: async () => { + const btn = page.locator("button").first(); + if ((await btn.count()) > 0) await btn.click(); + }, + }, + { + name: "button", + action: async () => { + const btn = page.locator("button, a, [role='button']").filter({ + hasText: /start|play|begin|restart|reset|new/i, + }).first(); + if ((await btn.count()) > 0) await btn.click(); + }, + }, + { + name: "anykey", + action: async () => { await 
page.keyboard.press("a"); }, + }, + { + name: "anykey", + action: async () => { await page.keyboard.press("ArrowDown"); }, + }, + ]; + + let prevShot = await page.screenshot(); + + for (const attempt of attempts) { + try { + await attempt.action(); + await page.waitForTimeout(1500); + + const newShot = await page.screenshot(); + const changed = !Buffer.from(prevShot).equals(Buffer.from(newShot)); + + if (changed && startMechanism === "unknown") { + startMechanism = attempt.name; + } + + // Re-scan for grid after each attempt + if (!gridResult.gridBounds) { + const detected = await detectGrid(page); + if (detected.gridBounds) { + gridResult = detected; + } + } + + // If we have both, stop early + if (startMechanism !== "unknown" && gridResult.gridBounds) { + break; + } + + prevShot = newShot; + } catch { /* continue */ } + } + + return { ...gridResult, startMechanism }; +} + interface GridDetection { renderer: RendererType; gridBounds: GridBounds | null; @@ -278,6 +425,54 @@ async function detectGrid(page: Page): Promise<GridDetection> { } } + // Heuristic scan: look for ANY container with many same-sized children + // arranged in a grid pattern, even without specific class/id naming + const allElements = document.querySelectorAll("div, section, main, article"); + for (const el of allElements) { + const ch = el.children; + // Flat list of ~200 cells (10x20) + if (ch.length >= 180 && ch.length <= 220) { + const firstChild = ch[0] as HTMLElement; + if (!firstChild) continue; + const firstRect = firstChild.getBoundingClientRect(); + if (firstRect.width < 5 || firstRect.height < 5) continue; + let uniform = true; + for (let i = 1; i < Math.min(10, ch.length); i++) { + const r = (ch[i] as HTMLElement).getBoundingClientRect(); + if (Math.abs(r.width - firstRect.width) > 2 || Math.abs(r.height - firstRect.height) > 2) { + uniform = false; + break; + } + } + if (uniform) { + const cols = 10; + const rows = Math.round(ch.length / cols); + const rect = 
el.getBoundingClientRect(); + return { + type: "dom" as const, + bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }, + rows, + cols, + }; + } + } + // Container with ~20 row children, each having ~10 cell children + if (ch.length >= 18 && ch.length <= 22) { + const firstRowCells = ch[0].children; + if (firstRowCells.length >= 8 && firstRowCells.length <= 12) { + const rect = el.getBoundingClientRect(); + if (rect.width > 50 && rect.height > 100) { + return { + type: "dom" as const, + bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }, + rows: ch.length, + cols: firstRowCells.length, + }; + } + } + } + } + return null; }); diff --git a/tasks/tetris/eval/gameplay-bot/grid-reader.ts b/tasks/tetris/eval/gameplay-bot/grid-reader.ts @@ -24,6 +24,10 @@ export async function readGrid( if (cal.gridBounds) { return await readCanvasGrid(page, cal.gridBounds, cal.cellWidth, cal.cellHeight, cal.backgroundColor); } + // Last resort: try DOM reader even if renderer is unknown + // (the grid may have appeared after calibration) + const domGrid = await readDomGrid(page); + if (domGrid) return domGrid; return null; } catch { return null; @@ -117,6 +121,64 @@ async function readDomGrid(page: Page): Promise<Grid | null> { } } + // Helper: determine if a cell element is "filled" by checking its + // background color, class names, and data attributes. Also accepts + // an optional "empty" reference color so we can distinguish filled + // cells in games that use a non-standard background (e.g. dark gray + // for empty cells instead of transparent/black). 
+ function isCellFilled(cell: HTMLElement, emptyBg?: string): boolean { + const style = window.getComputedStyle(cell); + const bg = style.backgroundColor; + const cls = cell.className.toLowerCase(); + + // Class/data attribute hints always win + if ( + cls.includes("filled") || + cls.includes("active") || + cls.includes("block") || + cls.includes("piece") || + cls.includes("occupied") || + cls.includes("locked") || + cell.dataset.filled === "true" || + cell.dataset.type !== undefined + ) { + return true; + } + + // If we have a known empty background, compare against it + if (emptyBg && bg === emptyBg) return false; + + // Default: non-transparent, non-black background = filled + return ( + bg !== "" && + bg !== "rgba(0, 0, 0, 0)" && + bg !== "transparent" && + bg !== "rgb(0, 0, 0)" + ); + } + + // Determine the "empty cell" background by sampling a few cells + // and picking the most common background color + function detectEmptyBg(cells: HTMLElement[]): string | undefined { + const colorCounts = new Map<string, number>(); + for (const cell of cells) { + const bg = window.getComputedStyle(cell).backgroundColor; + colorCounts.set(bg, (colorCounts.get(bg) || 0) + 1); + } + // The most common color is likely the empty cell color + let maxCount = 0; + let emptyBg: string | undefined; + for (const [color, count] of colorCounts) { + if (count > maxCount) { + maxCount = count; + emptyBg = color; + } + } + // Only use if it appears in > 60% of cells (most cells should be empty) + if (emptyBg && maxCount > cells.length * 0.6) return emptyBg; + return undefined; + } + // Strategy 2: look for a grid/flex container with child cells const containers = document.querySelectorAll( '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]' @@ -124,54 +186,120 @@ async function readDomGrid(page: Page): Promise<Grid | null> { for (const container of containers) { const children = container.children; // Could be a flat list of 200 cells (10x20) 
or 20 rows of 10 cells - if (children.length === rows * cols) { + if (children.length >= rows * cols - 10 && children.length <= rows * cols + 10) { + const actualCols = cols; + const actualRows = Math.round(children.length / actualCols); + const allCells = Array.from(children).slice(0, actualRows * actualCols) as HTMLElement[]; + const emptyBg = detectEmptyBg(allCells); const result: boolean[][] = []; - for (let r = 0; r < rows; r++) { + for (let r = 0; r < actualRows; r++) { const rowData: boolean[] = []; - for (let c = 0; c < cols; c++) { - const cell = children[r * cols + c] as HTMLElement; - const style = window.getComputedStyle(cell); - const bg = style.backgroundColor; - const cls = cell.className.toLowerCase(); - const isFilled = - (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") || - cls.includes("filled") || - cls.includes("active") || - cls.includes("block") || - cls.includes("piece") || - cell.dataset.filled === "true"; - rowData.push(isFilled); + for (let c = 0; c < actualCols; c++) { + const cell = allCells[r * actualCols + c]; + rowData.push(cell ? 
isCellFilled(cell, emptyBg) : false); } result.push(rowData); } return result; } // Could be 20 row containers each with 10 cells - if (children.length === rows) { - let valid = true; - const result: boolean[][] = []; - for (let r = 0; r < rows; r++) { - const rowEl = children[r]; - const cells = rowEl.children; - if (cells.length < cols) { valid = false; break; } - const rowData: boolean[] = []; - for (let c = 0; c < cols; c++) { - const cell = cells[c] as HTMLElement; - const style = window.getComputedStyle(cell); - const bg = style.backgroundColor; - const cls = cell.className.toLowerCase(); - const isFilled = - (bg !== "" && bg !== "rgba(0, 0, 0, 0)" && bg !== "transparent" && bg !== "rgb(0, 0, 0)") || - cls.includes("filled") || - cls.includes("active") || - cls.includes("block") || - cls.includes("piece") || - cell.dataset.filled === "true"; - rowData.push(isFilled); + if (children.length >= rows - 2 && children.length <= rows + 2) { + const firstRowCells = children[0]?.children; + if (firstRowCells && firstRowCells.length >= cols - 2 && firstRowCells.length <= cols + 2) { + const actualRows = children.length; + const actualCols = firstRowCells.length; + // Collect all cells for empty-bg detection + const allCells: HTMLElement[] = []; + for (let r = 0; r < actualRows; r++) { + const cells = children[r].children; + for (let c = 0; c < Math.min(actualCols, cells.length); c++) { + allCells.push(cells[c] as HTMLElement); + } + } + const emptyBg = detectEmptyBg(allCells); + let valid = true; + const result: boolean[][] = []; + for (let r = 0; r < actualRows; r++) { + const rowEl = children[r]; + const cells = rowEl.children; + if (cells.length < actualCols) { valid = false; break; } + const rowData: boolean[] = []; + for (let c = 0; c < actualCols; c++) { + rowData.push(isCellFilled(cells[c] as HTMLElement, emptyBg)); + } + result.push(rowData); + } + if (valid) return result; + } + } + } + + // Strategy 3: heuristic scan for ANY container with many same-sized + 
// children arranged in a grid pattern (no class/id naming required) + const allElements = document.querySelectorAll("div, section, main, article"); + for (const el of allElements) { + const ch = el.children; + // Flat list of ~200 cells + if (ch.length >= 180 && ch.length <= 220) { + const firstChild = ch[0] as HTMLElement; + if (!firstChild) continue; + const firstRect = firstChild.getBoundingClientRect(); + if (firstRect.width < 5 || firstRect.height < 5) continue; + let uniform = true; + for (let i = 1; i < Math.min(10, ch.length); i++) { + const r = (ch[i] as HTMLElement).getBoundingClientRect(); + if (Math.abs(r.width - firstRect.width) > 2 || Math.abs(r.height - firstRect.height) > 2) { + uniform = false; + break; + } + } + if (uniform) { + const actualCols = cols; + const actualRows = Math.round(ch.length / actualCols); + const allCells = Array.from(ch).slice(0, actualRows * actualCols) as HTMLElement[]; + const emptyBg = detectEmptyBg(allCells); + const result: boolean[][] = []; + for (let r = 0; r < actualRows; r++) { + const rowData: boolean[] = []; + for (let c = 0; c < actualCols; c++) { + const cell = allCells[r * actualCols + c]; + rowData.push(cell ? 
isCellFilled(cell, emptyBg) : false); + } + result.push(rowData); + } + return result; + } + } + // Container with ~20 row children each having ~10 cell children + if (ch.length >= 18 && ch.length <= 22) { + const firstRowCells = ch[0]?.children; + if (firstRowCells && firstRowCells.length >= 8 && firstRowCells.length <= 12) { + const rect = el.getBoundingClientRect(); + if (rect.width > 50 && rect.height > 100) { + const actualRows = ch.length; + const actualCols = firstRowCells.length; + const allCells: HTMLElement[] = []; + for (let r = 0; r < actualRows; r++) { + const cells = ch[r].children; + for (let c = 0; c < Math.min(actualCols, cells.length); c++) { + allCells.push(cells[c] as HTMLElement); + } + } + const emptyBg = detectEmptyBg(allCells); + let valid = true; + const result: boolean[][] = []; + for (let r = 0; r < actualRows; r++) { + const cells = ch[r].children; + if (cells.length < actualCols) { valid = false; break; } + const rowData: boolean[] = []; + for (let c = 0; c < actualCols; c++) { + rowData.push(isCellFilled(cells[c] as HTMLElement, emptyBg)); + } + result.push(rowData); + } + if (valid) return result; } - result.push(rowData); } - if (valid) return result; } } diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -458,16 +458,28 @@ async function testAllPiecesRotate( // Start the game (use camelCase startMechanism from CalibrationResult) if (cal.startMechanism === "button") { - const btn = page.locator("button").filter({ hasText: /start|play|begin|new/i }).first(); - if (await btn.count() > 0) await btn.click(); + const btn = page.locator("button, a, [role='button']").filter({ hasText: /start|play|begin|new|restart|reset/i }).first(); + if (await btn.count() > 0) { + await btn.click(); + } else { + // Fall back to clicking any button + const anyBtn = page.locator("button").first(); + if (await anyBtn.count() > 0) await anyBtn.click(); + } } else if (cal.startMechanism === "space") { await 
page.keyboard.press("Space"); } else if (cal.startMechanism === "enter") { await page.keyboard.press("Enter"); } else if (cal.startMechanism === "click_canvas") { - await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true }); + try { + await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true }); + } catch { + await page.locator("body").click({ position: { x: 200, y: 200 } }); + } + } else if (cal.startMechanism === "anykey") { + await page.keyboard.press("a"); } - await page.waitForTimeout(1000); + await page.waitForTimeout(1500); const rotatedPieces = new Set<string>(); const failedPieces = new Set<string>();

Impressum · Datenschutz