loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 8dc9ec566791cf32913b7ea8f3ba37a789ef0b86
parent 14d5747dc2f92d77eaa7655145f2fe71bcde4d0a
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 10 Apr 2026 18:58:30 +0200

V2 fix: handle absolute-positioned active piece overlays

Game 8fe72fce uses absolute-positioned div overlays for the falling
piece, separate from the 200 grid cells. The grid reader was missing
the active piece because it only read the first 200 children.

Fix:
- Added refreshGridDetection() in driver: re-detects grid without
  full re-calibration, called by verifyGameStarted() after start clicks
- readDomGrid() now also reads the children that follow the static grid
  cells (containers with more than 200 children) and computes which grid
  cell each absolute-positioned overlay falls into
- Widened child-count ranges from 180-220 to 180-230 to accommodate overlays
- Added screenshotGridArea() and captureGridDomFingerprint() as fallback
  signals for verifyGameStarted() when grid-based detection misses the
  active piece

Results: 8fe72fce 0% -> 95% (matches human's 20/20).
Overall V2 vs human: 82% -> 86% agreement.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/bot.ts | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M tasks/tetris/eval/gameplay-bot-v2/driver.ts | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M tasks/tetris/eval/gameplay-bot-v2/types.ts | 27 +++++++++++++++++++++++++++
3 files changed, 305 insertions(+), 10 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -1915,8 +1915,15 @@ async function verifyGameStarted(driver: TetrisDriver): Promise<{ } } catch { /* continue */ } - // 2. tryStartMechanism() populated a minimal calibration for us. If it - // couldn't find a grid, the candidate is not a real start. + // 2. tryStartMechanism() populated a minimal calibration for us, but some + // games create their grid cells dynamically inside an animation frame + // after the start button is clicked -- the initial detectGrid() can run + // before the grid is fully populated. Refresh grid detection now so we + // pick up any cells that appeared in the meantime. + try { + await driver.refreshGridDetection(); + } catch { /* leave whatever tryStartMechanism populated */ } + let cal; try { cal = driver.getCalibration(); @@ -1947,26 +1954,55 @@ async function verifyGameStarted(driver: TetrisDriver): Promise<{ } // 4. Evidence: press ArrowLeft and see if the grid changes (movement works). + // Capture pixel + DOM fingerprint snapshots at each step so we can fall + // back to either signal for games that render the active piece outside + // the cell layout (e.g. absolute-positioned divs floating over the grid). + // Pixel diff is clipped to the grid area; DOM fingerprint catches changes + // even when the piece is currently off-screen. 
let movementSeen = false; + let movementPixelsChanged = false; + let movementDomChanged = false; try { + const pxBefore = await driver.screenshotGridArea(); + const fpBefore = await driver.captureGridDomFingerprint(); const before = await driver.readGrid(); await driver.pressKey("left"); await driver.wait(250); const after = await driver.readGrid(); + const pxAfter = await driver.screenshotGridArea(); + const fpAfter = await driver.captureGridDomFingerprint(); if (before.grid && after.grid && driver.gridsAreDifferent(before.grid, after.grid)) { movementSeen = true; } + if (pxBefore && pxAfter && !pxBefore.equals(pxAfter)) { + movementPixelsChanged = true; + } + if (fpBefore && fpAfter && fpBefore !== fpAfter) { + movementDomChanged = true; + } } catch { /* fall through to auto-drop check */ } // 5. Evidence: wait 1.1s and see if the grid changes on its own (auto-drop). let autoDropSeen = false; + let autoDropPixelsChanged = false; + let autoDropDomChanged = false; try { + const pxBefore = await driver.screenshotGridArea(); + const fpBefore = await driver.captureGridDomFingerprint(); const before = await driver.readGrid(); await driver.wait(1100); const after = await driver.readGrid(); + const pxAfter = await driver.screenshotGridArea(); + const fpAfter = await driver.captureGridDomFingerprint(); if (before.grid && after.grid && driver.gridsAreDifferent(before.grid, after.grid)) { autoDropSeen = true; } + if (pxBefore && pxAfter && !pxBefore.equals(pxAfter)) { + autoDropPixelsChanged = true; + } + if (fpBefore && fpAfter && fpBefore !== fpAfter) { + autoDropDomChanged = true; + } } catch { /* fall through */ } // 6. Second chance at game-over after interaction. @@ -1987,6 +2023,32 @@ async function verifyGameStarted(driver: TetrisDriver): Promise<{ return { ok: true, reason: "grid changes on its own (auto-drop)" }; } + // 6b. 
Pixel-based fallback: if the grid reader can't see movement but the + // grid-area pixels changed both on key press AND during auto-drop, we're + // almost certainly looking at a running Tetris game that renders its + // active piece outside the cell layout (absolute divs, canvas overlay, + // etc). Require BOTH signals to avoid accepting spurious animations + // (cursor blink, score tick) as gameplay. + if (movementPixelsChanged && autoDropPixelsChanged) { + return { + ok: true, + reason: "grid-area pixels change on key press and on auto-drop (piece rendered outside cells)", + }; + } + + // 6c. DOM-fingerprint fallback: when the active piece is an absolute- + // positioned overlay that happens to be off-screen in the current + // viewport (tall sidebars that push the grid out of frame), pixel diff + // can come back clean while the DOM still reflects the moving piece. + // Require BOTH a key-press-driven change AND an auto-drop-driven change + // so static pages with idle timers don't slip through. + if (movementDomChanged && autoDropDomChanged) { + return { + ok: true, + reason: "grid container DOM changes on key press and on auto-drop (piece rendered outside cells)", + }; + } + // 7. Weaker fallback: if the grid is populated in a plausible range // (some pieces visible somewhere) and there's no game over, accept it // provisionally. The downstream phases will weed out dead starts. 
diff --git a/tasks/tetris/eval/gameplay-bot-v2/driver.ts b/tasks/tetris/eval/gameplay-bot-v2/driver.ts @@ -349,7 +349,7 @@ export class PlaywrightDriver implements TetrisDriver { for (const container of containers) { const ch = container.children; if ( - (ch.length >= 180 && ch.length <= 220) || + (ch.length >= 180 && ch.length <= 230) || (ch.length >= 18 && ch.length <= 22 && ch[0]?.children.length >= 8 && ch[0]?.children.length <= 12) ) { @@ -372,7 +372,7 @@ export class PlaywrightDriver implements TetrisDriver { const allElements = document.querySelectorAll("div, section, main, article"); for (const el of allElements) { const ch = el.children; - if (ch.length >= 180 && ch.length <= 220) { + if (ch.length >= 180 && ch.length <= 230) { const firstChild = ch[0] as HTMLElement; if (!firstChild) continue; const firstRect = firstChild.getBoundingClientRect(); @@ -725,6 +725,62 @@ export class PlaywrightDriver implements TetrisDriver { return this.cal!; } + /** + * Lightweight grid re-detection without any side effects. Unlike + * recalibrate() / calibrate(), this never clicks, presses keys, or + * runs detectStartMechanism(). Safe to call from verifyGameStarted() + * mid-start-discovery -- if the page has since spawned its grid (e.g. + * a DOM game that builds cells inside requestAnimationFrame after a + * start button click), the cached calibration gets updated; otherwise + * this.cal is left untouched. + */ + async refreshGridDetection(): Promise<void> { + // Short settle delay: some games build their grid inside the first few + // animation frames after startGame() runs, so the initial detectGrid() + // inside tryStartMechanism() may have fired before the DOM was ready. + await this.page.waitForTimeout(200); + const gridDetection = await this.detectGrid(); + if (!gridDetection.gridBounds) return; + + const backgroundColor = + gridDetection.renderer === "canvas" + ? 
await this.sampleBackgroundColor( + gridDetection.gridBounds, + gridDetection.cellWidth, + gridDetection.cellHeight + ) + : null; + + if (this.cal) { + this.cal = { + ...this.cal, + renderer: gridDetection.renderer, + gridDetected: true, + gridBounds: gridDetection.gridBounds, + cellWidth: gridDetection.cellWidth, + cellHeight: gridDetection.cellHeight, + backgroundColor, + gridDetectedAt: "after_start", + }; + } else { + this.cal = { + renderer: gridDetection.renderer, + gridDetected: true, + gridBounds: gridDetection.gridBounds, + cellWidth: gridDetection.cellWidth, + cellHeight: gridDetection.cellHeight, + controls: { ...DEFAULT_CONTROLS }, + startMechanism: "unknown", + scoreElementSelector: null, + levelElementSelector: null, + backgroundColor, + consoleErrors: [...this.consoleErrors], + gridConfidence: 0, + gridDetectedAt: "after_start", + }; + } + } + getCalibration(): DriverCalibration { if (!this.cal) throw new Error("calibrate() must be called before getCalibration()"); return this.cal; @@ -1330,6 +1386,108 @@ export class PlaywrightDriver implements TetrisDriver { return await this.page.screenshot(); } + async screenshotGridArea(): Promise<Buffer | null> { + const cal = this.cal; + if (!cal || !cal.gridBounds) return null; + const b = cal.gridBounds; + // For DOM renderers, gridBounds are viewport coordinates and can be clipped + // directly. For canvas renderers they are internal canvas coordinates, so + // re-derive the on-page bounds from the canvas location to stay accurate. 
+ try { + if (cal.renderer === "canvas") { + const boundingBox = await this.page.locator("canvas").first().boundingBox(); + if (!boundingBox) return null; + return await this.page.screenshot({ + clip: { + x: Math.max(0, Math.round(boundingBox.x)), + y: Math.max(0, Math.round(boundingBox.y)), + width: Math.max(1, Math.round(boundingBox.width)), + height: Math.max(1, Math.round(boundingBox.height)), + }, + }); + } + return await this.page.screenshot({ + clip: { + x: Math.max(0, Math.round(b.x)), + y: Math.max(0, Math.round(b.y)), + width: Math.max(1, Math.round(b.width)), + height: Math.max(1, Math.round(b.height)), + }, + }); + } catch { + return null; + } + } + + async captureGridDomFingerprint(): Promise<string> { + try { + return await this.page.evaluate(() => { + // Locate the most plausible grid container. Mirrors the detection in + // detectGrid() but runs standalone so the fingerprint works even when + // the calibration has not committed to a specific grid yet. + const findContainer = (): Element | null => { + const tables = document.querySelectorAll("table"); + for (const table of tables) { + const rows = table.querySelectorAll("tr"); + if (rows.length >= 18) { + const firstRow = rows[0].querySelectorAll("td"); + if (firstRow.length >= 8 && firstRow.length <= 12) return table; + } + } + const namedCandidates = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], ' + + '[id*="board"], [id*="grid"], [id*="field"]' + ); + for (const c of namedCandidates) { + const ch = c.children; + if (ch.length >= 180 && ch.length <= 230) return c; + if ( + ch.length >= 18 && ch.length <= 22 && + ch[0] && ch[0].children.length >= 8 && ch[0].children.length <= 12 + ) { + return c; + } + } + // Heuristic scan for any container with ~200 uniform children. 
+ const allElements = document.querySelectorAll("div, section, main, article"); + for (const el of allElements) { + const ch = el.children; + if (ch.length >= 180 && ch.length <= 230) return el; + } + return null; + }; + + const container = findContainer(); + if (!container) return ""; + + const parts: string[] = []; + parts.push(`count=${container.children.length}`); + + // Serialize each child: class, inline position, inline background. + // Only inline styles (not computed) -- avoids paying for + // getComputedStyle() on 200+ elements per fingerprint, and in practice + // absolute-positioned piece overlays always use inline top/left/bg. + let i = 0; + for (const child of container.children) { + if (i >= 260) break; // hard cap to bound work + const el = child as HTMLElement; + const cls = el.className || ""; + const style = el.style; + const left = style.left || ""; + const top = style.top || ""; + const bg = style.backgroundColor || ""; + const color = style.getPropertyValue("--color") || ""; + parts.push(`${i}:${cls}:${left}:${top}:${bg}:${color}`); + i++; + } + + return parts.join("|"); + }); + } catch { + return ""; + } + } + async measureDropInterval(): Promise<number> { try { const intervals: number[] = []; @@ -1482,16 +1640,61 @@ export class PlaywrightDriver implements TetrisDriver { } } + // Overlay detection: some games render the active piece as absolute- + // positioned sibling divs inside the grid container (so they float + // over the static cell grid). These are NOT part of the cell loop but + // their position tells us which grid cell they occupy. Called after + // building the cell-based grid; only overwrites empty cells. 
+ function applyOverlayPieces( + container: Element, cellGrid: boolean[][], cellsConsumed: number, + actualRows: number, actualCols: number + ): void { + const containerRect = container.getBoundingClientRect(); + if (containerRect.width <= 0 || containerRect.height <= 0) return; + const cellW = containerRect.width / actualCols; + const cellH = containerRect.height / actualRows; + if (cellW < 5 || cellH < 5) return; + + const allChildren = container.children; + for (let i = cellsConsumed; i < allChildren.length; i++) { + const el = allChildren[i] as HTMLElement; + const style = window.getComputedStyle(el); + // Skip statically-positioned siblings -- we only want pieces + // that float over the grid. + if (style.position !== "absolute" && style.position !== "fixed") continue; + const rect = el.getBoundingClientRect(); + if (rect.width <= 0 || rect.height <= 0) continue; + // Center of the overlay element, relative to the container + const cx = rect.left + rect.width / 2 - containerRect.left; + const cy = rect.top + rect.height / 2 - containerRect.top; + const col = Math.floor(cx / cellW); + const row = Math.floor(cy / cellH); + if (row < 0 || row >= actualRows || col < 0 || col >= actualCols) continue; + cellGrid[row][col] = true; + } + } + // Strategy 2: named grid containers const containers = document.querySelectorAll( '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]' ); for (const container of containers) { const children = container.children; - if (children.length >= rows * cols - 10 && children.length <= rows * cols + 10) { + // Container has rows*cols (+/- 10) static cells, optionally followed + // by a handful of absolute-positioned children that act as piece + // overlays (e.g. the active piece rendered on top of the static grid). + // Accept up to 30 extra children beyond the cell count. 
+ if (children.length >= rows * cols - 10 && children.length <= rows * cols + 30) { const actualCols = cols; - const actualRows = Math.round(children.length / actualCols); - const allCells = Array.from(children).slice(0, actualRows * actualCols) as HTMLElement[]; + // If we're at the short end of the range, fall back to the old + // behaviour and derive actualRows from the child count. Otherwise + // assume the extras are overlays and use the full grid dimensions. + const isShortGrid = children.length <= rows * cols + 4; + const actualRows = isShortGrid + ? Math.round(children.length / actualCols) + : rows; + const cellsConsumed = actualRows * actualCols; + const allCells = Array.from(children).slice(0, cellsConsumed) as HTMLElement[]; const emptyBg = detectEmptyBg(allCells); const result: boolean[][] = []; for (let r = 0; r < actualRows; r++) { @@ -1502,6 +1705,9 @@ export class PlaywrightDriver implements TetrisDriver { } result.push(rowData); } + // Overlay detection is a no-op when there are no extra children + // past the static cell grid, so it's safe to always call. 
+ applyOverlayPieces(container, result, cellsConsumed, actualRows, actualCols); return result; } if (children.length >= rows - 2 && children.length <= rows + 2) { @@ -1533,7 +1739,7 @@ export class PlaywrightDriver implements TetrisDriver { const allElements = document.querySelectorAll("div, section, main, article"); for (const el of allElements) { const ch = el.children; - if (ch.length >= 180 && ch.length <= 220) { + if (ch.length >= 180 && ch.length <= 230) { const firstChild = ch[0] as HTMLElement; if (!firstChild) continue; const firstRect = firstChild.getBoundingClientRect(); @@ -2141,7 +2347,7 @@ export class PlaywrightDriver implements TetrisDriver { ); for (const container of containers) { const children = container.children; - if (children.length >= 180 && children.length <= 220) { + if (children.length >= 180 && children.length <= 230) { const rect = container.getBoundingClientRect(); return { type: "dom" as const, bounds: { x: rect.x, y: rect.y, width: rect.width, height: rect.height }, rows: Math.round(children.length / 10), cols: 10 }; } @@ -2158,7 +2364,7 @@ export class PlaywrightDriver implements TetrisDriver { const allElements = document.querySelectorAll("div, section, main, article"); for (const el of allElements) { const ch = el.children; - if (ch.length >= 180 && ch.length <= 220) { + if (ch.length >= 180 && ch.length <= 230) { const firstChild = ch[0] as HTMLElement; if (!firstChild) continue; const firstRect = firstChild.getBoundingClientRect(); diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts @@ -137,6 +137,14 @@ export interface TetrisDriver { surveyPage(): Promise<SurveyData>; calibrate(): Promise<DriverCalibration>; recalibrate(): Promise<DriverCalibration>; + /** + * Lightweight, side-effect-free grid re-detection. Does NOT click, press + * keys, or run start-mechanism detection. 
If the page has since spawned + * its grid (common with DOM games that build cells in requestAnimationFrame + * after a start button click), the cached calibration is updated; otherwise + * the current calibration is left untouched. + */ + refreshGridDetection(): Promise<void>; getCalibration(): DriverCalibration; // -- Start mechanism discovery/verification bridge -- @@ -179,6 +187,25 @@ export interface TetrisDriver { // -- Screenshots -- screenshot(): Promise<Buffer>; + /** + * Screenshot clipped to the grid area (uses current calibration's gridBounds). + * Returns null if no grid has been detected yet. Useful as a fallback for + * verifyGameStarted when the grid reader can't see active pieces because the + * game renders them outside the cell layout (e.g. absolute-positioned divs + * floating over the grid). + */ + screenshotGridArea(): Promise<Buffer | null>; + /** + * Compute a string fingerprint of the grid container's DOM state, capturing + * child count, class names, and inline position styles. Used by the start + * verification fallback to detect piece movement for games that render the + * active piece as absolute-positioned divs outside the cell layout -- those + * changes are invisible to the grid reader and may be off-screen for pixel + * diffs, but they always show up in the DOM. + * + * Returns an empty string if no grid container could be located. + */ + captureGridDomFingerprint(): Promise<string>; measureDropInterval(): Promise<number>; }

Impressum · Datenschutz