loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 7fbe88ce2a1febb0954305d10f4e1878570e0f14
parent 53c719fefdf6f437deb0b34eb1b8dbff56d06643
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu,  9 Apr 2026 11:48:33 +0200

Verify game interactivity via DOM + screenshot after start detection

Start detection now requires the game to respond to gameplay inputs
(ArrowLeft/Right/Down) before confirming that a mechanism worked. A
response is either a screenshot change or a DOM state change (class
names, styles on grid children). This catches:
- False starts from Space (visual change but not interactive)
- Games that rebuild DOM via innerHTML (screenshot identical but DOM differs)

Spanish game e2e04e75 is now correctly detected as starting via the
"Iniciar Juego" button (previously it was falsely detected as starting
via Space). Its score went from 18% to 75%.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/calibrate.ts | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 109 insertions(+), 12 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -314,6 +314,75 @@ async function detectVisualChange( } /** + * Verify that the game is actually interactive -- gameplay inputs cause + * visible state changes. This distinguishes a truly started game from + * animations, overlays, or other false positives. + * + * Sends ArrowLeft then ArrowRight and checks if the page responds. + * A game that started will move a piece; a static page won't change. + */ +async function verifyInteractivity(page: Page): Promise<boolean> { + try { + // Wait for at least one render frame before baseline + await page.waitForTimeout(200); + + // Capture baseline: both screenshot and DOM state + const baseline = await page.screenshot(); + const domBefore = await page.evaluate(() => { + // Snapshot the largest grid-like container's child class/style state + const candidates = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], table' + ); + let best = ""; + for (const el of candidates) { + const snap = Array.from(el.children).map(c => + (c as HTMLElement).className + (c as HTMLElement).style.cssText + ).join("|"); + if (snap.length > best.length) best = snap; + } + // Also capture body innerHTML hash as fallback + if (!best) best = document.body.innerHTML.substring(0, 5000); + return best; + }); + + // Try multiple inputs + for (const key of ["ArrowLeft", "ArrowRight", "ArrowDown"]) { + await page.keyboard.press(key); + await page.waitForTimeout(200); + + // Check screenshot change + const after = await page.screenshot(); + if (!baseline.equals(after)) { + return true; + } + + // Check DOM state change (catches games where screenshot is identical + // but DOM classes/styles changed -- e.g. 
innerHTML-rebuilt grids) + const domAfter = await page.evaluate(() => { + const candidates = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], table' + ); + let best = ""; + for (const el of candidates) { + const snap = Array.from(el.children).map(c => + (c as HTMLElement).className + (c as HTMLElement).style.cssText + ).join("|"); + if (snap.length > best.length) best = snap; + } + if (!best) best = document.body.innerHTML.substring(0, 5000); + return best; + }); + if (domAfter !== domBefore) { + return true; + } + } + return false; + } catch { + return false; + } +} + +/** * Cluster adjacent points using flood fill. * Two points are adjacent if they differ by at most 1 in both row and column. */ @@ -395,7 +464,11 @@ async function detectStartMechanism(page: Page): Promise<StartDetectionResult> { const result = await detectVisualChange(page, { frames: 6, intervalMs: 200 }); log(`Phase 1 result: changed=${result.changed}`); if (result.changed) { - return { mechanism: "auto" }; + const interactive = await verifyInteractivity(page); + if (interactive) { + return { mechanism: "auto" }; + } + log("Phase 1: visual change detected but game not interactive (animation?)"); } } @@ -577,6 +650,17 @@ async function tryDomButtons( const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); console.log(`[start-detect] After click "${info.text}": changed=${result.changed}`); if (result.changed) { + // Wait for the game to fully initialize after button click + await page.waitForTimeout(300); + // Verify the game is actually interactive after clicking this button + const interactive = await verifyInteractivity(page); + if (!interactive) { + console.log(`[start-detect] Button "${info.text}" caused visual change but game not interactive, continuing...`); + // Try pressing Escape to undo and continue + try { await page.keyboard.press("Escape"); await page.waitForTimeout(50); } catch {} + 
continue; + } + // Check if the element disappeared after clicking const disappeared = await page.evaluate( ({ selector }) => { @@ -676,15 +760,19 @@ async function tryCanvasClicks( const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { - return { - mechanism: "click_canvas", - startButton: { - selector: "canvas", - text: `canvas click at ${pos.label}`, - disappeared: false, - position: { x: Math.round(pos.x), y: Math.round(pos.y) }, - }, - }; + const interactive = await verifyInteractivity(page); + if (interactive) { + return { + mechanism: "click_canvas", + startButton: { + selector: "canvas", + text: `canvas click at ${pos.label}`, + disappeared: false, + position: { x: Math.round(pos.x), y: Math.round(pos.y) }, + }, + }; + } + console.log(`[start-detect] Canvas click at ${pos.label} caused change but not interactive`); } } catch { /* continue */ } } @@ -719,7 +807,12 @@ async function tryKeyboardTriggers( const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { - return { mechanism: mechanismMap[key] }; + // Verify the game is actually interactive, not just an animation + const interactive = await verifyInteractivity(page); + if (interactive) { + return { mechanism: mechanismMap[key] }; + } + console.log(`[start-detect] ${key} caused visual change but game not interactive, continuing...`); } } catch { /* continue */ } } @@ -745,7 +838,11 @@ async function tryKeyboardTriggers( const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { - return { mechanism: mechanismMap[key] }; + const interactive = await verifyInteractivity(page); + if (interactive) { + return { mechanism: mechanismMap[key] }; + } + console.log(`[start-detect] ${key}+click caused visual change but game not interactive, continuing...`); } } catch { /* continue */ } }

Impressum · Datenschutz