loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to find out what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 1d5cce537fb6c78ac946ca42dca010215a97e6fd
parent 4ce8d09103c723f23b4f1d266fe3aef143995996
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu,  9 Apr 2026 09:03:26 +0200

Language-agnostic start detection for gameplay bot

Rewrote start mechanism detection to be fully language-agnostic:
- No text string matching (removed btn/start/play selectors)
- Detects clickable elements by cursor:pointer, background color, size
- Sorts candidates by prominence (size, center proximity, contrast)
- Keyboard triggers (Enter, Space) tried before DOM buttons
- Wait times reduced: post-action waits 300ms -> 100ms, visual-change polling interval 200ms -> 100ms, post-Escape wait 100ms -> 50ms
- Overlay detection purely structural (position, z-index, viewport %)

Tested: Spanish DOM game now starts correctly (game_starts: PASS via space).
Spanish DOM game 2 scores 89% (up from 80%).
Canvas games still blocked by GPU pixel readback issue.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mtasks/tetris/eval/gameplay-bot/calibrate.ts | 256++++++++++++++++++++++++++++++++++++++++++-------------------------------------
1 file changed, 136 insertions(+), 120 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -361,12 +361,13 @@ interface StartDetectionResult { } /** - * 5-phase start detection. Language-agnostic, visual-first. + * 5-phase start detection. Fully language-agnostic, visual-first. + * No text matching of any kind -- detection is purely structural and behavioral. * - * Phase 1: Auto-start (1s, no input) - * Phase 2: DOM button discovery (click all clickable elements by prominence) - * Phase 3: Canvas click grid (for canvas-rendered buttons) - * Phase 4: Keyboard triggers (Enter, Space, ArrowDown, Z, combos) + * Phase 1: Auto-start (no input, visual change detection) + * Phase 2: Keyboard triggers (Enter, Space, ArrowDown, Z -- fast, universal) + * Phase 3: DOM button discovery (click all clickable elements by visual prominence) + * Phase 4: Canvas click grid (for canvas-rendered buttons) * Phase 5: Retry phases 2-4 (some games need two interactions) * * Total budget: 30 seconds. @@ -398,37 +399,43 @@ async function detectStartMechanism(page: Page): Promise<StartDetectionResult> { } } - // ---- Phase 2: DOM button discovery (language-agnostic) ---- - { - log("Phase 2: trying DOM buttons..."); - const phase2Result = await tryDomButtons(page, budgetExceeded); - log(`Phase 2 result: ${phase2Result ? 
`found=${phase2Result.mechanism}` : "none"}`); - if (phase2Result) return phase2Result; + // ---- Phase 2: Keyboard triggers (fast, language-agnostic) ---- + if (!budgetExceeded()) { + log("Phase 2: trying keyboard triggers..."); + const phase2Result = await tryKeyboardTriggers(page, budgetExceeded); + if (phase2Result) { + log(`Phase 2 result: found=${phase2Result.mechanism}`); + return phase2Result; + } + log("Phase 2 result: none"); } - // ---- Phase 3: Canvas click grid ---- + // ---- Phase 3: DOM button discovery (language-agnostic, visual-only) ---- if (!budgetExceeded()) { - const phase3Result = await tryCanvasClicks(page, budgetExceeded); + log("Phase 3: trying DOM buttons..."); + const phase3Result = await tryDomButtons(page, budgetExceeded); + log(`Phase 3 result: ${phase3Result ? `found=${phase3Result.mechanism}` : "none"}`); if (phase3Result) return phase3Result; } - // ---- Phase 4: Keyboard triggers ---- + // ---- Phase 4: Canvas click grid ---- if (!budgetExceeded()) { - const phase4Result = await tryKeyboardTriggers(page, budgetExceeded); + log("Phase 4: trying canvas clicks..."); + const phase4Result = await tryCanvasClicks(page, budgetExceeded); if (phase4Result) return phase4Result; } // ---- Phase 5: Retry phases 2-4 (some games need two interactions) ---- if (!budgetExceeded()) { - const phase2Retry = await tryDomButtons(page, budgetExceeded); + const phase2Retry = await tryKeyboardTriggers(page, budgetExceeded); if (phase2Retry) return phase2Retry; } if (!budgetExceeded()) { - const phase3Retry = await tryCanvasClicks(page, budgetExceeded); + const phase3Retry = await tryDomButtons(page, budgetExceeded); if (phase3Retry) return phase3Retry; } if (!budgetExceeded()) { - const phase4Retry = await tryKeyboardTriggers(page, budgetExceeded); + const phase4Retry = await tryCanvasClicks(page, budgetExceeded); if (phase4Retry) return phase4Retry; } @@ -436,95 +443,114 @@ async function detectStartMechanism(page: Page): Promise<StartDetectionResult> { 
} /** - * Phase 2: Find all clickable DOM elements (language-agnostic, no text matching). - * Sort by visual prominence (size, centrality). Click each and observe. + * Phase 2: Find all clickable DOM elements (fully language-agnostic, no text matching). + * Finds buttons, anchors, role=button, onclick, and cursor:pointer elements. + * Sort by visual prominence (size, centrality, contrast). Click each and observe. */ async function tryDomButtons( page: Page, budgetExceeded: () => boolean ): Promise<StartDetectionResult | null> { try { - // Collect all clickable elements - const clickableSelector = - 'button, a, [role="button"], [onclick], input[type="button"], input[type="submit"]'; - const visualSelector = - '[class*="btn"], [class*="button"], [class*="start"], [class*="play"], ' + - '[id*="start"], [id*="play"], [id*="btn"]'; - - // Gather element info (position, size, text) for sorting - const elementInfos = await page.evaluate( - ({ clickableSelector, visualSelector }) => { - const seen = new Set<Element>(); - const results: Array<{ - index: number; - text: string; - x: number; - y: number; - width: number; - height: number; - area: number; - centerDist: number; - selector: string; - }> = []; - - const allElements: Element[] = []; - for (const el of document.querySelectorAll(clickableSelector)) { - if (!seen.has(el)) { - seen.add(el); - allElements.push(el); - } - } - for (const el of document.querySelectorAll(visualSelector)) { - if (!seen.has(el)) { + // Gather element info (position, size, text) for sorting -- purely structural/visual + const elementInfos = await page.evaluate(() => { + const seen = new Set<Element>(); + const results: Array<{ + index: number; + text: string; + x: number; + y: number; + width: number; + height: number; + area: number; + centerDist: number; + selector: string; + hasBackground: boolean; + }> = []; + + // Phase A: structural clickable elements (type-based, no text matching) + const clickableSelector = + 'button, a, 
[role="button"], [onclick], input[type="button"], input[type="submit"]'; + for (const el of document.querySelectorAll(clickableSelector)) { + if (!seen.has(el)) seen.add(el); + } + + // Phase B: elements with cursor:pointer computed style (catches custom divs/spans acting as buttons) + const allEls = document.querySelectorAll("*"); + for (const el of allEls) { + if (seen.has(el)) continue; + try { + const style = window.getComputedStyle(el); + if (style.cursor === "pointer") { seen.add(el); - allElements.push(el); } - } + } catch { /* skip */ } + } - const pageW = window.innerWidth; - const pageH = window.innerHeight; - const pageCenterX = pageW / 2; - const pageCenterY = pageH / 2; - - for (let i = 0; i < allElements.length; i++) { - const el = allElements[i]; - const rect = el.getBoundingClientRect(); - if (rect.width < 5 || rect.height < 5) continue; - if (rect.top > pageH || rect.left > pageW) continue; - - const cx = rect.left + rect.width / 2; - const cy = rect.top + rect.height / 2; - const centerDist = Math.sqrt((cx - pageCenterX) ** 2 + (cy - pageCenterY) ** 2); - - let selector = ""; - if (el.id) { - selector = `#${el.id}`; - } else if ((el as HTMLElement).className) { - const cls = (el as HTMLElement).className.toString().split(" ")[0]; - if (cls) selector = `${el.tagName.toLowerCase()}.${cls}`; + const pageW = window.innerWidth; + const pageH = window.innerHeight; + const pageCenterX = pageW / 2; + const pageCenterY = pageH / 2; + + let idx = 0; + for (const el of seen) { + const rect = el.getBoundingClientRect(); + if (rect.width < 5 || rect.height < 5) continue; + if (rect.top > pageH || rect.left > pageW) continue; + // Skip elements that cover most of the viewport (overlays, not buttons) + if (rect.width > pageW * 0.8 && rect.height > pageH * 0.8) continue; + + const cx = rect.left + rect.width / 2; + const cy = rect.top + rect.height / 2; + const centerDist = Math.sqrt((cx - pageCenterX) ** 2 + (cy - pageCenterY) ** 2); + + // Check if element has 
a distinct background (high contrast, likely a button) + let hasBackground = false; + try { + const style = window.getComputedStyle(el as HTMLElement); + const bg = style.backgroundColor; + // transparent or rgba(0,0,0,0) means no background + if (bg && bg !== "transparent" && bg !== "rgba(0, 0, 0, 0)") { + hasBackground = true; } - if (!selector) selector = `${el.tagName.toLowerCase()}:nth-of-type(${i + 1})`; - - results.push({ - index: i, - text: (el.textContent || "").trim().slice(0, 50), - x: Math.round(cx), - y: Math.round(cy), - width: rect.width, - height: rect.height, - area: rect.width * rect.height, - centerDist, - selector, - }); + } catch { /* skip */ } + + let selector = ""; + if (el.id) { + selector = `#${el.id}`; + } else if ((el as HTMLElement).className) { + const cls = (el as HTMLElement).className.toString().split(" ")[0]; + if (cls) selector = `${el.tagName.toLowerCase()}.${cls}`; } + if (!selector) selector = `${el.tagName.toLowerCase()}:nth-of-type(${idx + 1})`; + + results.push({ + index: idx, + text: (el.textContent || "").trim().slice(0, 50), + x: Math.round(cx), + y: Math.round(cy), + width: rect.width, + height: rect.height, + area: rect.width * rect.height, + centerDist, + selector, + hasBackground, + }); + idx++; + } - // Sort by visual prominence: larger elements first, then closer to center - results.sort((a, b) => b.area - a.area || a.centerDist - b.centerDist); + // Sort by visual prominence: + // 1. Elements with background first (more likely to be buttons) + // 2. Larger elements first + // 3. Closer to center preferred + results.sort((a, b) => { + if (a.hasBackground !== b.hasBackground) return a.hasBackground ? 
-1 : 1; + if (Math.abs(b.area - a.area) > 100) return b.area - a.area; + return a.centerDist - b.centerDist; + }); - return results; - }, - { clickableSelector, visualSelector } - ); + return results; + }); console.log(`[start-detect] Phase 2: found ${elementInfos.length} clickable elements`); // Click each element and observe for visual change @@ -546,9 +572,9 @@ async function tryDomButtons( const before = await page.screenshot(); console.log(`[start-detect] Clicking "${info.text}" (${info.selector}) at (${info.x},${info.y}), before=${before.length} bytes`); await page.mouse.click(info.x, info.y); - await page.waitForTimeout(300); + await page.waitForTimeout(100); - const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); console.log(`[start-detect] After click "${info.text}": changed=${result.changed}`); if (result.changed) { // Check if the element disappeared after clicking @@ -581,7 +607,7 @@ async function tryDomButtons( // No change -- try pressing Escape to undo any menu we opened try { await page.keyboard.press("Escape"); - await page.waitForTimeout(100); + await page.waitForTimeout(50); } catch { /* ignore */ } } catch { /* continue to next element */ } } @@ -646,9 +672,9 @@ async function tryCanvasClicks( try { const before = await page.screenshot(); await page.mouse.click(pos.x, pos.y); - await page.waitForTimeout(300); + await page.waitForTimeout(100); - const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { return { mechanism: "click_canvas", @@ -689,9 +715,9 @@ async function tryKeyboardTriggers( try { const before = await page.screenshot(); await page.keyboard.press(key); - await page.waitForTimeout(300); + await page.waitForTimeout(100); - const result = await detectVisualChange(page, { 
frames: 3, intervalMs: 200, before }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { return { mechanism: mechanismMap[key] }; } @@ -715,9 +741,9 @@ async function tryKeyboardTriggers( } await page.waitForTimeout(100); await page.keyboard.press(key); - await page.waitForTimeout(300); + await page.waitForTimeout(100); - const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { return { mechanism: mechanismMap[key] }; } @@ -786,10 +812,10 @@ async function recalibrateWithRetry( try { const before = await page.screenshot(); await attempt.action(); - await page.waitForTimeout(300); + await page.waitForTimeout(100); if (startMechanism === "unknown") { - const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 100, before }); if (result.changed) { startMechanism = attempt.name; } @@ -1180,7 +1206,7 @@ async function detectScoreElement(page: Page): Promise<string | null> { export async function surveyPage(page: Page): Promise<SurveyData> { try { const data = await page.evaluate(() => { - // Check for full-screen overlay + // Check for full-screen overlay (language-agnostic: purely structural detection) let hasOverlay = false; const allEls = document.querySelectorAll("*"); const vw = window.innerWidth; @@ -1192,20 +1218,10 @@ export async function surveyPage(page: Page): Promise<SurveyData> { const zIndex = parseInt(style.zIndex, 10); if (zIndex > 0 || style.zIndex === "auto") { const rect = (el as HTMLElement).getBoundingClientRect(); - if (rect.width > vw * 0.8 && rect.height > vh * 0.8) { - const text = ((el as HTMLElement).innerText || "").toLowerCase(); - if ( - text.includes("start") || - text.includes("play") || - text.includes("enter") || - text.includes("press") 
|| - text.includes("begin") || - text.includes("click") || - text.length < 5 // empty overlay - ) { - hasOverlay = true; - break; - } + if (rect.width > vw * 0.5 && rect.height > vh * 0.5) { + // Large positioned overlay detected -- no text matching needed + hasOverlay = true; + break; } } }

Impressum · Datenschutz