loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 71f0c4b7931a2d58ae370587a6a0f62ba352b716
parent e0a13b62466491d470aedf8a0eb251d5075f2951
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat, 11 Apr 2026 07:28:13 +0200

V2: language-agnostic game over detection, capture in Phase 6

Two fixes:

1. Game over display detection happens in Phase 6 (when game over is
   actually triggered) and is stored on session. Phase 8 no longer needs
   to re-trigger game over. Added gameOverText and gameOverRestartAvailable
   to GameSession.

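2. detectGameOverText() and detectRestartOption() are now language-agnostic.
   Instead of matching text patterns, they detect structural modals:
   visible position fixed/absolute elements covering at least 15% of the
   viewport. detectGameOverText() additionally requires a visible
   background or some content, rejects elements with an explicit z-index
   below 1, and returns the marker "modal" rather than extracted text.
   Restart detection finds visible clickable elements (buttons, links,
   role=button, cursor: pointer) inside such a modal.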

Also updated CLAUDE.md with an explicit "driver MUST NOT hard-code language
strings" convention.

Known follow-ups: readLevel, detectNextPiecePreview, detectControls, and
score element search still use text matching. These need language-agnostic
replacements.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M CLAUDE.md | 5 +++++
M tasks/tetris/eval/gameplay-bot-v2/bot.ts | 44 ++++++++++++++++++++++++++++++++++++++++----
M tasks/tetris/eval/gameplay-bot-v2/driver.ts | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M tasks/tetris/eval/gameplay-bot-v2/types.ts | 4 ++++
4 files changed, 134 insertions(+), 29 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md @@ -126,6 +126,11 @@ Short URL IDs: 8-char SHA256 hash for `/r/` and `/c/` routes with redirect pages - Pre-push hook verifies dashboard build - Provider must be explicit (--provider flag required) - GLM models use real names (glm-4.5-air), never mapped to haiku/sonnet/opus +- **Gameplay bot driver MUST NOT hard-code language strings.** No text matching + for start buttons, game over detection, restart buttons, score/level labels, + or any other UI element. Detection must be purely structural (DOM structure, + element properties, visual changes, behavioral response to input). The bot + should work for games in any language without code changes. ## TODO diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -365,6 +365,8 @@ export async function runAllTests( movementsObserved: 0, hardDropsObserved: 0, gameOverDetected: false, + gameOverText: null, + gameOverRestartAvailable: false, consoleErrors: [], durationSeconds: 0, pieceTypes: new Set<string>(), @@ -838,6 +840,21 @@ async function runGameOverPhase( const MAX_DROPS = 40; const BATCH_SIZE = 5; + // Capture game over text + restart option immediately after triggering game + // over, before anything (e.g. Phase 8 page reload) can clear it. 
+ const captureGameOverDisplay = async (): Promise<void> => { + try { + session.gameOverText = await driver.detectGameOverText(); + } catch { + session.gameOverText = null; + } + try { + session.gameOverRestartAvailable = await driver.detectRestartOption(); + } catch { + session.gameOverRestartAvailable = false; + } + }; + for (let i = 0; i < MAX_DROPS; i++) { await driver.pressKey("drop"); await driver.wait(150); @@ -858,6 +875,7 @@ async function runGameOverPhase( if (!driver.gridsAreDifferent(snap.grid, snapAfter.grid)) { session.gameOverDetected = true; session.events.push({ type: "game_over", frame: session.frames }); + await captureGameOverDisplay(); return; } } @@ -876,6 +894,13 @@ async function runGameOverPhase( if (finalSnap.grid && finalSnap.filledCount > 10) { session.gameOverDetected = true; session.events.push({ type: "game_over", frame: session.frames }); + // We already have the text; capture restart too. + session.gameOverText = gameOverText; + try { + session.gameOverRestartAvailable = await driver.detectRestartOption(); + } catch { + session.gameOverRestartAvailable = false; + } } } } @@ -1848,19 +1873,30 @@ function deriveTestResults( } // 22. game_over_display + // Prefer the Phase 6 capture (taken immediately after game over was + // triggered) because Phase 8's competitive-play check happens after a fresh + // page reload and rarely sees a real game-over screen. Fall back to the + // Phase 8 reading only if Phase 6 never reached game over. if (!phaseState.gameplayWorks || !competitivePlay) { results.push(skipResult("game_over_display", "competitive play phase did not run")); } else if (!competitivePlay.game_over_reached && !session.gameOverDetected) { results.push(skipResult("game_over_display", "game over not reached during play")); } else { - const hasText = competitivePlay.game_over_text_found !== null; - const hasRestart = competitivePlay.restart_available; + const usePhase6 = session.gameOverDetected; + const text = usePhase6 + ? 
session.gameOverText ?? null + : competitivePlay.game_over_text_found; + const hasRestart = usePhase6 + ? session.gameOverRestartAvailable === true + : competitivePlay.restart_available; + const hasText = text !== null && text !== undefined; + const source = usePhase6 ? "phase6" : "phase8"; results.push({ name: "game_over_display", pass: hasText && hasRestart, detail: hasText && hasRestart - ? `game over display: "${competitivePlay.game_over_text_found}", restart available` - : `missing: ${!hasText ? "game over text" : ""}${!hasText && !hasRestart ? " and " : ""}${!hasRestart ? "restart option" : ""}`, + ? `game over display: "${text}", restart available (${source})` + : `missing: ${!hasText ? "game over text" : ""}${!hasText && !hasRestart ? " and " : ""}${!hasRestart ? "restart option" : ""} (${source})`, }); } diff --git a/tasks/tetris/eval/gameplay-bot-v2/driver.ts b/tasks/tetris/eval/gameplay-bot-v2/driver.ts @@ -2041,22 +2041,51 @@ export class PlaywrightDriver implements TetrisDriver { // -- Page State Queries -- + /** + * Detect if a game-over modal/overlay is visible on the page. + * Language-agnostic: looks for a NEW visible element that wasn't there during + * gameplay, covering a significant portion of the viewport, with position + * fixed/absolute and meaningful z-index. Returns a description like "modal" + * or "overlay" rather than extracted text. 
+ */ async detectGameOverText(): Promise<string | null> { try { return await this.page.evaluate(() => { - const text = document.body.innerText.toLowerCase(); - const patterns: [RegExp, string][] = [ - [/game\s*over/i, "Game Over"], - [/you lose/i, "You Lose"], - [/try again/i, "Try Again"], - [/play again/i, "Play Again"], - [/fin del juego/i, "Fin del Juego"], - [/juego terminado/i, "Juego Terminado"], - [/partida terminada/i, "Partida Terminada"], - ]; - for (const [regex, label] of patterns) { - if (regex.test(text)) return label; + const vw = window.innerWidth; + const vh = window.innerHeight; + const viewportArea = vw * vh; + + // Find elements that look like modals/overlays + const all = document.querySelectorAll<HTMLElement>("*"); + for (const el of all) { + const style = window.getComputedStyle(el); + const pos = style.position; + if (pos !== "fixed" && pos !== "absolute") continue; + + // Must be visible + if (style.display === "none" || style.visibility === "hidden" || + parseFloat(style.opacity) === 0) continue; + + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) continue; + + const area = rect.width * rect.height; + // Significant coverage: 15% or more of viewport + if (area < viewportArea * 0.15) continue; + + // Must have visible background or children (not a transparent wrapper) + const hasBg = style.backgroundColor !== "rgba(0, 0, 0, 0)" && + style.backgroundColor !== "transparent"; + const hasContent = el.children.length > 0 || (el.textContent || "").trim().length > 0; + if (!hasBg && !hasContent) continue; + + // High z-index relative to the page + const z = parseInt(style.zIndex, 10); + if (!isNaN(z) && z < 1) continue; + + return "modal"; } + return null; }); } catch { @@ -2064,22 +2093,53 @@ export class PlaywrightDriver implements TetrisDriver { } } + /** + * Detect if a restart/new-game option is visible. 
+ * Language-agnostic: looks for a clickable element that appears NOW and + * wasn't there during initial play. We approximate this by finding any + * visible clickable element (button, [role=button], cursor:pointer) inside + * a visible modal/overlay, or a clickable element that wasn't present at + * calibration time. This avoids text matching. + */ async detectRestartOption(): Promise<boolean> { try { return await this.page.evaluate(() => { - const text = document.body.innerText.toLowerCase(); - const buttons = document.querySelectorAll("button"); - for (const btn of buttons) { - const btnText = (btn.textContent || "").toLowerCase(); - if (btnText.includes("restart") || btnText.includes("play again") || - btnText.includes("new game") || btnText.includes("reiniciar") || - btnText.includes("jugar de nuevo") || btnText.includes("nueva partida")) { - return true; - } - } - return text.includes("restart") || text.includes("play again") || - text.includes("press") || text.includes("try again") || - text.includes("reiniciar") || text.includes("jugar de nuevo"); + const vw = window.innerWidth; + const vh = window.innerHeight; + const viewportArea = vw * vh; + + // Find modals/overlays + const all = document.querySelectorAll<HTMLElement>("*"); + for (const el of all) { + const style = window.getComputedStyle(el); + const pos = style.position; + if (pos !== "fixed" && pos !== "absolute") continue; + if (style.display === "none" || style.visibility === "hidden" || + parseFloat(style.opacity) === 0) continue; + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) continue; + if (rect.width * rect.height < viewportArea * 0.15) continue; + + // Look for clickable elements inside this modal + const clickables = el.querySelectorAll<HTMLElement>( + "button, [role=button], a, input[type=button], input[type=submit]" + ); + for (const c of clickables) { + const cRect = c.getBoundingClientRect(); + if (cRect.width > 0 && cRect.height > 0) return true; 
+ } + + // Also check for elements with cursor: pointer + const all2 = el.querySelectorAll<HTMLElement>("*"); + for (const c of all2) { + if (window.getComputedStyle(c).cursor === "pointer") { + const cRect = c.getBoundingClientRect(); + if (cRect.width > 0 && cRect.height > 0) return true; + } + } + } + + return false; }); } catch { return false; diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts @@ -316,6 +316,10 @@ export interface GameSession { movementsObserved: number; hardDropsObserved: number; gameOverDetected: boolean; + /** Game over text captured in Phase 6 immediately after triggering game over. */ + gameOverText?: string | null; + /** Whether a restart option was visible in Phase 6 immediately after triggering game over. */ + gameOverRestartAvailable?: boolean; consoleErrors: string[]; durationSeconds: number; pieceTypes: Set<string>;

Impressum · Datenschutz