loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 42321c004e708af0b099db58e9cbedf54e03e145
parent 13710ed75e85f22717f6463b1b3bc0b822a0a66b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun, 12 Apr 2026 08:31:49 +0200

V2: landmarks-based game_loads, updated calibration test names

game_loads now checks for structural landmarks (canvas, DOM grid,
tetris-ratio elements, cell containers) instead of failing on console
errors. Console errors are informational, not a pass/fail gate.

Calibration sample 8fe72fce: game_loads now passes (it previously failed
because of a benign startup TypeError); score is 100% (20/20 scorable tests).

Updated all calibration JSON files: score_changes is replaced by
score_increases_on_clear (same value) and score_element_visible (set to null).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/bot.ts | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M tasks/tetris/eval/gameplay-bot-v2/driver.ts | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tasks/tetris/eval/gameplay-bot-v2/types.ts | 4 ++++
M tasks/tetris/eval/gameplay-bot/calibration/1d08ee76.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/4949d521.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/9805c24a.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/bbb70053.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/cbbff570.json | 3 ++-
M tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json | 3 ++-
13 files changed, 248 insertions(+), 32 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -314,7 +314,8 @@ const ALL_TEST_NAMES = [ "new_piece_spawns", "multiple_pieces", "line_clear", - "score_changes", + "score_increases_on_clear", + "score_element_visible", "game_over", "playable_30s", "multi_line_clear", @@ -423,6 +424,13 @@ export async function runAllTests( // ---- Pre-test survey ---- survey = await driver.surveyPage(); + // ---- Detect game-shaped landmarks for the game_loads test ---- + try { + session.gameLoadLandmarks = await driver.detectGameLandmarks(); + } catch { + session.gameLoadLandmarks = undefined; + } + // ---- Phase 2: Discover + verify start, then calibrate ---- // Bridge flow: try each candidate, ask verifyGameStarted() to confirm, // commit the first verified candidate. On false positive, reload and try @@ -830,6 +838,14 @@ async function runGameplayPhase( } } + // Propagate score-before/after-clear from AI play + if (result.scoreBeforeClear !== undefined && session.scoreBeforeClear === undefined) { + session.scoreBeforeClear = result.scoreBeforeClear; + } + if (result.scoreAfterClear !== undefined && session.scoreAfterClear === undefined) { + session.scoreAfterClear = result.scoreAfterClear; + } + // Read final score const finalScore = await driver.readScore(); if (finalScore !== null) { @@ -853,12 +869,25 @@ async function runGameplayPhase( // If no lines cleared by AI, try brute-force if (session.linesCleared === 0) { + // Capture score before brute-force line clear attempt + if (session.scoreBeforeClear === undefined) { + const preScore = await driver.readScore(); + if (preScore !== null) session.scoreBeforeClear = preScore; + } + const cleared = await tryFillRow(driver, 10); gameplay.pieces_placed += 10; if (cleared) { session.linesCleared++; gameplay.lines_cleared++; session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); + + // Capture score after brute-force line clear + if (session.scoreAfterClear 
=== undefined) { + await driver.wait(200); + const postScore = await driver.readScore(); + if (postScore !== null) session.scoreAfterClear = postScore; + } } } @@ -867,9 +896,23 @@ async function runGameplayPhase( const snapAfter = await driver.readGrid(); const filledAfter = snapAfter.filledCount; if (filledAfter < filledBefore && filledBefore > 0) { + // Capture score around grid-verified line clear + if (session.scoreBeforeClear === undefined) { + // Use the last known score reading as proxy + const lastKnown = session.scoreValues.length > 0 + ? session.scoreValues[session.scoreValues.length - 1] + : null; + if (lastKnown !== null) session.scoreBeforeClear = lastKnown; + } + session.linesCleared++; gameplay.lines_cleared++; session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); + + if (session.scoreAfterClear === undefined) { + const postScore = await driver.readScore(); + if (postScore !== null) session.scoreAfterClear = postScore; + } } } } @@ -1469,7 +1512,7 @@ async function playGame( maxDurationMs?: number; rotationTrack?: Map<string, Set<string>>; } -): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[] }> { +): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[]; scoreBeforeClear?: number; scoreAfterClear?: number }> { const maxPieces = options.maxPieces ?? 100; const maxDuration = options.maxDurationMs ?? 30000; const rotationTrack = options.rotationTrack; @@ -1482,6 +1525,11 @@ async function playGame( let consecutiveReadFails = 0; const scoreValues: number[] = []; let scorePollCounter = 0; + let scoreBeforeClear: number | undefined; + let scoreAfterClear: number | undefined; + /** The most recent score reading, kept up to date so we can snapshot it + * immediately before a line clear is detected. 
*/ + let lastScoreReading: number | null = null; // settledGrid = the locked board WITHOUT any active piece. We recompute // it after each placement by reading the fresh grid and stripping out the @@ -1511,7 +1559,10 @@ async function playGame( scorePollCounter++; if (scorePollCounter % 5 === 0) { const score = await driver.readScore(); - if (score !== null) scoreValues.push(score); + if (score !== null) { + scoreValues.push(score); + lastScoreReading = score; + } } if (snap.activePieceCells && snap.activePieceCells.length === 4) { @@ -1560,6 +1611,20 @@ async function playGame( const boardBeforePlacement = stripActivePiece(workingSnap.grid!, workingSnap.activePieceCells!); const placement = findBestPlacement(boardBeforePlacement, pieceType as PieceType); + // Track whether this placement will produce a line clear so we can + // snapshot the score before and after. + const expectsClear = placement ? placement.linesCleared > 0 : false; + // Snapshot score just before executing the drop (for score_increases_on_clear) + if (expectsClear && scoreBeforeClear === undefined) { + const preClearScore = await driver.readScore(); + if (preClearScore !== null) { + scoreBeforeClear = preClearScore; + lastScoreReading = preClearScore; + } else if (lastScoreReading !== null) { + scoreBeforeClear = lastScoreReading; + } + } + if (placement) { await executePlacement(driver, placement, workingSnap.activePieceCells!); linesCleared += placement.linesCleared; @@ -1582,9 +1647,30 @@ async function playGame( // Line-clear detection by filled-count delta. 
const filledBefore = countFilled(boardBeforePlacement) + 4; const filledAfter = countFilled(afterSnap.grid); + let clearsThisPlacement = 0; if (filledAfter < filledBefore) { const possibleClears = Math.round((filledBefore - filledAfter) / GRID_COLS); - if (possibleClears > 0 && possibleClears <= 4) linesCleared += possibleClears; + if (possibleClears > 0 && possibleClears <= 4) { + linesCleared += possibleClears; + clearsThisPlacement = possibleClears; + } + } + + // If a line clear happened (AI-predicted or grid-verified) and we + // haven't captured the post-clear score yet, read it now. + if ((expectsClear || clearsThisPlacement > 0) && scoreAfterClear === undefined) { + // If we didn't snapshot the before-clear score yet (grid-detected + // clear that the AI didn't predict), capture it retroactively from + // the last known reading. + if (scoreBeforeClear === undefined && lastScoreReading !== null) { + scoreBeforeClear = lastScoreReading; + } + const postClearScore = await driver.readScore(); + if (postClearScore !== null) { + scoreAfterClear = postClearScore; + scoreValues.push(postClearScore); + lastScoreReading = postClearScore; + } } if (afterSnap.activePieceCells && afterSnap.activePieceCells.length >= 3 && afterSnap.activePieceCells.length <= 5) { @@ -1607,9 +1693,10 @@ async function playGame( } } - return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails, scoreValues }; + return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails, scoreValues, scoreBeforeClear, scoreAfterClear }; } + async function executePlacement( driver: TetrisDriver, placement: Placement, @@ -1710,12 +1797,27 @@ function deriveTestResults( name, pass: false, detail: `skipped: ${reason}`, }); - // 1. game_loads - results.push({ - name: "game_loads", - pass: loadResult.loaded && loadResult.errorsOnLoad === 0, - detail: loadResult.detail, - }); + // 1. game_loads: page loaded with non-trivial content and at least one + // game-shaped landmark. 
Console errors are captured in the report for + // informational purposes but do NOT gate this test. + { + const loaded = loadResult.loaded; + const landmarks = session.gameLoadLandmarks; + const hasLandmark = landmarks && ( + landmarks.hasCanvas || + landmarks.hasDomGrid || + landmarks.hasTetrisRatioElement || + landmarks.hasManyCellsContainer + ); + const bodyHasContent = landmarks?.bodyHasContent ?? false; + const pass = loaded && bodyHasContent && !!hasLandmark; + let detail: string; + if (!loaded) detail = loadResult.detail || "page failed to load"; + else if (!bodyHasContent) detail = "blank page (body has no content)"; + else if (!hasLandmark) detail = "no game landmarks found (no canvas, grid, or game-shaped element)"; + else detail = `loaded with landmarks: ${landmarks!.landmarksFound.join(", ")}`; + results.push({ name: "game_loads", pass, detail }); + } // 2. game_starts { @@ -1937,21 +2039,38 @@ function deriveTestResults( results.push({ name: "line_clear", pass: false, detail: "could not trigger or detect a line clear via grid reader" }); } - // 14. score_changes + // 14a. score_increases_on_clear if (!phaseState.mechanicsWork) { - results.push(skipResult("score_changes", "mechanics phase failed")); - } else if (session.scoreValues.length >= 2) { - const min = Math.min(...session.scoreValues); - const max = Math.max(...session.scoreValues); - if (max > min) { - results.push({ name: "score_changes", pass: true, detail: `score changed from ${min} to ${max}` }); + results.push(skipResult("score_increases_on_clear", "mechanics phase failed")); + } else if (session.linesCleared > 0) { + if (session.scoreBeforeClear !== undefined && session.scoreAfterClear !== undefined) { + const scoreIncreased = session.scoreAfterClear > session.scoreBeforeClear; + results.push({ + name: "score_increases_on_clear", + pass: scoreIncreased, + detail: scoreIncreased + ? 
`score went from ${session.scoreBeforeClear} to ${session.scoreAfterClear} after line clear` + : `score stayed at ${session.scoreBeforeClear} after clearing ${session.linesCleared} line(s)`, + }); + } else if (!cal.scoreElementSelector) { + results.push(skipResult("score_increases_on_clear", "no score element found, cannot verify scoring on clear")); } else { - results.push({ name: "score_changes", pass: false, detail: `score stayed at ${min}` }); + results.push(skipResult("score_increases_on_clear", "lines cleared but could not read score before/after")); } - } else if (!cal.scoreElementSelector) { - results.push({ name: "score_changes", pass: false, detail: "no score element found" }); } else { - results.push({ name: "score_changes", pass: false, detail: "could not read score values" }); + results.push(skipResult("score_increases_on_clear", "no lines cleared, cannot verify scoring")); + } + + // 14b. score_element_visible + { + const hasScoreElement = !!cal.scoreElementSelector; + results.push({ + name: "score_element_visible", + pass: hasScoreElement, + detail: hasScoreElement + ? `score display found (${cal.scoreElementSelector})` + : "no score display detected", + }); } // 15. 
game_over diff --git a/tasks/tetris/eval/gameplay-bot-v2/driver.ts b/tasks/tetris/eval/gameplay-bot-v2/driver.ts @@ -14,6 +14,7 @@ import type { StartCandidate, TryStartResult, SurveyData, + GameLandmarks, PieceType, DriverCalibration, CalibrationDrift, @@ -520,6 +521,88 @@ export class PlaywrightDriver implements TetrisDriver { } } + async detectGameLandmarks(): Promise<GameLandmarks> { + return await this.page.evaluate(() => { + const body = document.body; + const bodyText = body?.innerText?.trim() || ""; + const bodyHasContent = body !== null && + (bodyText.length > 0 || body.children.length > 0); + + // Check for canvas + const canvases = document.querySelectorAll("canvas"); + let hasCanvas = false; + for (const c of canvases) { + const rect = (c as HTMLCanvasElement).getBoundingClientRect(); + if (rect.width >= 50 && rect.height >= 50) { + hasCanvas = true; + break; + } + } + + // Check for named DOM grid containers + const namedContainers = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"]' + ); + let hasDomGrid = false; + for (const el of namedContainers) { + const rect = (el as HTMLElement).getBoundingClientRect(); + if (rect.width >= 50 && rect.height >= 50) { + hasDomGrid = true; + break; + } + } + + // Check for tetris-grid-aspect element (2:1 height:width) + let hasTetrisRatioElement = false; + const allElements = document.querySelectorAll("div, section, main, article"); + for (const el of allElements) { + const rect = (el as HTMLElement).getBoundingClientRect(); + if (rect.width < 100 || rect.height < 200) continue; + const ratio = rect.height / rect.width; + if (ratio >= 1.5 && ratio <= 2.5) { + hasTetrisRatioElement = true; + break; + } + } + + // Check for large container with many same-sized children + let hasManyCellsContainer = false; + for (const el of allElements) { + const children = el.children; + if (children.length < 50 || children.length > 400) continue; + const sizes = new 
Set<string>(); + const sampleCount = Math.min(10, children.length); + for (let i = 0; i < sampleCount; i++) { + const child = children[Math.floor(i * children.length / sampleCount)] as HTMLElement; + const r = child.getBoundingClientRect(); + if (r.width > 0 && r.height > 0) { + sizes.add(`${Math.round(r.width)}x${Math.round(r.height)}`); + } + } + if (sizes.size <= 3 && sizes.size > 0) { + hasManyCellsContainer = true; + break; + } + } + + const landmarksFound: string[] = []; + if (bodyHasContent) landmarksFound.push("body_content"); + if (hasCanvas) landmarksFound.push("canvas"); + if (hasDomGrid) landmarksFound.push("dom_grid"); + if (hasTetrisRatioElement) landmarksFound.push("tetris_ratio"); + if (hasManyCellsContainer) landmarksFound.push("cells_container"); + + return { + bodyHasContent, + hasCanvas, + hasDomGrid, + hasTetrisRatioElement, + hasManyCellsContainer, + landmarksFound, + }; + }); + } + async calibrate(): Promise<DriverCalibration> { // Fast path: try applying the cached calibration from a prior run. if (this.firstCal) { diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts @@ -346,6 +346,10 @@ export interface GameSession { durationSeconds: number; pieceTypes: Set<string>; scoreValues: number[]; + /** Score reading taken just before the first detected line clear. */ + scoreBeforeClear?: number; + /** Score reading taken just after the first detected line clear. 
*/ + scoreAfterClear?: number; gridReadSuccess: number; gridReadFail: number; frames: number; diff --git a/tasks/tetris/eval/gameplay-bot/calibration/1d08ee76.json b/tasks/tetris/eval/gameplay-bot/calibration/1d08ee76.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": true, - "score_changes": true, + "score_increases_on_clear": true, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": true, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/4949d521.json b/tasks/tetris/eval/gameplay-bot/calibration/4949d521.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": false, - "score_changes": true, + "score_increases_on_clear": true, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": null, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json b/tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": true, - "score_changes": null, + "score_increases_on_clear": null, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": null, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json b/tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json @@ -18,7 +18,8 @@ "new_piece_spawns": null, "multiple_pieces": null, "line_clear": null, - "score_changes": null, + "score_increases_on_clear": null, + "score_element_visible": null, "game_over": null, "playable_30s": null, "multi_line_clear": null, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json b/tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": true, - "score_changes": true, + "score_increases_on_clear": true, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": true, diff 
--git a/tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json b/tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json @@ -18,7 +18,8 @@ "new_piece_spawns": null, "multiple_pieces": null, "line_clear": null, - "score_changes": null, + "score_increases_on_clear": null, + "score_element_visible": null, "game_over": null, "playable_30s": null, "multi_line_clear": null, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/9805c24a.json b/tasks/tetris/eval/gameplay-bot/calibration/9805c24a.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": null, - "score_changes": null, + "score_increases_on_clear": null, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": null, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/bbb70053.json b/tasks/tetris/eval/gameplay-bot/calibration/bbb70053.json @@ -18,7 +18,8 @@ "new_piece_spawns": null, "multiple_pieces": null, "line_clear": null, - "score_changes": null, + "score_increases_on_clear": null, + "score_element_visible": null, "game_over": null, "playable_30s": null, "multi_line_clear": null, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/cbbff570.json b/tasks/tetris/eval/gameplay-bot/calibration/cbbff570.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": true, - "score_changes": true, + "score_increases_on_clear": true, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": true, diff --git a/tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json b/tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json @@ -18,7 +18,8 @@ "new_piece_spawns": true, "multiple_pieces": true, "line_clear": null, - "score_changes": false, + "score_increases_on_clear": false, + "score_element_visible": null, "game_over": true, "playable_30s": true, "multi_line_clear": null,

Impressum · Datenschutz