loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 57b74f96d1ff52223531c72a77bb145a43f51272
parent 68c55df2846dad69bc4b1247c072387f21e32909
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 09:37:01 +0200

Prevent off-grid reading and false positive piece detection

Grid reader:
- Reject grids where >60% of cells are filled (likely reading UI
  chrome, borders, or decorations instead of game grid)
- validateGridBounds(): check aspect ratio (height:width ~2:1 for a
  10x20 grid), reject grids that are too small or too large

Tests:
- piece_locks: require piecesSpawned > 0 alongside piecesLocked,
  otherwise flag as "likely false positive from UI misread"
- multiple_pieces: same spawned > 0 requirement
- Prevents the scenario where 0 pieces spawn but 10 "lock" because
  static UI elements were misread as game state

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/grid-reader.ts | 32 ++++++++++++++++++++++++++++++++
M tasks/tetris/eval/gameplay-bot/tests.ts | 12 ++++++++++--
2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/grid-reader.ts b/tasks/tetris/eval/gameplay-bot/grid-reader.ts
@@ -100,10 +100,42 @@ async function readCanvasGrid(
     { x: bounds.x, y: bounds.y, cellW, cellH, rows: GRID_ROWS, cols: GRID_COLS, bgR, bgG, bgB, threshold }
   );
 
+  // Validate: a freshly-read grid should make sense
+  if (grid) {
+    const totalCells = GRID_ROWS * GRID_COLS;
+    const filledCells = grid.reduce((sum, row) => sum + row.filter(Boolean).length, 0);
+    const filledPct = filledCells / totalCells;
+
+    // If >60% of the grid is "filled" at any point, we're probably reading
+    // off-grid (UI chrome, borders, decorations). A real Tetris grid rarely
+    // exceeds 50% filled even in a losing game.
+    if (filledPct > 0.60) {
+      return null;
+    }
+  }
+
   return grid;
 }
 
 /**
+ * Validate that calibrated grid bounds look like a real Tetris grid.
+ * Returns true if the bounds are plausible.
+ */
+export function validateGridBounds(bounds: GridBounds | null): boolean {
+  if (!bounds) return false;
+
+  // Aspect ratio should be roughly 1:2 (width:height) for a 10x20 grid
+  const ratio = bounds.height / bounds.width;
+  if (ratio < 1.3 || ratio > 2.8) return false;
+
+  // Grid should be a reasonable size (not tiny or the entire viewport)
+  if (bounds.width < 50 || bounds.height < 100) return false;
+  if (bounds.width > 1000 || bounds.height > 1500) return false;
+
+  return true;
+}
+
+/**
  * Read grid from DOM elements. Looks for a grid-like structure and checks
  * background colors or class names to determine filled vs empty cells.
  */
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -746,12 +746,20 @@ function deriveTestResults(
       pass: true,
       detail: `filled cells persist at bottom (grid-verified, ${lockEvents.length} lock event(s))`,
     });
-  } else if (session.piecesLocked > 0) {
+  } else if (session.piecesLocked > 0 && session.piecesSpawned > 0) {
+    // Only trust locked count if we also detected spawns (prevents false positives
+    // from static UI being misread as game state)
     results.push({
       name: "piece_locks",
       pass: true,
       detail: `${session.piecesLocked} piece(s) locked during play`,
     });
+  } else if (session.piecesLocked > 0 && session.piecesSpawned === 0) {
+    results.push({
+      name: "piece_locks",
+      pass: false,
+      detail: `${session.piecesLocked} lock event(s) but 0 spawns detected - likely false positive from UI misread`,
+    });
   } else {
     results.push({
       name: "piece_locks",
@@ -776,7 +784,7 @@ function deriveTestResults(
   }
 
   // 11. multiple_pieces
-  if (session.piecesLocked >= 3) {
+  if (session.piecesLocked >= 3 && session.piecesSpawned > 0) {
     results.push({
       name: "multiple_pieces",
       pass: true,

Impressum · Datenschutz