commit 57b74f96d1ff52223531c72a77bb145a43f51272
parent 68c55df2846dad69bc4b1247c072387f21e32909
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 09:37:01 +0200
Prevent off-grid reading and false positive piece detection
Grid reader:
- Reject grids where >60% of cells are filled (the reader is likely
sampling UI chrome, borders, or decorations instead of the game grid)
- validateGridBounds(): check aspect ratio (height:width ~2:1 for a 10x20
grid), reject bounds that are too small or too large
Tests:
- piece_locks: require piecesSpawned > 0 alongside piecesLocked > 0;
otherwise fail the test as "likely false positive from UI misread"
- multiple_pieces: same spawned > 0 requirement
- Prevents the scenario where 0 pieces spawn but 10 "lock" because
static UI elements were misread as game state
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/tasks/tetris/eval/gameplay-bot/grid-reader.ts b/tasks/tetris/eval/gameplay-bot/grid-reader.ts
@@ -100,10 +100,42 @@ async function readCanvasGrid(
{ x: bounds.x, y: bounds.y, cellW, cellH, rows: GRID_ROWS, cols: GRID_COLS, bgR, bgG, bgB, threshold }
);
+ // Validate: a freshly-read grid should make sense
+ if (grid) {
+ const totalCells = GRID_ROWS * GRID_COLS;
+ const filledCells = grid.reduce((sum, row) => sum + row.filter(Boolean).length, 0);
+ const filledPct = filledCells / totalCells;
+
+ // If >60% of the grid is "filled" at any point, we're probably reading
+ // off-grid (UI chrome, borders, decorations). A real Tetris grid rarely
+ // exceeds 50% filled even in a losing game.
+ if (filledPct > 0.60) {
+ return null;
+ }
+ }
+
return grid;
}
/**
+ * Sanity-check calibrated grid bounds against the expected shape and size of a 10x20 Tetris grid.
+ * Returns true only when bounds is non-null, height/width is within 1.3-2.8, width is 50-1000, and height is 100-1500 (limits inclusive).
+ */
+export function validateGridBounds(bounds: GridBounds | null): boolean {
+ if (!bounds) return false;
+
+ // Ideal height/width is 2 for a 10x20 grid (1:2 width:height); anything outside 1.3-2.8 is rejected
+ const ratio = bounds.height / bounds.width;
+ if (ratio < 1.3 || ratio > 2.8) return false;
+
+ // Grid should be a reasonable size (not tiny or the entire viewport)
+ if (bounds.width < 50 || bounds.height < 100) return false;
+ if (bounds.width > 1000 || bounds.height > 1500) return false;
+ // NOTE(review): comparisons against NaN are always false, so a NaN width or height can slip through to true - confirm callers never pass NaN
+ return true;
+}
+
+/**
* Read grid from DOM elements. Looks for a grid-like structure and checks
* background colors or class names to determine filled vs empty cells.
*/
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -746,12 +746,20 @@ function deriveTestResults(
pass: true,
detail: `filled cells persist at bottom (grid-verified, ${lockEvents.length} lock event(s))`,
});
- } else if (session.piecesLocked > 0) {
+ } else if (session.piecesLocked > 0 && session.piecesSpawned > 0) {
+ // Only trust locked count if we also detected spawns (prevents false positives
+ // from static UI being misread as game state)
results.push({
name: "piece_locks",
pass: true,
detail: `${session.piecesLocked} piece(s) locked during play`,
});
+ } else if (session.piecesLocked > 0 && session.piecesSpawned === 0) {
+ results.push({
+ name: "piece_locks",
+ pass: false,
+ detail: `${session.piecesLocked} lock event(s) but 0 spawns detected - likely false positive from UI misread`,
+ });
} else {
results.push({
name: "piece_locks",
@@ -776,7 +784,7 @@ function deriveTestResults(
}
// 11. multiple_pieces
- if (session.piecesLocked >= 3) {
+ if (session.piecesLocked >= 3 && session.piecesSpawned > 0) {
results.push({
name: "multiple_pieces",
pass: true,