commit d1b5c77738368fcf645c325b912383d5c69f22ed
parent 669aa68861617a446e5ae56aea0a462995d183f0
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 11 Apr 2026 09:12:39 +0200
V2: stricter rotation test requires distinct rotation states
Previous rotate test passed if pressing rotate caused ANY grid change.
Games with broken rotation (only 1 of 4 states works) would pass.
New test:
- Press rotate 4 times, wait 100ms between each
- Record normalized active piece shape after each press
- rotate test: passes if 3+ distinct shapes (baseline + 2 other states)
- all_pieces_rotate: passes if 2+ J/L/T pieces reach 3+ distinct shapes
- Skips if fewer than 2 J/L/T piece types seen
Uses position-invariant shape keys (normalized to top-left origin) so
auto-drop during the test doesn't confuse the comparison.
Tracking:
- session.distinctRotationShapes: max observed in Phase 3 probe
- session.rotationShapesByPiece: Map<piece_type, Set<shape_key>>
- playGame accepts rotationTrack param for gameplay-phase probing
Results:
- 9805c24a (broken): rotate now FAIL, all_pieces_rotate FAIL (was PASS/PASS)
- cbbff570 (flaky): rotate FAIL (was PASS)
- 4c7db3b9 (working): 100% score (up from 94%)
- 1d08ee76: 95% (up from 89%)
- 8fe72fce: 94% (unchanged)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 243 insertions(+), 38 deletions(-)
diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts
@@ -259,6 +259,22 @@ function boundingBox(cells: [number, number][]): { w: number; h: number } {
return { w: maxCol - minCol + 1, h: maxRow - minRow + 1 };
}
+/**
+ * Build a translation-invariant key for a set of [row, col] cells: shift by
+ * the minimum row/col so the shape starts at (0,0), then sort the "r,c"
+ * strings and join them with "|". Returns "" for an empty cell list. Cell
+ * sets that differ only by position (e.g. after auto-drop) share one key.
+ */
+function normalizedShapeKey(cells: [number, number][]): string {
+ if (cells.length === 0) return "";
+ const minRow = Math.min(...cells.map(([r]) => r));
+ const minCol = Math.min(...cells.map(([, c]) => c));
+ return cells
+ .map(([r, c]) => `${r - minRow},${c - minCol}`)
+ .sort()
+ .join("|");
+}
+
function countFilledInTopRows(grid: Grid, rows: number): number {
let count = 0;
for (let r = 0; r < Math.min(rows, grid.length); r++) {
@@ -376,6 +392,8 @@ export async function runAllTests(
frames: 0,
events: [],
skippedPhases: [],
+ distinctRotationShapes: 0,
+ rotationShapesByPiece: new Map<string, Set<string>>(),
};
let survey: SurveyData = {
@@ -654,43 +672,65 @@ async function runBasicMechanicsPhase(
}
}
- // Rotation test
+ // Rotation test: press rotate 4 times and count how many DISTINCT shapes
+ // the piece cycles through, baseline included. A correctly working Tetris
+ // game rotates a non-O piece through 2 (I/S/Z) or 4 (J/L/T) states. A
+ // broken game where rotation only changes the piece once (then gets stuck)
+ // shows just 2 shapes (baseline + 1), which is not enough: the rotate test
+ // needs 3+ distinct shapes from this probe or the gameplay-phase probes.
+ //
+ // Shape comparison is position-invariant (see normalizedShapeKey) so that
+ // auto-drop between presses doesn't confuse the detector.
const snapBeforeRot = await driver.readGrid();
if (snapBeforeRot.grid) session.gridReadSuccess++;
else session.gridReadFail++;
session.frames++;
- await driver.pressKey("rotate");
- await driver.wait(300);
+ const observedShapes = new Set<string>();
+ if (snapBeforeRot.activePieceCells && snapBeforeRot.activePieceCells.length > 0) {
+ observedShapes.add(normalizedShapeKey(snapBeforeRot.activePieceCells));
+ }
- const snapAfterRot = await driver.readGrid();
- if (snapAfterRot.grid) session.gridReadSuccess++;
- else session.gridReadFail++;
- session.frames++;
+ let lastRotationSnap = snapBeforeRot;
+ for (let i = 0; i < 4; i++) {
+ await driver.pressKey("rotate");
+ await driver.wait(100);
- if (snapBeforeRot.grid && snapAfterRot.grid && driver.gridsAreDifferent(snapBeforeRot.grid, snapAfterRot.grid)) {
- const cellsBefore = snapBeforeRot.activePieceCells;
- const cellsAfter = snapAfterRot.activePieceCells;
- if (cellsBefore && cellsAfter) {
- const bbBefore = boundingBox(cellsBefore);
- const bbAfter = boundingBox(cellsAfter);
- if (bbBefore.w !== bbAfter.w || bbBefore.h !== bbAfter.h) {
- session.rotationsObserved++;
- session.events.push({ type: "piece_rotated", frame: session.frames });
- } else {
- const keyBefore = cellsBefore.map(([r, c]) => `${r},${c}`).sort().join("|");
- const keyAfter = cellsAfter.map(([r, c]) => `${r},${c}`).sort().join("|");
- if (keyBefore !== keyAfter) {
- session.rotationsObserved++;
- session.events.push({ type: "piece_rotated", frame: session.frames });
- }
- }
- } else {
+ const snap = await driver.readGrid();
+ if (snap.grid) session.gridReadSuccess++;
+ else session.gridReadFail++;
+ session.frames++;
+
+ if (snap.activePieceCells && snap.activePieceCells.length > 0) {
+ observedShapes.add(normalizedShapeKey(snap.activePieceCells));
+ }
+ lastRotationSnap = snap;
+
+ // Any observed change in the grid between successive rotations counts as
+ // a "rotation observed" event for legacy metrics. This keeps the gameplay
+ // phase's counters consistent with the old behavior for downstream tests
+ // that still use session.rotationsObserved.
+ if (
+ i === 0 &&
+ snapBeforeRot.grid &&
+ snap.grid &&
+ driver.gridsAreDifferent(snapBeforeRot.grid, snap.grid)
+ ) {
session.rotationsObserved++;
session.events.push({ type: "piece_rotated", frame: session.frames });
}
}
+ // Record the max number of distinct shapes ever seen in a single rotation
+ // test. Gameplay-phase probes record into session.rotationShapesByPiece.
+ if (observedShapes.size > session.distinctRotationShapes) {
+ session.distinctRotationShapes = observedShapes.size;
+ }
+
+ // Silence unused-variable warning: lastRotationSnap is kept for potential
+ // debugging/extension.
+ void lastRotationSnap;
+
// Hard drop test
const snapBeforeDrop = await driver.readGrid();
if (snapBeforeDrop.grid) session.gridReadSuccess++;
@@ -768,6 +808,7 @@ async function runGameplayPhase(
const result = await playGame(driver, {
maxPieces: 60,
maxDurationMs: 45000,
+ rotationTrack: session.rotationShapesByPiece,
});
gameplay.pieces_placed += result.piecesPlaced;
gameplay.errors_during_play += result.errors;
@@ -913,7 +954,10 @@ async function runEndurancePhase(
const errorsBefore = driver.getConsoleErrors().length;
const start = Date.now();
- const result = await playGame(driver, { maxDurationMs: 30000 });
+ const result = await playGame(driver, {
+ maxDurationMs: 30000,
+ rotationTrack: session.rotationShapesByPiece,
+ });
const elapsed = Math.round((Date.now() - start) / 1000);
gameplay.pieces_placed += result.piecesPlaced;
@@ -1226,6 +1270,29 @@ async function runCompetitivePlayPhase(
const pieceType = snap.activePieceType || "unknown";
session.pieceTypes.add(pieceType);
+ // Rotation probe for all_pieces_rotate: for the first time we see
+ // this piece type in competitive play, press rotate 4 times and
+ // record distinct normalized shapes. A correctly working game
+ // returns to baseline after 4 presses so the placement logic below
+ // continues from the same rotation state. Skip O and unknown.
+ if (
+ pieceType !== "unknown" &&
+ pieceType !== "O" &&
+ !session.rotationShapesByPiece.has(pieceType)
+ ) {
+ const shapes = new Set<string>();
+ shapes.add(normalizedShapeKey(snap.activePieceCells));
+ for (let r = 0; r < 4; r++) {
+ await driver.pressKey("rotate");
+ await driver.wait(80);
+ const rotSnap = await driver.readGrid(settledGrid);
+ if (rotSnap.activePieceCells && rotSnap.activePieceCells.length > 0) {
+ shapes.add(normalizedShapeKey(rotSnap.activePieceCells));
+ }
+ }
+ session.rotationShapesByPiece.set(pieceType, shapes);
+ }
+
// Soft drop test
if (!softDropTestDone && result.pieces_placed > 3 && result.pieces_placed % 5 === 0) {
const snapBeforeDown = await driver.readGrid(settledGrid);
@@ -1397,10 +1464,15 @@ async function runCompetitivePlayPhase(
async function playGame(
driver: TetrisDriver,
- options: { maxPieces?: number; maxDurationMs?: number }
+ options: {
+ maxPieces?: number;
+ maxDurationMs?: number;
+ rotationTrack?: Map<string, Set<string>>;
+ }
): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[] }> {
const maxPieces = options.maxPieces ?? 100;
const maxDuration = options.maxDurationMs ?? 30000;
+ const rotationTrack = options.rotationTrack;
const start = Date.now();
let piecesPlaced = 0;
let linesCleared = 0;
@@ -1445,14 +1517,51 @@ async function playGame(
if (snap.activePieceCells && snap.activePieceCells.length === 4) {
const pieceType = snap.activePieceType || "unknown";
+ // Rotation probe: for the first time we see this piece type, press
+ // rotate 4 times and record each resulting shape. A correctly
+ // working game cycles through the piece's rotation states and
+ // returns to baseline after 4 presses (so executePlacement below
+ // can proceed normally from the same rotation state). A broken
+ // game where rotation only fires once (or stalls) will record
+ // fewer distinct shapes -- which is exactly what the
+ // all_pieces_rotate test is looking for.
+ if (
+ rotationTrack &&
+ pieceType !== "unknown" &&
+ pieceType !== "O" &&
+ !rotationTrack.has(pieceType)
+ ) {
+ const shapes = new Set<string>();
+ shapes.add(normalizedShapeKey(snap.activePieceCells));
+ for (let r = 0; r < 4; r++) {
+ await driver.pressKey("rotate");
+ await driver.wait(80);
+ const rotSnap = await driver.readGrid(settledGrid);
+ if (rotSnap.activePieceCells && rotSnap.activePieceCells.length > 0) {
+ shapes.add(normalizedShapeKey(rotSnap.activePieceCells));
+ }
+ }
+ rotationTrack.set(pieceType, shapes);
+ }
+
+ // Re-read snap after rotation probe so activePieceCells reflects
+ // the current position (auto-drop may have shifted the piece).
+ let workingSnap = snap;
+ if (rotationTrack && pieceType !== "unknown" && pieceType !== "O") {
+ const freshSnap = await driver.readGrid(settledGrid);
+ if (freshSnap.grid && freshSnap.activePieceCells && freshSnap.activePieceCells.length === 4) {
+ workingSnap = freshSnap;
+ }
+ }
+
// Save the locked board as-of right now (no active piece). This is
// what findBestPlacement evaluates against, and what we use as the
// diff base for the NEXT iteration's active-piece detection.
- const boardBeforePlacement = stripActivePiece(snap.grid, snap.activePieceCells);
+ const boardBeforePlacement = stripActivePiece(workingSnap.grid!, workingSnap.activePieceCells!);
const placement = findBestPlacement(boardBeforePlacement, pieceType as PieceType);
if (placement) {
- await executePlacement(driver, placement, snap.activePieceCells);
+ await executePlacement(driver, placement, workingSnap.activePieceCells!);
linesCleared += placement.linesCleared;
piecesPlaced++;
} else {
@@ -1665,14 +1774,43 @@ function deriveTestResults(
}
// 7. rotate
+ //
+ // A correctly working game should cycle a non-O piece through at least 2
+ // rotation states (I/S/Z have 2 states; J/L/T have 4). We press rotate 4
+ // times and count distinct normalized shapes; a broken game where
+ // rotation only fires once will only produce 2 shapes total (initial +
+ // first rotation), which is NOT enough. We require 3 distinct shapes
+ // overall (initial + at least 2 other states).
+ //
+ // Fallback: if the Phase 3 probe landed on an O piece (only 1 shape) or
+ // the grid reader couldn't identify the active piece cells, we also
+ // consult the gameplay-phase per-piece tracking -- any piece type with
+ // 3+ distinct shapes observed there proves the game rotates correctly.
+ const gameplayRotationShapesMax = [...session.rotationShapesByPiece.values()]
+ .reduce((max, set) => Math.max(max, set.size), 0);
+ const maxShapesSeen = Math.max(session.distinctRotationShapes, gameplayRotationShapesMax);
if (!phaseState.gameStarted) {
results.push(skipResult("rotate", "game did not start"));
- } else if (session.rotationsObserved > 0) {
- results.push({ name: "rotate", pass: true, detail: `piece shape changed after rotate key (grid-verified, ${session.rotationsObserved} rotation(s))` });
+ } else if (maxShapesSeen >= 3) {
+ results.push({
+ name: "rotate",
+ pass: true,
+ detail: `piece cycled through ${maxShapesSeen} distinct shapes after 4 rotate presses (grid-verified)`,
+ });
} else if (!gridReliable) {
results.push({ name: "rotate", pass: false, detail: "grid reader unreliable, cannot verify rotation" });
+ } else if (maxShapesSeen === 2) {
+ results.push({
+ name: "rotate",
+ pass: false,
+ detail: "piece only reached 1 rotation state then stalled (expected at least 2 distinct non-baseline shapes)",
+ });
} else {
- results.push({ name: "rotate", pass: false, detail: "no shape change detected after rotate key" });
+ results.push({
+ name: "rotate",
+ pass: false,
+ detail: `no shape change detected after rotate key (${maxShapesSeen} distinct shape(s))`,
+ });
}
// 8. hard_drop
@@ -1687,16 +1825,70 @@ function deriveTestResults(
}
// 9. all_pieces_rotate
+ //
+ // During gameplay we probe each new piece type by pressing rotate 4
+ // times and recording distinct normalized shapes. A correctly working
+ // game cycles J/L/T through 4 rotation states (we expect to observe 3+
+ // distinct shapes even accounting for timing jitter).
+ //
+ // Pass rule: at least 2 multi-state piece types (J/L/T) reached 3+
+ // distinct shapes (baseline + at least 2 other states). This excludes
+ // broken games that rotate exactly once before getting stuck (2 shapes
+ // total). We require multi-state pieces because I/S/Z in classic-style
+ // games only have 2 rotation states, so we can't distinguish "broken
+ // rotation stuck at state 1" from "working 2-state S/Z".
+ //
+ // Skip if fewer than 2 J/L/T types were ever seen in the gameplay
+ // phase -- not enough data to make the claim.
if (!phaseState.gameStarted) {
results.push(skipResult("all_pieces_rotate", "game did not start"));
} else {
- const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown");
- if (session.rotationsObserved > 0 && nonOPieceTypes.length > 0) {
- results.push({ name: "all_pieces_rotate", pass: true, detail: `rotation observed, piece types seen: [${[...session.pieceTypes].join(", ")}]` });
- } else if (session.rotationsObserved > 0) {
- results.push({ name: "all_pieces_rotate", pass: true, detail: "rotation confirmed but could not identify individual piece types" });
+ // Union of piece types visible via session.pieceTypes AND the tracking
+ // map keys, since the tracking map is populated in gameplay phases that
+ // don't add to pieceTypes.
+ const allSeenTypes = new Set<string>([
+ ...session.pieceTypes,
+ ...session.rotationShapesByPiece.keys(),
+ ]);
+ const multiStateTypesSeen = [...allSeenTypes].filter((t) =>
+ ["J", "L", "T"].includes(t)
+ );
+ const trackedTypes = [...session.rotationShapesByPiece.entries()];
+ const multiStateRotated = trackedTypes.filter(
+ ([t, shapes]) => ["J", "L", "T"].includes(t) && shapes.size >= 3
+ );
+ if (multiStateTypesSeen.length < 2) {
+ results.push(
+ skipResult(
+ "all_pieces_rotate",
+ `not enough piece types to verify (saw ${multiStateTypesSeen.length} of J/L/T, need 2)`
+ )
+ );
+ } else if (multiStateRotated.length >= 2) {
+ const detail = multiStateRotated
+ .map(([t, s]) => `${t}:${s.size}`)
+ .join(" ");
+ results.push({
+ name: "all_pieces_rotate",
+ pass: true,
+ detail: `${multiStateRotated.length} J/L/T piece type(s) rotated to 3+ distinct shapes [${detail}]`,
+ });
+ } else if (!gridReliable) {
+ results.push({
+ name: "all_pieces_rotate",
+ pass: false,
+ detail: "grid reader unreliable, cannot verify per-piece rotation",
+ });
} else {
- results.push({ name: "all_pieces_rotate", pass: false, detail: "could not detect any piece rotations via grid reader" });
+ const detail = trackedTypes
+ .filter(([t]) => ["J", "L", "T"].includes(t))
+ .map(([t, s]) => `${t}:${s.size}`)
+ .join(" ");
+ results.push({
+ name: "all_pieces_rotate",
+ pass: false,
+ detail: `only ${multiStateRotated.length} of ${multiStateTypesSeen.length} J/L/T rotated to 3+ distinct shapes (need 2) [${detail}]`,
+ });
}
}
diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts
@@ -329,6 +329,19 @@ export interface GameSession {
frames: number;
events: GridEvent[];
skippedPhases: string[];
+ /**
+ * Maximum number of distinct normalized shapes observed while pressing
+ * rotate 4 times on a single piece in Phase 3 (basic mechanics). Used by
+ * the rotate test to verify the game actually cycles through rotation
+ * states rather than allowing only one rotation.
+ */
+ distinctRotationShapes: number;
+ /**
+ * Per-piece-type set of distinct normalized shapes observed during the
+ * rotation probe in gameplay phases. Used by all_pieces_rotate to verify
+ * that multiple piece types can each rotate through 3+ distinct shapes.
+ */
+ rotationShapesByPiece: Map<string, Set<string>>;
}
/** An event observed during continuous grid scanning. */