V2: stricter rotation test requires distinct rotation states - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit d1b5c77738368fcf645c325b912383d5c69f22ed
parent 669aa68861617a446e5ae56aea0a462995d183f0
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat, 11 Apr 2026 09:12:39 +0200

V2: stricter rotation test requires distinct rotation states

Previous rotate test passed if pressing rotate caused ANY grid change.
Games with broken rotation (only 1 of 4 states works) would pass.

New test:
- Press rotate 4 times, wait 100ms between each
- Record normalized active piece shape after each press
- rotate test: passes if 3+ distinct shapes (baseline + at least 2 other rotation states)
- all_pieces_rotate: passes if 2+ J/L/T pieces reach 3+ distinct shapes
- Skips if fewer than 2 J/L/T piece types seen

Uses position-invariant shape keys (normalized to top-left origin) so
auto-drop during the test doesn't confuse the comparison.

Tracking:
- session.distinctRotationShapes: max observed in Phase 3 probe
- session.rotationShapesByPiece: Map<piece_type, Set<shape_key>>
- playGame accepts rotationTrack param for gameplay-phase probing

Results:
- 9805c24a (broken): rotate now FAIL, all_pieces_rotate FAIL (was PASS/PASS)
- cbbff570 (flaky): rotate FAIL (was PASS)
- 4c7db3b9 (working): 100% score (up from 94%)
- 1d08ee76: 95% (up from 89%)
- 8fe72fce: 94% (unchanged)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/bot.ts  | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M tasks/tetris/eval/gameplay-bot-v2/types.ts  | 13 +++++++++++++

2 files changed, 243 insertions(+), 38 deletions(-)
diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts
@@ -259,6 +259,22 @@ function boundingBox(cells: [number, number][]): { w: number; h: number } {
   return { w: maxCol - minCol + 1, h: maxRow - minRow + 1 };
 }
 
+/**
+ * Compute a position-invariant key for an active piece shape. Subtracts the
+ * minimum row/col so the shape is translated to (0,0), then sorts and joins.
+ * Two active-piece cell sets that differ only by position (e.g. because the
+ * piece auto-dropped during the test) will produce the same key.
+ */
+function normalizedShapeKey(cells: [number, number][]): string {
+  if (cells.length === 0) return "";
+  const minRow = Math.min(...cells.map(([r]) => r));
+  const minCol = Math.min(...cells.map(([, c]) => c));
+  return cells
+    .map(([r, c]) => `${r - minRow},${c - minCol}`)
+    .sort()
+    .join("|");
+}
+
 function countFilledInTopRows(grid: Grid, rows: number): number {
   let count = 0;
   for (let r = 0; r < Math.min(rows, grid.length); r++) {
@@ -376,6 +392,8 @@ export async function runAllTests(
     frames: 0,
     events: [],
     skippedPhases: [],
+    distinctRotationShapes: 0,
+    rotationShapesByPiece: new Map<string, Set<string>>(),
   };
 
   let survey: SurveyData = {
@@ -654,43 +672,65 @@ async function runBasicMechanicsPhase(
     }
   }
 
-  // Rotation test
+  // Rotation test: press rotate 4 times and check how many DISTINCT shapes
+  // the piece cycles through. A correctly working Tetris game lets you
+  // rotate a non-O piece to at least 2 different states (I/S/Z have 2 total
+  // states, J/L/T have 4). A broken game where rotation only changes the
+  // piece once (then gets stuck) will only show 1 non-baseline shape, which
+  // is not enough to pass this test.
+  //
+  // Shape comparison is position-invariant (see normalizedShapeKey) so that
+  // auto-drop between presses doesn't confuse the detector.
   const snapBeforeRot = await driver.readGrid();
   if (snapBeforeRot.grid) session.gridReadSuccess++;
   else session.gridReadFail++;
   session.frames++;
 
-  await driver.pressKey("rotate");
-  await driver.wait(300);
+  const observedShapes = new Set<string>();
+  if (snapBeforeRot.activePieceCells && snapBeforeRot.activePieceCells.length > 0) {
+    observedShapes.add(normalizedShapeKey(snapBeforeRot.activePieceCells));
+  }
 
-  const snapAfterRot = await driver.readGrid();
-  if (snapAfterRot.grid) session.gridReadSuccess++;
-  else session.gridReadFail++;
-  session.frames++;
+  let lastRotationSnap = snapBeforeRot;
+  for (let i = 0; i < 4; i++) {
+    await driver.pressKey("rotate");
+    await driver.wait(100);
 
-  if (snapBeforeRot.grid && snapAfterRot.grid && driver.gridsAreDifferent(snapBeforeRot.grid, snapAfterRot.grid)) {
-    const cellsBefore = snapBeforeRot.activePieceCells;
-    const cellsAfter = snapAfterRot.activePieceCells;
-    if (cellsBefore && cellsAfter) {
-      const bbBefore = boundingBox(cellsBefore);
-      const bbAfter = boundingBox(cellsAfter);
-      if (bbBefore.w !== bbAfter.w || bbBefore.h !== bbAfter.h) {
-        session.rotationsObserved++;
-        session.events.push({ type: "piece_rotated", frame: session.frames });
-      } else {
-        const keyBefore = cellsBefore.map(([r, c]) => `${r},${c}`).sort().join("|");
-        const keyAfter = cellsAfter.map(([r, c]) => `${r},${c}`).sort().join("|");
-        if (keyBefore !== keyAfter) {
-          session.rotationsObserved++;
-          session.events.push({ type: "piece_rotated", frame: session.frames });
-        }
-      }
-    } else {
+    const snap = await driver.readGrid();
+    if (snap.grid) session.gridReadSuccess++;
+    else session.gridReadFail++;
+    session.frames++;
+
+    if (snap.activePieceCells && snap.activePieceCells.length > 0) {
+      observedShapes.add(normalizedShapeKey(snap.activePieceCells));
+    }
+    lastRotationSnap = snap;
+
+    // Any observed change in the grid between successive rotations counts as
+    // a "rotation observed" event for legacy metrics. This keeps the gameplay
+    // phase's counters consistent with the old behavior for downstream tests
+    // that still use session.rotationsObserved.
+    if (
+      i === 0 &&
+      snapBeforeRot.grid &&
+      snap.grid &&
+      driver.gridsAreDifferent(snapBeforeRot.grid, snap.grid)
+    ) {
       session.rotationsObserved++;
       session.events.push({ type: "piece_rotated", frame: session.frames });
     }
   }
 
+  // Record the max number of distinct shapes ever seen in a single rotation
+  // test. Later gameplay-phase probes may overwrite this if they see more.
+  if (observedShapes.size > session.distinctRotationShapes) {
+    session.distinctRotationShapes = observedShapes.size;
+  }
+
+  // Silence unused-variable warning: lastRotationSnap is kept for potential
+  // debugging/extension.
+  void lastRotationSnap;
+
   // Hard drop test
   const snapBeforeDrop = await driver.readGrid();
   if (snapBeforeDrop.grid) session.gridReadSuccess++;
@@ -768,6 +808,7 @@ async function runGameplayPhase(
   const result = await playGame(driver, {
     maxPieces: 60,
     maxDurationMs: 45000,
+    rotationTrack: session.rotationShapesByPiece,
   });
   gameplay.pieces_placed += result.piecesPlaced;
   gameplay.errors_during_play += result.errors;
@@ -913,7 +954,10 @@ async function runEndurancePhase(
   const errorsBefore = driver.getConsoleErrors().length;
   const start = Date.now();
 
-  const result = await playGame(driver, { maxDurationMs: 30000 });
+  const result = await playGame(driver, {
+    maxDurationMs: 30000,
+    rotationTrack: session.rotationShapesByPiece,
+  });
 
   const elapsed = Math.round((Date.now() - start) / 1000);
   gameplay.pieces_placed += result.piecesPlaced;
@@ -1226,6 +1270,29 @@ async function runCompetitivePlayPhase(
         const pieceType = snap.activePieceType || "unknown";
         session.pieceTypes.add(pieceType);
 
+        // Rotation probe for all_pieces_rotate: for the first time we see
+        // this piece type in competitive play, press rotate 4 times and
+        // record distinct normalized shapes. A correctly working game
+        // returns to baseline after 4 presses so the placement logic below
+        // continues from the same rotation state. Skip O and unknown.
+        if (
+          pieceType !== "unknown" &&
+          pieceType !== "O" &&
+          !session.rotationShapesByPiece.has(pieceType)
+        ) {
+          const shapes = new Set<string>();
+          shapes.add(normalizedShapeKey(snap.activePieceCells));
+          for (let r = 0; r < 4; r++) {
+            await driver.pressKey("rotate");
+            await driver.wait(80);
+            const rotSnap = await driver.readGrid(settledGrid);
+            if (rotSnap.activePieceCells && rotSnap.activePieceCells.length > 0) {
+              shapes.add(normalizedShapeKey(rotSnap.activePieceCells));
+            }
+          }
+          session.rotationShapesByPiece.set(pieceType, shapes);
+        }
+
         // Soft drop test
         if (!softDropTestDone && result.pieces_placed > 3 && result.pieces_placed % 5 === 0) {
           const snapBeforeDown = await driver.readGrid(settledGrid);
@@ -1397,10 +1464,15 @@ async function runCompetitivePlayPhase(
 
 async function playGame(
   driver: TetrisDriver,
-  options: { maxPieces?: number; maxDurationMs?: number }
+  options: {
+    maxPieces?: number;
+    maxDurationMs?: number;
+    rotationTrack?: Map<string, Set<string>>;
+  }
 ): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[] }> {
   const maxPieces = options.maxPieces ?? 100;
   const maxDuration = options.maxDurationMs ?? 30000;
+  const rotationTrack = options.rotationTrack;
   const start = Date.now();
   let piecesPlaced = 0;
   let linesCleared = 0;
@@ -1445,14 +1517,51 @@ async function playGame(
       if (snap.activePieceCells && snap.activePieceCells.length === 4) {
         const pieceType = snap.activePieceType || "unknown";
 
+        // Rotation probe: for the first time we see this piece type, press
+        // rotate 4 times and record each resulting shape. A correctly
+        // working game cycles through the piece's rotation states and
+        // returns to baseline after 4 presses (so executePlacement below
+        // can proceed normally from the same rotation state). A broken
+        // game where rotation only fires once (or stalls) will record
+        // fewer distinct shapes -- which is exactly what the
+        // all_pieces_rotate test is looking for.
+        if (
+          rotationTrack &&
+          pieceType !== "unknown" &&
+          pieceType !== "O" &&
+          !rotationTrack.has(pieceType)
+        ) {
+          const shapes = new Set<string>();
+          shapes.add(normalizedShapeKey(snap.activePieceCells));
+          for (let r = 0; r < 4; r++) {
+            await driver.pressKey("rotate");
+            await driver.wait(80);
+            const rotSnap = await driver.readGrid(settledGrid);
+            if (rotSnap.activePieceCells && rotSnap.activePieceCells.length > 0) {
+              shapes.add(normalizedShapeKey(rotSnap.activePieceCells));
+            }
+          }
+          rotationTrack.set(pieceType, shapes);
+        }
+
+        // Re-read snap after rotation probe so activePieceCells reflects
+        // the current position (auto-drop may have shifted the piece).
+        let workingSnap = snap;
+        if (rotationTrack && pieceType !== "unknown" && pieceType !== "O") {
+          const freshSnap = await driver.readGrid(settledGrid);
+          if (freshSnap.grid && freshSnap.activePieceCells && freshSnap.activePieceCells.length === 4) {
+            workingSnap = freshSnap;
+          }
+        }
+
         // Save the locked board as-of right now (no active piece). This is
         // what findBestPlacement evaluates against, and what we use as the
         // diff base for the NEXT iteration's active-piece detection.
-        const boardBeforePlacement = stripActivePiece(snap.grid, snap.activePieceCells);
+        const boardBeforePlacement = stripActivePiece(workingSnap.grid!, workingSnap.activePieceCells!);
         const placement = findBestPlacement(boardBeforePlacement, pieceType as PieceType);
 
         if (placement) {
-          await executePlacement(driver, placement, snap.activePieceCells);
+          await executePlacement(driver, placement, workingSnap.activePieceCells!);
           linesCleared += placement.linesCleared;
           piecesPlaced++;
         } else {
@@ -1665,14 +1774,43 @@ function deriveTestResults(
   }
 
   // 7. rotate
+  //
+  // A correctly working game should cycle a non-O piece through at least 2
+  // rotation states (I/S/Z have 2 states; J/L/T have 4). We press rotate 4
+  // times and count distinct normalized shapes; a broken game where
+  // rotation only fires once will only produce 2 shapes total (initial +
+  // first rotation), which is NOT enough. We require 3 distinct shapes
+  // overall (initial + at least 2 other states).
+  //
+  // Fallback: if the Phase 3 probe landed on an O piece (only 1 shape) or
+  // the grid reader couldn't identify the active piece cells, we also
+  // consult the gameplay-phase per-piece tracking -- any piece type with
+  // 3+ distinct shapes observed there proves the game rotates correctly.
+  const gameplayRotationShapesMax = [...session.rotationShapesByPiece.values()]
+    .reduce((max, set) => Math.max(max, set.size), 0);
+  const maxShapesSeen = Math.max(session.distinctRotationShapes, gameplayRotationShapesMax);
   if (!phaseState.gameStarted) {
     results.push(skipResult("rotate", "game did not start"));
-  } else if (session.rotationsObserved > 0) {
-    results.push({ name: "rotate", pass: true, detail: `piece shape changed after rotate key (grid-verified, ${session.rotationsObserved} rotation(s))` });
+  } else if (maxShapesSeen >= 3) {
+    results.push({
+      name: "rotate",
+      pass: true,
+      detail: `piece cycled through ${maxShapesSeen} distinct shapes after 4 rotate presses (grid-verified)`,
+    });
   } else if (!gridReliable) {
     results.push({ name: "rotate", pass: false, detail: "grid reader unreliable, cannot verify rotation" });
+  } else if (maxShapesSeen === 2) {
+    results.push({
+      name: "rotate",
+      pass: false,
+      detail: "piece only reached 1 rotation state then stalled (expected at least 2 distinct non-baseline shapes)",
+    });
   } else {
-    results.push({ name: "rotate", pass: false, detail: "no shape change detected after rotate key" });
+    results.push({
+      name: "rotate",
+      pass: false,
+      detail: `no shape change detected after rotate key (${maxShapesSeen} distinct shape(s))`,
+    });
   }
 
   // 8. hard_drop
@@ -1687,16 +1825,70 @@ function deriveTestResults(
   }
 
   // 9. all_pieces_rotate
+  //
+  // During gameplay we probe each new piece type by pressing rotate 4
+  // times and recording distinct normalized shapes. A correctly working
+  // game cycles J/L/T through 4 rotation states (we expect to observe 3+
+  // distinct shapes even accounting for timing jitter).
+  //
+  // Pass rule: at least 2 multi-state piece types (J/L/T) reached 3+
+  // distinct shapes (baseline + at least 2 other states). This excludes
+  // broken games that rotate exactly once before getting stuck (2 shapes
+  // total). We require multi-state pieces because I/S/Z in classic-style
+  // games only have 2 rotation states, so we can't distinguish "broken
+  // rotation stuck at state 1" from "working 2-state S/Z".
+  //
+  // Skip if fewer than 2 J/L/T types were ever seen in the gameplay
+  // phase -- not enough data to make the claim.
   if (!phaseState.gameStarted) {
     results.push(skipResult("all_pieces_rotate", "game did not start"));
   } else {
-    const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown");
-    if (session.rotationsObserved > 0 && nonOPieceTypes.length > 0) {
-      results.push({ name: "all_pieces_rotate", pass: true, detail: `rotation observed, piece types seen: [${[...session.pieceTypes].join(", ")}]` });
-    } else if (session.rotationsObserved > 0) {
-      results.push({ name: "all_pieces_rotate", pass: true, detail: "rotation confirmed but could not identify individual piece types" });
+    // Union of piece types visible via session.pieceTypes AND the tracking
+    // map keys, since the tracking map is populated in gameplay phases that
+    // don't add to pieceTypes.
+    const allSeenTypes = new Set<string>([
+      ...session.pieceTypes,
+      ...session.rotationShapesByPiece.keys(),
+    ]);
+    const multiStateTypesSeen = [...allSeenTypes].filter((t) =>
+      ["J", "L", "T"].includes(t)
+    );
+    const trackedTypes = [...session.rotationShapesByPiece.entries()];
+    const multiStateRotated = trackedTypes.filter(
+      ([t, shapes]) => ["J", "L", "T"].includes(t) && shapes.size >= 3
+    );
+    if (multiStateTypesSeen.length < 2) {
+      results.push(
+        skipResult(
+          "all_pieces_rotate",
+          `not enough piece types to verify (saw ${multiStateTypesSeen.length} of J/L/T, need 2)`
+        )
+      );
+    } else if (multiStateRotated.length >= 2) {
+      const detail = multiStateRotated
+        .map(([t, s]) => `${t}:${s.size}`)
+        .join(" ");
+      results.push({
+        name: "all_pieces_rotate",
+        pass: true,
+        detail: `${multiStateRotated.length} J/L/T piece type(s) rotated to 3+ distinct shapes [${detail}]`,
+      });
+    } else if (!gridReliable) {
+      results.push({
+        name: "all_pieces_rotate",
+        pass: false,
+        detail: "grid reader unreliable, cannot verify per-piece rotation",
+      });
     } else {
-      results.push({ name: "all_pieces_rotate", pass: false, detail: "could not detect any piece rotations via grid reader" });
+      const detail = trackedTypes
+        .filter(([t]) => ["J", "L", "T"].includes(t))
+        .map(([t, s]) => `${t}:${s.size}`)
+        .join(" ");
+      results.push({
+        name: "all_pieces_rotate",
+        pass: false,
+        detail: `only ${multiStateRotated.length} of ${multiStateTypesSeen.length} J/L/T rotated to 3+ distinct shapes (need 2) [${detail}]`,
+      });
     }
   }
 
diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts
@@ -329,6 +329,19 @@ export interface GameSession {
   frames: number;
   events: GridEvent[];
   skippedPhases: string[];
+  /**
+   * Maximum number of distinct normalized shapes observed while pressing
+   * rotate 4 times on a single piece in Phase 3 (basic mechanics). Used by
+   * the rotate test to verify the game actually cycles through rotation
+   * states rather than allowing only one rotation.
+   */
+  distinctRotationShapes: number;
+  /**
+   * Per-piece-type set of distinct normalized shapes observed during the
+   * rotation probe in gameplay phases. all_pieces_rotate passes when at
+   * least two of J/L/T reach 3+ distinct shapes; rotate uses it as fallback.
+   */
+  rotationShapesByPiece: Map<string, Set<string>>;
 }
 
 /** An event observed during continuous grid scanning. */

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	tasks/tetris/eval/gameplay-bot-v2/bot.ts	\|	268	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M	tasks/tetris/eval/gameplay-bot-v2/types.ts	\|	13	+++++++++++++