loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit d1b5c77738368fcf645c325b912383d5c69f22ed
parent 669aa68861617a446e5ae56aea0a462995d183f0
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat, 11 Apr 2026 09:12:39 +0200

V2: stricter rotation test requires distinct rotation states

Previous rotate test passed if pressing rotate caused ANY grid change.
Games with broken rotation (only 1 of 4 states works) would pass.

New test:
- Press rotate 4 times, wait 100ms between each
- Record normalized active piece shape after each press
- rotate test: passes if 3+ distinct shapes (baseline + at least 2 rotated states)
- all_pieces_rotate: passes if 2+ J/L/T pieces reach 3+ distinct shapes
- Skips if fewer than 2 J/L/T piece types seen

Uses position-invariant shape keys (normalized to top-left origin) so
auto-drop during the test doesn't confuse the comparison.

Tracking:
- session.distinctRotationShapes: max observed in Phase 3 probe
- session.rotationShapesByPiece: Map<piece_type, Set<shape_key>>
- playGame accepts rotationTrack param for gameplay-phase probing

Results:
- 9805c24a (broken): rotate now FAIL, all_pieces_rotate FAIL (was PASS/PASS)
- cbbff570 (flaky): rotate FAIL (was PASS)
- 4c7db3b9 (working): 100% score (up from 94%)
- 1d08ee76: 95% (up from 89%)
- 8fe72fce: 94% (unchanged)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mtasks/tetris/eval/gameplay-bot-v2/bot.ts | 268+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Mtasks/tetris/eval/gameplay-bot-v2/types.ts | 13+++++++++++++
2 files changed, 243 insertions(+), 38 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -259,6 +259,22 @@ function boundingBox(cells: [number, number][]): { w: number; h: number } { return { w: maxCol - minCol + 1, h: maxRow - minRow + 1 }; } +/** + * Compute a position-invariant key for an active piece shape. Subtracts the + * minimum row/col so the shape is translated to (0,0), then sorts and joins. + * Two active-piece cell sets that differ only by position (e.g. because the + * piece auto-dropped during the test) will produce the same key. + */ +function normalizedShapeKey(cells: [number, number][]): string { + if (cells.length === 0) return ""; + const minRow = Math.min(...cells.map(([r]) => r)); + const minCol = Math.min(...cells.map(([, c]) => c)); + return cells + .map(([r, c]) => `${r - minRow},${c - minCol}`) + .sort() + .join("|"); +} + function countFilledInTopRows(grid: Grid, rows: number): number { let count = 0; for (let r = 0; r < Math.min(rows, grid.length); r++) { @@ -376,6 +392,8 @@ export async function runAllTests( frames: 0, events: [], skippedPhases: [], + distinctRotationShapes: 0, + rotationShapesByPiece: new Map<string, Set<string>>(), }; let survey: SurveyData = { @@ -654,43 +672,65 @@ async function runBasicMechanicsPhase( } } - // Rotation test + // Rotation test: press rotate 4 times and check how many DISTINCT shapes + // the piece cycles through. A correctly working Tetris game lets you + // rotate a non-O piece to at least 2 different states (I/S/Z have 2 total + // states, J/L/T have 4). A broken game where rotation only changes the + // piece once (then gets stuck) will only show 1 non-baseline shape, which + // is not enough to pass this test. + // + // Shape comparison is position-invariant (see normalizedShapeKey) so that + // auto-drop between presses doesn't confuse the detector. 
const snapBeforeRot = await driver.readGrid(); if (snapBeforeRot.grid) session.gridReadSuccess++; else session.gridReadFail++; session.frames++; - await driver.pressKey("rotate"); - await driver.wait(300); + const observedShapes = new Set<string>(); + if (snapBeforeRot.activePieceCells && snapBeforeRot.activePieceCells.length > 0) { + observedShapes.add(normalizedShapeKey(snapBeforeRot.activePieceCells)); + } - const snapAfterRot = await driver.readGrid(); - if (snapAfterRot.grid) session.gridReadSuccess++; - else session.gridReadFail++; - session.frames++; + let lastRotationSnap = snapBeforeRot; + for (let i = 0; i < 4; i++) { + await driver.pressKey("rotate"); + await driver.wait(100); - if (snapBeforeRot.grid && snapAfterRot.grid && driver.gridsAreDifferent(snapBeforeRot.grid, snapAfterRot.grid)) { - const cellsBefore = snapBeforeRot.activePieceCells; - const cellsAfter = snapAfterRot.activePieceCells; - if (cellsBefore && cellsAfter) { - const bbBefore = boundingBox(cellsBefore); - const bbAfter = boundingBox(cellsAfter); - if (bbBefore.w !== bbAfter.w || bbBefore.h !== bbAfter.h) { - session.rotationsObserved++; - session.events.push({ type: "piece_rotated", frame: session.frames }); - } else { - const keyBefore = cellsBefore.map(([r, c]) => `${r},${c}`).sort().join("|"); - const keyAfter = cellsAfter.map(([r, c]) => `${r},${c}`).sort().join("|"); - if (keyBefore !== keyAfter) { - session.rotationsObserved++; - session.events.push({ type: "piece_rotated", frame: session.frames }); - } - } - } else { + const snap = await driver.readGrid(); + if (snap.grid) session.gridReadSuccess++; + else session.gridReadFail++; + session.frames++; + + if (snap.activePieceCells && snap.activePieceCells.length > 0) { + observedShapes.add(normalizedShapeKey(snap.activePieceCells)); + } + lastRotationSnap = snap; + + // Any observed change in the grid between successive rotations counts as + // a "rotation observed" event for legacy metrics. 
This keeps the gameplay + // phase's counters consistent with the old behavior for downstream tests + // that still use session.rotationsObserved. + if ( + i === 0 && + snapBeforeRot.grid && + snap.grid && + driver.gridsAreDifferent(snapBeforeRot.grid, snap.grid) + ) { session.rotationsObserved++; session.events.push({ type: "piece_rotated", frame: session.frames }); } } + // Record the max number of distinct shapes ever seen in a single rotation + // test. Later gameplay-phase probes may overwrite this if they see more. + if (observedShapes.size > session.distinctRotationShapes) { + session.distinctRotationShapes = observedShapes.size; + } + + // Silence unused-variable warning: lastRotationSnap is kept for potential + // debugging/extension. + void lastRotationSnap; + // Hard drop test const snapBeforeDrop = await driver.readGrid(); if (snapBeforeDrop.grid) session.gridReadSuccess++; @@ -768,6 +808,7 @@ async function runGameplayPhase( const result = await playGame(driver, { maxPieces: 60, maxDurationMs: 45000, + rotationTrack: session.rotationShapesByPiece, }); gameplay.pieces_placed += result.piecesPlaced; gameplay.errors_during_play += result.errors; @@ -913,7 +954,10 @@ async function runEndurancePhase( const errorsBefore = driver.getConsoleErrors().length; const start = Date.now(); - const result = await playGame(driver, { maxDurationMs: 30000 }); + const result = await playGame(driver, { + maxDurationMs: 30000, + rotationTrack: session.rotationShapesByPiece, + }); const elapsed = Math.round((Date.now() - start) / 1000); gameplay.pieces_placed += result.piecesPlaced; @@ -1226,6 +1270,29 @@ async function runCompetitivePlayPhase( const pieceType = snap.activePieceType || "unknown"; session.pieceTypes.add(pieceType); + // Rotation probe for all_pieces_rotate: for the first time we see + // this piece type in competitive play, press rotate 4 times and + // record distinct normalized shapes. 
A correctly working game + // returns to baseline after 4 presses so the placement logic below + // continues from the same rotation state. Skip O and unknown. + if ( + pieceType !== "unknown" && + pieceType !== "O" && + !session.rotationShapesByPiece.has(pieceType) + ) { + const shapes = new Set<string>(); + shapes.add(normalizedShapeKey(snap.activePieceCells)); + for (let r = 0; r < 4; r++) { + await driver.pressKey("rotate"); + await driver.wait(80); + const rotSnap = await driver.readGrid(settledGrid); + if (rotSnap.activePieceCells && rotSnap.activePieceCells.length > 0) { + shapes.add(normalizedShapeKey(rotSnap.activePieceCells)); + } + } + session.rotationShapesByPiece.set(pieceType, shapes); + } + // Soft drop test if (!softDropTestDone && result.pieces_placed > 3 && result.pieces_placed % 5 === 0) { const snapBeforeDown = await driver.readGrid(settledGrid); @@ -1397,10 +1464,15 @@ async function runCompetitivePlayPhase( async function playGame( driver: TetrisDriver, - options: { maxPieces?: number; maxDurationMs?: number } + options: { + maxPieces?: number; + maxDurationMs?: number; + rotationTrack?: Map<string, Set<string>>; + } ): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[] }> { const maxPieces = options.maxPieces ?? 100; const maxDuration = options.maxDurationMs ?? 30000; + const rotationTrack = options.rotationTrack; const start = Date.now(); let piecesPlaced = 0; let linesCleared = 0; @@ -1445,14 +1517,51 @@ async function playGame( if (snap.activePieceCells && snap.activePieceCells.length === 4) { const pieceType = snap.activePieceType || "unknown"; + // Rotation probe: for the first time we see this piece type, press + // rotate 4 times and record each resulting shape. 
A correctly + // working game cycles through the piece's rotation states and + // returns to baseline after 4 presses (so executePlacement below + // can proceed normally from the same rotation state). A broken + // game where rotation only fires once (or stalls) will record + // fewer distinct shapes -- which is exactly what the + // all_pieces_rotate test is looking for. + if ( + rotationTrack && + pieceType !== "unknown" && + pieceType !== "O" && + !rotationTrack.has(pieceType) + ) { + const shapes = new Set<string>(); + shapes.add(normalizedShapeKey(snap.activePieceCells)); + for (let r = 0; r < 4; r++) { + await driver.pressKey("rotate"); + await driver.wait(80); + const rotSnap = await driver.readGrid(settledGrid); + if (rotSnap.activePieceCells && rotSnap.activePieceCells.length > 0) { + shapes.add(normalizedShapeKey(rotSnap.activePieceCells)); + } + } + rotationTrack.set(pieceType, shapes); + } + + // Re-read snap after rotation probe so activePieceCells reflects + // the current position (auto-drop may have shifted the piece). + let workingSnap = snap; + if (rotationTrack && pieceType !== "unknown" && pieceType !== "O") { + const freshSnap = await driver.readGrid(settledGrid); + if (freshSnap.grid && freshSnap.activePieceCells && freshSnap.activePieceCells.length === 4) { + workingSnap = freshSnap; + } + } + // Save the locked board as-of right now (no active piece). This is // what findBestPlacement evaluates against, and what we use as the // diff base for the NEXT iteration's active-piece detection. 
- const boardBeforePlacement = stripActivePiece(snap.grid, snap.activePieceCells); + const boardBeforePlacement = stripActivePiece(workingSnap.grid!, workingSnap.activePieceCells!); const placement = findBestPlacement(boardBeforePlacement, pieceType as PieceType); if (placement) { - await executePlacement(driver, placement, snap.activePieceCells); + await executePlacement(driver, placement, workingSnap.activePieceCells!); linesCleared += placement.linesCleared; piecesPlaced++; } else { @@ -1665,14 +1774,43 @@ function deriveTestResults( } // 7. rotate + // + // A correctly working game should cycle a non-O piece through at least 2 + // rotation states (I/S/Z have 2 states; J/L/T have 4). We press rotate 4 + // times and count distinct normalized shapes; a broken game where + // rotation only fires once will only produce 2 shapes total (initial + + // first rotation), which is NOT enough. We require 3 distinct shapes + // overall (initial + at least 2 other states). + // + // Fallback: if the Phase 3 probe landed on an O piece (only 1 shape) or + // the grid reader couldn't identify the active piece cells, we also + // consult the gameplay-phase per-piece tracking -- any piece type with + // 3+ distinct shapes observed there proves the game rotates correctly. 
+ const gameplayRotationShapesMax = [...session.rotationShapesByPiece.values()] + .reduce((max, set) => Math.max(max, set.size), 0); + const maxShapesSeen = Math.max(session.distinctRotationShapes, gameplayRotationShapesMax); if (!phaseState.gameStarted) { results.push(skipResult("rotate", "game did not start")); - } else if (session.rotationsObserved > 0) { - results.push({ name: "rotate", pass: true, detail: `piece shape changed after rotate key (grid-verified, ${session.rotationsObserved} rotation(s))` }); + } else if (maxShapesSeen >= 3) { + results.push({ + name: "rotate", + pass: true, + detail: `piece cycled through ${maxShapesSeen} distinct shapes after 4 rotate presses (grid-verified)`, + }); } else if (!gridReliable) { results.push({ name: "rotate", pass: false, detail: "grid reader unreliable, cannot verify rotation" }); + } else if (maxShapesSeen === 2) { + results.push({ + name: "rotate", + pass: false, + detail: "piece only reached 1 rotation state then stalled (expected at least 2 distinct non-baseline shapes)", + }); } else { - results.push({ name: "rotate", pass: false, detail: "no shape change detected after rotate key" }); + results.push({ + name: "rotate", + pass: false, + detail: `no shape change detected after rotate key (${maxShapesSeen} distinct shape(s))`, + }); } // 8. hard_drop @@ -1687,16 +1825,70 @@ function deriveTestResults( } // 9. all_pieces_rotate + // + // During gameplay we probe each new piece type by pressing rotate 4 + // times and recording distinct normalized shapes. A correctly working + // game cycles J/L/T through 4 rotation states (we expect to observe 3+ + // distinct shapes even accounting for timing jitter). + // + // Pass rule: at least 2 multi-state piece types (J/L/T) reached 3+ + // distinct shapes (baseline + at least 2 other states). This excludes + // broken games that rotate exactly once before getting stuck (2 shapes + // total). 
We require multi-state pieces because I/S/Z in classic-style + // games only have 2 rotation states, so we can't distinguish "broken + // rotation stuck at state 1" from "working 2-state S/Z". + // + // Skip if fewer than 2 J/L/T types were ever seen in the gameplay + // phase -- not enough data to make the claim. if (!phaseState.gameStarted) { results.push(skipResult("all_pieces_rotate", "game did not start")); } else { - const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown"); - if (session.rotationsObserved > 0 && nonOPieceTypes.length > 0) { - results.push({ name: "all_pieces_rotate", pass: true, detail: `rotation observed, piece types seen: [${[...session.pieceTypes].join(", ")}]` }); - } else if (session.rotationsObserved > 0) { - results.push({ name: "all_pieces_rotate", pass: true, detail: "rotation confirmed but could not identify individual piece types" }); + // Union of piece types visible via session.pieceTypes AND the tracking + // map keys, since the tracking map is populated in gameplay phases that + // don't add to pieceTypes. 
+ const allSeenTypes = new Set<string>([ + ...session.pieceTypes, + ...session.rotationShapesByPiece.keys(), + ]); + const multiStateTypesSeen = [...allSeenTypes].filter((t) => + ["J", "L", "T"].includes(t) + ); + const trackedTypes = [...session.rotationShapesByPiece.entries()]; + const multiStateRotated = trackedTypes.filter( + ([t, shapes]) => ["J", "L", "T"].includes(t) && shapes.size >= 3 + ); + if (multiStateTypesSeen.length < 2) { + results.push( + skipResult( + "all_pieces_rotate", + `not enough piece types to verify (saw ${multiStateTypesSeen.length} of J/L/T, need 2)` + ) + ); + } else if (multiStateRotated.length >= 2) { + const detail = multiStateRotated + .map(([t, s]) => `${t}:${s.size}`) + .join(" "); + results.push({ + name: "all_pieces_rotate", + pass: true, + detail: `${multiStateRotated.length} J/L/T piece type(s) rotated to 3+ distinct shapes [${detail}]`, + }); + } else if (!gridReliable) { + results.push({ + name: "all_pieces_rotate", + pass: false, + detail: "grid reader unreliable, cannot verify per-piece rotation", + }); } else { - results.push({ name: "all_pieces_rotate", pass: false, detail: "could not detect any piece rotations via grid reader" }); + const detail = trackedTypes + .filter(([t]) => ["J", "L", "T"].includes(t)) + .map(([t, s]) => `${t}:${s.size}`) + .join(" "); + results.push({ + name: "all_pieces_rotate", + pass: false, + detail: `only ${multiStateRotated.length} of ${multiStateTypesSeen.length} J/L/T rotated to 3+ distinct shapes (need 2) [${detail}]`, + }); } } diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts @@ -329,6 +329,19 @@ export interface GameSession { frames: number; events: GridEvent[]; skippedPhases: string[]; + /** + * Maximum number of distinct normalized shapes observed while pressing + * rotate 4 times on a single piece in Phase 3 (basic mechanics). 
Used by + * the rotate test to verify the game actually cycles through rotation + * states rather than allowing only one rotation. + */ + distinctRotationShapes: number; + /** + * Per-piece-type set of distinct normalized shapes observed during the + * rotation probe in gameplay phases. Used by all_pieces_rotate to verify + * that multiple piece types can each rotate through 2+ distinct shapes. + */ + rotationShapesByPiece: Map<string, Set<string>>; } /** An event observed during continuous grid scanning. */

Impressum · Datenschutz