loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 53625f81965fd7797eaf3cabb4dd67b44a4a8fd5
parent fe319f04a3475f06a33c8d380059948714ea6455
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:06:26 +0200

Add per-piece-type rotation test

Test 7b (all_pieces_rotate) plays up to 40 pieces, identifies each
piece type by its bounding box (I=4x1, O=2x2, others=3x2/2x3),
skips O-piece, and verifies rotation changes the shape dimensions.
Reports which piece types rotated and which failed.

Currently detects "other" (T/S/Z/J/L) pieces reliably. I-piece
detection needs tuning - it may not appear in the top 6 rows
where we scan.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/tests.ts | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 171 insertions(+), 0 deletions(-)

// ---------------------------------------------------------------------------
// Change to tasks/tetris/eval/gameplay-bot/tests.ts
//
// The definitions that surround this change (runAllTests, testRotate,
// testHardDrop) are only partially visible in the diff, so their hunks are
// reproduced verbatim below as comments. The three complete definitions the
// change adds follow as regular code.
//
//   diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
//   @@ -150,6 +150,18 @@ export async function runAllTests(
//        });
//      }
//
//   +  // ---- Test 7b: All pieces rotate (except O) ----
//   +  try {
//   +    const result = await testAllPiecesRotate(page, cal, gameplay);
//   +    testResults.push(result);
//   +  } catch (err) {
//   +    testResults.push({
//   +      name: "all_pieces_rotate",
//   +      pass: false,
//   +      detail: `exception: ${err instanceof Error ? err.message : String(err)}`,
//   +    });
//   +  }
//   +
//      // ---- Test 8: Hard drop ----
//      try {
//        const result = await testHardDrop(page, cal);
//   @@ -374,6 +386,165 @@ async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResul
//      return { name: "rotate", pass: false, detail: "no change detected after rotate key" };
//    }
//
// ---------------------------------------------------------------------------

/**
 * Estimate the active piece's bounding box from the top rows of the grid.
 *
 * Every filled cell in the first `scanRows` rows is treated as part of the
 * active piece; no comparison against a settled-board snapshot is made, so the
 * result is only meaningful while those rows hold nothing but the falling
 * piece.
 *
 * @param grid Row-major occupancy grid (`grid[row][col]`), or null when the
 *   board could not be read.
 * @param scanRows How many rows from the top to scan. Defaults to 6.
 * @returns Bounding-box width/height and the number of filled cells, or null
 *   when the grid is null or the scan did not find exactly 3 or 4 filled cells.
 */
function detectPieceShape(
  grid: boolean[][] | null,
  scanRows = 6,
): { w: number; h: number; cells: number } | null {
  if (!grid) return null;

  const rowLimit = Math.min(Math.max(0, scanRows), grid.length);

  let cells = 0;
  let minRow = Infinity;
  let maxRow = -Infinity;
  let minCol = Infinity;
  let maxCol = -Infinity;

  for (let row = 0; row < rowLimit; row++) {
    const rowCells = grid[row];
    if (!rowCells) continue; // tolerate sparse or ragged grids

    for (let col = 0; col < rowCells.length; col++) {
      if (!rowCells[col]) continue;
      cells++;
      if (row < minRow) minRow = row;
      if (row > maxRow) maxRow = row;
      if (col < minCol) minCol = col;
      if (col > maxCol) maxCol = col;
    }
  }

  // 3 cells is accepted as a partial read; more than 4 cannot be one tetromino.
  if (cells < 3 || cells > 4) return null;

  return {
    w: maxCol - minCol + 1,
    h: maxRow - minRow + 1,
    cells,
  };
}

/**
 * Classify a detected shape by its bounding box.
 *
 * @param shape Bounding box and cell count as returned by detectPieceShape.
 * @returns "I" for 4x1 / 1x4, "O" for 2x2, "other" for 3x2 / 2x3 (T, S, Z, J
 *   and L cannot be told apart by bounding box alone), and "unknown" for
 *   anything that is not 4 cells or whose box no tetromino can produce.
 */
function classifyPiece(shape: { w: number; h: number; cells: number }): string {
  if (shape.cells !== 4) return "unknown";

  const { w, h } = shape;
  if ((w === 4 && h === 1) || (w === 1 && h === 4)) return "I";
  if (w === 2 && h === 2) return "O";
  if ((w === 3 && h === 2) || (w === 2 && h === 3)) return "other";

  // Four cells in any other box is a misread, not a piece.
  return "unknown";
}

/**
 * Test 7b: play up to 40 pieces and check that each detectable piece class
 * (except O) changes its bounding box when the rotate key is pressed.
 *
 * Reloads the page, starts a game using the calibrated start mechanism, then
 * for each piece: reads the grid, classifies the piece, presses rotate and
 * compares bounding boxes. Every piece is dropped afterwards and counted in
 * `gameplay.pieces_placed`.
 *
 * @param page Browser page hosting the game.
 * @param cal Calibration result supplying the start mechanism and key bindings.
 * @param gameplay Running statistics; `pieces_placed` is incremented per drop.
 * @returns A result named "all_pieces_rotate". It fails if any tested class
 *   never rotated, or if no rotation could be observed at all.
 */
async function testAllPiecesRotate(
  page: Page,
  cal: CalibrationResult,
  gameplay: GameplayStats,
): Promise<TestResult> {
  const maxAttempts = 40; // Play up to 40 pieces to find all types

  /** Press the calibrated drop key, count the piece, and wait before the next read. */
  const dropPiece = async (): Promise<void> => {
    await page.keyboard.press(cal.controls.drop);
    gameplay.pieces_placed++;
    await page.waitForTimeout(500);
  };

  // Reload to get a fresh game
  await page.reload();
  await page.waitForTimeout(1000);

  // Start the game
  if (cal.start_mechanism === "button") {
    const btn = page.locator("button").filter({ hasText: /start|play|begin|new/i }).first();
    if (await btn.count() > 0) await btn.click();
  } else if (cal.start_mechanism === "space") {
    await page.keyboard.press("Space");
  } else if (cal.start_mechanism === "enter") {
    await page.keyboard.press("Enter");
  } else if (cal.start_mechanism === "click") {
    await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true });
  }
  await page.waitForTimeout(1000);

  const rotatedPieces = new Set<string>();
  const failedPieces = new Set<string>();
  let attemptsMade = 0;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    attemptsMade = attempt + 1;
    await page.waitForTimeout(300);

    const gridBefore = await readGrid(page, cal);
    const shapeBefore = detectPieceShape(gridBefore);

    if (shapeBefore === null) {
      // Can't read the piece, drop it and try the next one
      await dropPiece();
      continue;
    }

    const pieceType = classifyPiece(shapeBefore);

    // Skip pieces that cannot tell us anything new:
    //  - "O": 2x2 stays 2x2 under rotation
    //  - "unknown": partial or misread shape, not a real piece class
    //  - classes that already rotated successfully
    if (pieceType === "O" || pieceType === "unknown" || rotatedPieces.has(pieceType)) {
      await dropPiece();
      continue;
    }

    // Try to rotate
    await page.keyboard.press(cal.controls.rotate);
    await page.waitForTimeout(300);

    const gridAfter = await readGrid(page, cal);
    const shapeAfter = detectPieceShape(gridAfter);

    if (shapeAfter) {
      const changed = shapeBefore.w !== shapeAfter.w || shapeBefore.h !== shapeAfter.h;
      if (changed) {
        rotatedPieces.add(pieceType);
      } else {
        failedPieces.add(pieceType);
      }
    } else {
      // Couldn't read the grid after rotating: fall back to comparing
      // screenshots taken either side of a second rotate press.
      // NOTE(review): both shots are taken after the first rotation, and
      // anything else that redraws within the 200ms window (e.g. the piece
      // falling) also makes them differ — confirm this fallback is strict enough.
      const shotBeforeSecondRotate = await page.screenshot();
      await page.keyboard.press(cal.controls.rotate);
      await page.waitForTimeout(200);
      const shotAfterSecondRotate = await page.screenshot();
      if (!Buffer.from(shotBeforeSecondRotate).equals(Buffer.from(shotAfterSecondRotate))) {
        rotatedPieces.add(pieceType);
      } else {
        failedPieces.add(pieceType);
      }
    }

    // Drop the piece and move on
    await dropPiece();

    // Check if game is over
    const currentGrid = await readGrid(page, cal);
    if (currentGrid && hasFilledInTopRows(currentGrid, 2)) {
      break;
    }
  }

  // A class that failed once but rotated on a later piece counts as rotated.
  for (const p of rotatedPieces) {
    failedPieces.delete(p);
  }

  // The two sets are disjoint after the cleanup above.
  const testedTypeCount = rotatedPieces.size + failedPieces.size;
  const detail = `rotated: [${[...rotatedPieces].join(", ")}] failed: [${[...failedPieces].join(", ")}] (tested ${testedTypeCount} piece types in ${attemptsMade} attempts)`;

  if (failedPieces.size > 0) {
    return { name: "all_pieces_rotate", pass: false, detail };
  }
  if (rotatedPieces.size === 0) {
    return { name: "all_pieces_rotate", pass: false, detail: "could not detect any piece rotations" };
  }
  return { name: "all_pieces_rotate", pass: true, detail };
}

// ---------------------------------------------------------------------------
// Trailing diff context (unchanged, truncated in the original view):
//
//    async function testHardDrop(page: Page, cal: CalibrationResult): Promise<TestResult> {
//      const gridBefore = await readGrid(page, cal);
//      const shotBefore = await page.screenshot();
// ---------------------------------------------------------------------------

Impressum · Datenschutz