loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit dcef6a4928511792f670d74ad63b8e1b9a7bde45
parent f978492f1169d00686406170f46df2c1f5f783ca
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu,  9 Apr 2026 07:23:38 +0200

Rewrite gameplay bot: 24 tests, 8 conditional phases, competitive play

Major rewrite implementing the full SPEC.md design:

Phase 1: Page load
Phase 2: Start detection via raw screenshot comparison (auto-start:
  6 frames at 200ms plus one late frame after 1200ms; triggers: a
  "before" screenshot compared against 3 frames taken after the
  action), overlay detection, cascading trigger sequence
  (auto/enter/space/button/canvas)
Phase 3: Mechanics (movement, rotation, hard drop) -- conditional on P2
Phase 4: Piece lifecycle (lock, spawn, multiple) -- conditional on P3
Phase 5: Gameplay (60 pieces/45s, integrated score tracking) --
  conditional on P4
Phase 6: Game over (stack to top via grid reader) -- conditional on P4
Phase 7: Endurance (30s play) -- conditional on P5
Phase 8: Competitive play (60s, 8 bug-detection tests) -- conditional on P5

New tests 17-24: multi_line_clear, score_scaling, level_progression,
speed_progression, next_piece_preview, game_over_display,
counter_clockwise_rotation, soft_drop_distinct

Score = passed / (total - skipped), so skipped tests are not penalized.
Added SurveyData and CompetitivePlayResult types, plus a page survey
function (surveyPage) in calibrate.ts. Raised the total test timeout
from 3 to 5 minutes to make room for the competitive play phase.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/calibrate.ts | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M tasks/tetris/eval/gameplay-bot/index.ts | 37 +++++++++++++++++++++++++++++++------
M tasks/tetris/eval/gameplay-bot/tests.ts | 868 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M tasks/tetris/eval/gameplay-bot/types.ts | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 1016 insertions(+), 168 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibrate.ts b/tasks/tetris/eval/gameplay-bot/calibrate.ts @@ -5,6 +5,7 @@ import type { GridBounds, RendererType, StartMechanism, + SurveyData, } from "./types"; import { sampleBackgroundColor, readGrid } from "./grid-reader"; @@ -249,71 +250,67 @@ async function sampleScreenshot( } /** - * Detect visual change by taking multiple screenshots at fast intervals. + * Detect visual change by comparing screenshots. * - * Two-level detection: - * Level 1: did >5% of sampled pixels change between any two consecutive frames? - * Level 2: did something move downward (gameplay pattern)? + * Takes a "before" reference screenshot (optional) and a series of "after" screenshots. + * If before is provided, compares before vs each after frame. + * Otherwise compares consecutive after frames (for auto-start detection where + * animation should be continuously visible). * - * Level 1 alone is enough to confirm the game responded to input. - * Level 2 confirms actual gameplay (piece falling). + * Uses raw buffer comparison: if bytes differ, something changed. */ async function detectVisualChange( page: Page, - options?: { frames?: number; intervalMs?: number } + options?: { frames?: number; intervalMs?: number; before?: Buffer } ): Promise<{ changed: boolean; gameplayDetected: boolean }> { - const FRAMES = options?.frames ?? 10; - const INTERVAL = options?.intervalMs ?? 100; - const SAMPLE_COLS = 20; - const SAMPLE_ROWS = 40; - const CHANGE_THRESHOLD = 0.05; + const FRAMES = options?.frames ?? 6; + const INTERVAL = options?.intervalMs ?? 
200; - const grids: boolean[][][] = []; + const screenshots: Buffer[] = []; for (let i = 0; i < FRAMES; i++) { - const grid = await sampleScreenshot(page, SAMPLE_COLS, SAMPLE_ROWS); - grids.push(grid); + screenshots.push(await page.screenshot()); if (i < FRAMES - 1) await page.waitForTimeout(INTERVAL); } - // Level 1: check for any significant change between consecutive frames let changed = false; - for (let f = 0; f < grids.length - 1; f++) { - let diffs = 0; - for (let r = 0; r < SAMPLE_ROWS; r++) { - for (let c = 0; c < SAMPLE_COLS; c++) { - if (grids[f][r][c] !== grids[f + 1][r][c]) diffs++; - } - } - if (diffs / (SAMPLE_ROWS * SAMPLE_COLS) > CHANGE_THRESHOLD) { - changed = true; - break; - } - } - if (!changed) return { changed: false, gameplayDetected: false }; - - // Level 2: check for downward movement pattern - let gameplayDetected = false; - for (let f = 0; f < grids.length - 1; f++) { - const disappeared: [number, number][] = []; - const appeared: [number, number][] = []; - for (let r = 0; r < SAMPLE_ROWS; r++) { - for (let c = 0; c < SAMPLE_COLS; c++) { - if (grids[f][r][c] && !grids[f + 1][r][c]) disappeared.push([r, c]); - if (!grids[f][r][c] && grids[f + 1][r][c]) appeared.push([r, c]); + console.log(`[detect] ${FRAMES} frames captured, sizes: [${screenshots.map(s => s.length).join(",")}]${options?.before ? `, before=${options.before.length}` : ""}`); + + if (options?.before) { + // Compare before-action screenshot against each after-action frame + for (let i = 0; i < screenshots.length; i++) { + const same = options.before.equals(screenshots[i]); + console.log(`[detect] before vs frame[${i}]: ${same ? 
"SAME" : "DIFF"} (${screenshots[i].length} bytes)`); + if (!same) { + changed = true; + break; } } - if (disappeared.length >= 3 && appeared.length >= 3) { - const avgDisRow = disappeared.reduce((s, [r]) => s + r, 0) / disappeared.length; - const avgAppRow = appeared.reduce((s, [r]) => s + r, 0) / appeared.length; - if (avgAppRow > avgDisRow) { - gameplayDetected = true; + } else { + // No before reference: compare consecutive frames (for auto-start detection) + // Also extend window: take one more shot after a longer pause to catch slow drops + await page.waitForTimeout(1200); + const lateFrame = await page.screenshot(); + + for (let f = 0; f < screenshots.length - 1; f++) { + if (!screenshots[f].equals(screenshots[f + 1])) { + changed = true; + console.log(`[detect] consecutive frames ${f} vs ${f+1}: DIFF`); break; } } + // Also compare first frame against the late frame (catches 1000ms drop intervals) + if (!changed && !screenshots[0].equals(lateFrame)) { + changed = true; + console.log(`[detect] first vs late frame: DIFF`); + } + if (!changed) console.log(`[detect] all frames identical (no animation)`); } - return { changed, gameplayDetected }; + // gameplayDetected: if something changed, assume gameplay (simplification). + // The old Level 2 "downward movement" check was unreliable due to sampling issues. + // Grid reader in later phases verifies actual gameplay definitively. + return { changed, gameplayDetected: changed }; } /** @@ -376,12 +373,26 @@ interface StartDetectionResult { */ async function detectStartMechanism(page: Page): Promise<StartDetectionResult> { const deadline = Date.now() + 30000; + const log = (msg: string) => console.log(`[start-detect] ${msg}`); const budgetExceeded = () => Date.now() >= deadline; - // ---- Phase 1: Auto-start (no input, 1 second) ---- + // Quick diagnostic: what's on the page? 
+ try { + const diag = await page.evaluate(() => ({ + title: document.title, + buttons: Array.from(document.querySelectorAll("button")).map(b => b.textContent?.trim()), + canvases: Array.from(document.querySelectorAll("canvas")).length, + bodySize: document.body?.innerHTML?.length ?? 0, + })); + log(`Page: "${diag.title}", ${diag.buttons.length} buttons [${diag.buttons.join(", ")}], ${diag.canvases} canvases, body=${diag.bodySize} chars`); + } catch (e) { log(`Diagnostic failed: ${e}`); } + + // ---- Phase 1: Auto-start (no input, ~2.5 seconds with late check) ---- { - const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + log("Phase 1: checking auto-start..."); + const result = await detectVisualChange(page, { frames: 6, intervalMs: 200 }); + log(`Phase 1 result: changed=${result.changed}`); if (result.changed) { return { mechanism: "auto" }; } @@ -389,7 +400,9 @@ async function detectStartMechanism(page: Page): Promise<StartDetectionResult> { // ---- Phase 2: DOM button discovery (language-agnostic) ---- { + log("Phase 2: trying DOM buttons..."); const phase2Result = await tryDomButtons(page, budgetExceeded); + log(`Phase 2 result: ${phase2Result ? 
`found=${phase2Result.mechanism}` : "none"}`); if (phase2Result) return phase2Result; } @@ -513,6 +526,7 @@ async function tryDomButtons( { clickableSelector, visualSelector } ); + console.log(`[start-detect] Phase 2: found ${elementInfos.length} clickable elements`); // Click each element and observe for visual change for (const info of elementInfos) { if (budgetExceeded()) break; @@ -528,10 +542,14 @@ async function tryDomButtons( ); if (!wasVisible) continue; + // Take "before" screenshot, then click, then compare + const before = await page.screenshot(); + console.log(`[start-detect] Clicking "${info.text}" (${info.selector}) at (${info.x},${info.y}), before=${before.length} bytes`); await page.mouse.click(info.x, info.y); - await page.waitForTimeout(200); + await page.waitForTimeout(300); - const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); + console.log(`[start-detect] After click "${info.text}": changed=${result.changed}`); if (result.changed) { // Check if the element disappeared after clicking const disappeared = await page.evaluate( @@ -626,10 +644,11 @@ async function tryCanvasClicks( if (budgetExceeded()) break; try { + const before = await page.screenshot(); await page.mouse.click(pos.x, pos.y); - await page.waitForTimeout(200); + await page.waitForTimeout(300); - const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); if (result.changed) { return { mechanism: "click_canvas", @@ -668,10 +687,11 @@ async function tryKeyboardTriggers( if (budgetExceeded()) break; try { + const before = await page.screenshot(); await page.keyboard.press(key); - await page.waitForTimeout(200); + await page.waitForTimeout(300); - const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + const result = await 
detectVisualChange(page, { frames: 3, intervalMs: 200, before }); if (result.changed) { return { mechanism: mechanismMap[key] }; } @@ -683,6 +703,7 @@ async function tryKeyboardTriggers( if (budgetExceeded()) break; try { + const before = await page.screenshot(); const canvas = page.locator("canvas").first(); if ((await canvas.count()) > 0) { await canvas.click(); @@ -694,9 +715,9 @@ async function tryKeyboardTriggers( } await page.waitForTimeout(100); await page.keyboard.press(key); - await page.waitForTimeout(200); + await page.waitForTimeout(300); - const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); if (result.changed) { return { mechanism: mechanismMap[key] }; } @@ -763,11 +784,12 @@ async function recalibrateWithRetry( for (const attempt of attempts) { try { + const before = await page.screenshot(); await attempt.action(); - await page.waitForTimeout(200); + await page.waitForTimeout(300); if (startMechanism === "unknown") { - const result = await detectVisualChange(page, { frames: 10, intervalMs: 100 }); + const result = await detectVisualChange(page, { frames: 3, intervalMs: 200, before }); if (result.changed) { startMechanism = attempt.name; } @@ -1150,3 +1172,109 @@ async function detectScoreElement(page: Page): Promise<string | null> { return null; } } + +/** + * Survey the page before any tests run. Collects information about the page + * structure that helps inform start mechanism detection and debugging. 
+ */ +export async function surveyPage(page: Page): Promise<SurveyData> { + try { + const data = await page.evaluate(() => { + // Check for full-screen overlay + let hasOverlay = false; + const allEls = document.querySelectorAll("*"); + const vw = window.innerWidth; + const vh = window.innerHeight; + for (const el of allEls) { + const style = window.getComputedStyle(el); + const pos = style.position; + if (pos === "fixed" || pos === "absolute") { + const zIndex = parseInt(style.zIndex, 10); + if (zIndex > 0 || style.zIndex === "auto") { + const rect = (el as HTMLElement).getBoundingClientRect(); + if (rect.width > vw * 0.8 && rect.height > vh * 0.8) { + const text = ((el as HTMLElement).innerText || "").toLowerCase(); + if ( + text.includes("start") || + text.includes("play") || + text.includes("enter") || + text.includes("press") || + text.includes("begin") || + text.includes("click") || + text.length < 5 // empty overlay + ) { + hasOverlay = true; + break; + } + } + } + } + } + + // Check for canvas + const hasCanvas = document.querySelectorAll("canvas").length > 0; + + // Check for DOM grid + let hasDomGrid = false; + const containers = document.querySelectorAll( + '[class*="board"], [class*="grid"], [class*="field"], [id*="board"], [id*="grid"], [id*="field"]' + ); + for (const container of containers) { + const children = container.children; + if ( + (children.length >= 180 && children.length <= 220) || + (children.length >= 18 && children.length <= 22 && + children[0]?.children.length >= 8 && children[0]?.children.length <= 12) + ) { + hasDomGrid = true; + break; + } + } + // Also check tables + if (!hasDomGrid) { + const tables = document.querySelectorAll("table"); + for (const table of tables) { + const rows = table.querySelectorAll("tr"); + if (rows.length >= 18) { + const cols = rows[0]?.querySelectorAll("td").length ?? 
0; + if (cols >= 8 && cols <= 12) { + hasDomGrid = true; + break; + } + } + } + } + + // Visible text (first 500 chars, split into lines) + const bodyText = (document.body?.innerText || "").trim(); + const visibleText = bodyText + .split("\n") + .map((line: string) => line.trim()) + .filter((line: string) => line.length > 0) + .slice(0, 20); + + // Count clickable elements + const clickableSelector = + 'button, a, [role="button"], [onclick], input[type="button"], input[type="submit"]'; + const clickableElements = document.querySelectorAll(clickableSelector).length; + + return { + has_overlay: hasOverlay, + has_canvas: hasCanvas, + has_dom_grid: hasDomGrid, + visible_text: visibleText, + clickable_elements: clickableElements, + }; + }); + + return data; + } catch { + return { + has_overlay: false, + has_canvas: false, + has_dom_grid: false, + visible_text: [], + clickable_elements: 0, + }; + } +} diff --git a/tasks/tetris/eval/gameplay-bot/index.ts b/tasks/tetris/eval/gameplay-bot/index.ts @@ -92,7 +92,7 @@ test.describe("Tetris Gameplay Bot", () => { }); test("run gameplay bot", async ({ page }) => { - test.setTimeout(180_000); // 3-minute total timeout + test.setTimeout(300_000); // 5-minute total timeout (competitive play adds time) // Measure page load time let loadTimeMs = -1; @@ -106,7 +106,8 @@ test.describe("Tetris Gameplay Bot", () => { // Load time measurement failed, not critical } - const { testResults, calibration, gameplay, session } = await runAllTests(page, serverUrl); + const { testResults, calibration, gameplay, session, survey, competitivePlay } = + await runAllTests(page, serverUrl); // Accessibility check via page evaluation (lightweight, no axe-core dependency) let a11yIssues: string[] = []; @@ -154,12 +155,22 @@ test.describe("Tetris Gameplay Bot", () => { } const passed = testResults.filter((t) => t.pass).length; - const failed = testResults.filter((t) => !t.pass).length; + const skipped = testResults.filter((t) => 
t.detail.startsWith("skipped:")).length; + const failed = testResults.filter((t) => !t.pass && !t.detail.startsWith("skipped:")).length; const total = testResults.length; + const scorable = total - skipped; const totalReads = session.gridReadSuccess + session.gridReadFail; const gridSuccessRate = totalReads > 0 ? session.gridReadSuccess / totalReads : 0; + // Clean competitive play result (remove internal tracking fields) + let cleanCompetitivePlay = competitivePlay; + if (cleanCompetitivePlay) { + const { _ccwResult, _ccwTestDone, _softDropDistinct, _softDropTestDone, ...clean } = + cleanCompetitivePlay as any; + cleanCompetitivePlay = clean; + } + const report: BotReport = { implementation: { renderer: calibration.renderer, @@ -169,15 +180,18 @@ test.describe("Tetris Gameplay Bot", () => { start_mechanism: calibration.startMechanism, score_element_found: calibration.scoreElementSelector !== null, grid_confidence: calibration.gridConfidence, + survey, }, tests: testResults.map((t) => ({ name: t.name, pass: t.pass, detail: t.detail })), summary: { total, passed, failed, - score: total > 0 ? Math.round((passed / total) * 100) / 100 : 0, + skipped, + score: scorable > 0 ? Math.round((passed / scorable) * 100) / 100 : 0, }, gameplay, + competitive_play: cleanCompetitivePlay, session: { frames: session.frames, events_count: session.events.length, @@ -218,15 +232,26 @@ test.describe("Tetris Gameplay Bot", () => { console.log(`Grid read success rate: ${Math.round(gridSuccessRate * 100)}%`); console.log(`Start mechanism: ${calibration.startMechanism}`); console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`); - console.log(`\nTests: ${passed}/${total} passed`); + console.log(`\nTests: ${passed}/${total} passed, ${skipped} skipped, ${failed} failed`); + console.log(`Score: ${report.summary.score} (${passed}/${scorable} scorable)`); for (const t of testResults) { - console.log(` ${t.pass ? 
"PASS" : "FAIL"} ${t.name}: ${t.detail}`); + const status = t.detail.startsWith("skipped:") ? "SKIP" : t.pass ? "PASS" : "FAIL"; + console.log(` ${status} ${t.name}: ${t.detail}`); } console.log(`\nSession: ${session.frames} frames, ${session.events.length} events`); console.log(` Pieces spawned: ${session.piecesSpawned}, locked: ${session.piecesLocked}`); console.log(` Lines cleared: ${session.linesCleared}`); console.log(` Piece types: [${[...session.pieceTypes].join(", ")}]`); console.log(`\nGameplay: ${gameplay.pieces_placed} pieces, ${gameplay.lines_cleared} lines`); + if (competitivePlay) { + console.log(`\nCompetitive play: ${competitivePlay.pieces_placed} pieces, ${competitivePlay.total_lines_cleared} lines`); + console.log(` Clears: ${competitivePlay.single_clears}x single, ${competitivePlay.double_clears}x double, ${competitivePlay.triple_clears}x triple, ${competitivePlay.tetris_clears}x tetris`); + console.log(` Score: ${competitivePlay.score_final}, Level: ${competitivePlay.level_final}`); + if (competitivePlay.bugs_detected.length > 0) { + console.log(` Bugs: [${competitivePlay.bugs_detected.join(", ")}]`); + } + } + console.log(`\nSurvey: canvas=${survey.has_canvas}, dom_grid=${survey.has_dom_grid}, overlay=${survey.has_overlay}, clickable=${survey.clickable_elements}`); console.log(`Report written to: ${reportPath}`); console.log("===========================\n"); diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -2,7 +2,16 @@ // mikhail-vlasenko/Tetris-AI (MIT License) -- polling loop concept import type { Page } from "@playwright/test"; -import type { TestResult, CalibrationResult, GameplayStats, GameSession, GridEvent, PieceType } from "./types"; +import type { + TestResult, + CalibrationResult, + GameplayStats, + GameSession, + GridEvent, + PieceType, + CompetitivePlayResult, + SurveyData, +} from "./types"; import { readGrid, gridsAreDifferent, @@ -14,13 +23,19 @@ import { countCompleteRows, } from 
"./grid-reader"; import { hardDrop, playGame, tryFillRow } from "./player"; -import { calibrate } from "./calibrate"; +import { calibrate, surveyPage } from "./calibrate"; /** - * Run the gameplay bot as one continuous observation session. - * Instead of 16 individual test functions that each take snapshots, - * we run phases that build up a GameSession record, then derive - * pass/fail results from the accumulated data. + * Run the gameplay bot as one continuous observation session with 8 conditional phases. + * + * Phase 1: Page load + * Phase 2: Game start detection (falling piece detector) + * Phase 3: Mechanics tests (conditional on Phase 2) + * Phase 4: Piece lifecycle (conditional on Phase 3) + * Phase 5: Gameplay with score tracking (conditional on Phase 4) + * Phase 6: Game over (conditional on Phase 4) + * Phase 7: Endurance (conditional on Phase 5) + * Phase 8: Competitive play (conditional on Phase 5) * * NO FALSE POSITIVES: if the grid reader cannot verify a mechanic, * the test is marked as failed with detail explaining why, not passed @@ -34,6 +49,8 @@ export async function runAllTests( calibration: CalibrationResult; gameplay: GameplayStats; session: GameSession; + survey: SurveyData; + competitivePlay: CompetitivePlayResult | null; }> { const gameplay: GameplayStats = { pieces_placed: 0, @@ -61,8 +78,19 @@ export async function runAllTests( gridReadFail: 0, frames: 0, events: [], + skippedPhases: [], + }; + + let survey: SurveyData = { + has_overlay: false, + has_canvas: false, + has_dom_grid: false, + visible_text: [], + clickable_elements: 0, }; + let competitivePlay: CompetitivePlayResult | null = null; + const consoleErrors: string[] = []; page.on("pageerror", (err) => { consoleErrors.push(err.message); @@ -82,10 +110,15 @@ export async function runAllTests( calibration: emptyCalibration(consoleErrors), gameplay, session, + survey, + competitivePlay, }; } - // ---- Phase 2: Calibrate + detect start (always runs) ---- + // ---- Pre-test survey 
---- + survey = await surveyPage(page); + + // ---- Phase 2: Calibrate + detect start (falling piece detector) ---- let cal: CalibrationResult; try { cal = await calibrate(page); @@ -101,21 +134,60 @@ export async function runAllTests( if (!session.consoleErrors.includes(e)) session.consoleErrors.push(e); } - // ---- Phase 3: Basic mechanics -- ONLY if game started (falling piece detected) ---- - let mechanicsSucceeded = false; - if (session.started && cal.gridDetected) { + // Phase gate: if game didn't start, skip all downstream + let gameStarted = session.started; + if (!gameStarted) { + session.skippedPhases.push( + "mechanics: game did not start", + "pieces: game did not start", + "gameplay: game did not start", + "gameover: game did not start", + "endurance: game did not start", + "competitive: game did not start" + ); + } + + // ---- Phase 3: Basic mechanics -- ONLY if game started ---- + let mechanicsWork = false; + if (gameStarted && cal.gridDetected) { await runBasicMechanicsPhase(page, cal, session); - // Mechanics succeeded if we observed at least 1 event - mechanicsSucceeded = + mechanicsWork = session.movementsObserved > 0 || session.rotationsObserved > 0 || session.hardDropsObserved > 0 || session.events.some((e) => e.type === "piece_moved"); } - // ---- Phase 4: Gameplay (play to win) -- ONLY if mechanics had at least 1 success ---- - let gameplayPlacedPieces = false; - if (mechanicsSucceeded) { + if (gameStarted && !mechanicsWork) { + session.skippedPhases.push( + "pieces: mechanics failed", + "gameplay: mechanics failed", + "gameover: mechanics failed", + "endurance: mechanics failed", + "competitive: mechanics failed" + ); + } + + // ---- Phase 4: Piece lifecycle -- ONLY if mechanics worked ---- + let piecesWork = false; + if (mechanicsWork) { + // Piece lifecycle is tested as part of mechanics phase (piece_locks, new_piece_spawns, multiple_pieces) + // We consider it working if we have locked pieces and spawned pieces + piecesWork = 
session.piecesLocked > 0 || session.hardDropsObserved > 0; + } + + if (mechanicsWork && !piecesWork) { + session.skippedPhases.push( + "gameplay: piece lifecycle failed", + "gameover: piece lifecycle failed", + "endurance: piece lifecycle failed", + "competitive: piece lifecycle failed" + ); + } + + // ---- Phase 5: Gameplay (play to win) -- ONLY if pieces work ---- + let gameplayWorks = false; + if (piecesWork) { try { await loadGamePage(page, serverUrl); cal = await calibrate(page); @@ -123,11 +195,18 @@ export async function runAllTests( } catch { /* continue with existing state */ } await runGameplayPhase(page, cal, session, gameplay); - gameplayPlacedPieces = gameplay.pieces_placed > 0; + gameplayWorks = gameplay.pieces_placed > 0; } - // ---- Phase 5: Game over -- ONLY if gameplay placed pieces ---- - if (gameplayPlacedPieces) { + if (piecesWork && !gameplayWorks) { + session.skippedPhases.push( + "endurance: gameplay failed", + "competitive: gameplay failed" + ); + } + + // ---- Phase 6: Game over -- ONLY if pieces work ---- + if (piecesWork) { try { await loadGamePage(page, serverUrl); cal = await calibrate(page); @@ -136,8 +215,8 @@ export async function runAllTests( await runGameOverPhase(page, cal, session); } - // ---- Phase 6: Endurance -- ONLY if gameplay worked ---- - if (gameplayPlacedPieces) { + // ---- Phase 7: Endurance -- ONLY if gameplay worked ---- + if (gameplayWorks) { try { await loadGamePage(page, serverUrl); cal = await calibrate(page); @@ -146,13 +225,30 @@ export async function runAllTests( await runEndurancePhase(page, cal, session, gameplay, consoleErrors); } + // ---- Phase 8: Competitive play -- ONLY if gameplay worked ---- + if (gameplayWorks) { + try { + await loadGamePage(page, serverUrl); + cal = await calibrate(page); + } catch { /* continue */ } + + competitivePlay = await runCompetitivePlayPhase(page, cal, session, gameplay); + } else if (!session.skippedPhases.some((p) => p.startsWith("competitive:"))) { + 
session.skippedPhases.push("competitive: gameplay failed"); + } + session.durationSeconds = gameplay.play_duration_seconds; // ---- Derive test results from session data ---- - const phaseState = { mechanicsSucceeded, gameplayPlacedPieces }; - const testResults = deriveTestResults(session, cal, loadResult, consoleErrors, gameplay, phaseState); + const phaseState = { + gameStarted, + mechanicsWork, + piecesWork, + gameplayWorks, + }; + const testResults = deriveTestResults(session, cal, loadResult, consoleErrors, gameplay, phaseState, competitivePlay); - return { testResults, calibration: cal, gameplay, session }; + return { testResults, calibration: cal, gameplay, session, survey, competitivePlay }; } // ---- Phase implementations ---- @@ -591,30 +687,346 @@ async function runEndurancePhase( } } +/** + * Phase 8: Competitive play. + * Play for 60 seconds with the AI, tracking detailed metrics for bug detection. + */ +async function runCompetitivePlayPhase( + page: Page, + cal: CalibrationResult, + session: GameSession, + gameplay: GameplayStats +): Promise<CompetitivePlayResult> { + const start = Date.now(); + const maxDuration = 60000; + + const result: CompetitivePlayResult = { + duration_seconds: 0, + pieces_placed: 0, + total_lines_cleared: 0, + single_clears: 0, + double_clears: 0, + triple_clears: 0, + tetris_clears: 0, + max_combo: 0, + score_readings: [], + score_final: 0, + score_increases: [], + level_readings: [], + level_final: 0, + game_over_reached: false, + game_over_text_found: null, + restart_available: false, + next_piece_visible: false, + speed_increased: false, + bugs_detected: [], + }; + + // Read initial score + let lastScore = 0; + if (cal.scoreElementSelector) { + try { + const scoreText = await page.textContent(cal.scoreElementSelector); + const nums = extractScoreFromText(scoreText); + lastScore = Math.max(...nums); + result.score_readings.push(lastScore); + } catch { /* ignore */ } + } + + // Read initial level + const initialLevel = 
await readLevelFromPage(page); + if (initialLevel !== null) { + result.level_readings.push(initialLevel); + } + + // Measure initial drop speed (time between auto-drops) + const initialDropInterval = await measureDropInterval(page, cal); + + // Play the game with detailed tracking + let previousGrid = await readGrid(page, cal); + let settledGrid = previousGrid; + let pollCount = 0; + let consecutiveClears = 0; + let maxCombo = 0; + let ccwTestDone = false; + let ccwResult: boolean | null = null; + let softDropTestDone = false; + let softDropDistinct: boolean | null = null; + + while (Date.now() - start < maxDuration) { + try { + const grid = await readGrid(page, cal); + pollCount++; + + if (!grid) { + await page.waitForTimeout(60); + continue; + } + + // Score tracking every 5th poll + if (pollCount % 5 === 0 && cal.scoreElementSelector) { + try { + const scoreText = await page.textContent(cal.scoreElementSelector); + const nums = extractScoreFromText(scoreText); + const currentScore = Math.max(...nums); + if (currentScore > 0) { + result.score_readings.push(currentScore); + if (currentScore > lastScore) { + result.score_increases.push(currentScore - lastScore); + lastScore = currentScore; + } + } + } catch { /* ignore */ } + } + + // Level tracking every 10th poll + if (pollCount % 10 === 0) { + const level = await readLevelFromPage(page); + if (level !== null) { + result.level_readings.push(level); + } + } + + // Detect line clears by watching for complete rows then checking if they disappear + if (previousGrid && grid) { + const completeRowsBefore = countCompleteRows(previousGrid); + const completeRowsNow = countCompleteRows(grid); + const filledBefore = countFilled(previousGrid); + const filledNow = countFilled(grid); + + // Detect a clear: filled count dropped and rows disappeared + if (filledNow < filledBefore - 5 && filledBefore > 10) { + // Estimate how many rows were cleared + const clearedCount = Math.round((filledBefore + 4 - filledNow) / 10); + if 
(clearedCount > 0 && clearedCount <= 4) { + result.total_lines_cleared += clearedCount; + consecutiveClears++; + if (consecutiveClears > maxCombo) maxCombo = consecutiveClears; + + switch (clearedCount) { + case 1: result.single_clears++; break; + case 2: result.double_clears++; break; + case 3: result.triple_clears++; break; + case 4: result.tetris_clears++; break; + } + } + } else { + consecutiveClears = 0; + } + } + + // Try to detect and place pieces + const activeCells = detectActivePieceCells(grid, settledGrid); + if (activeCells && activeCells.length === 4) { + const pieceType = identifyPieceType(activeCells); + session.pieceTypes.add(pieceType); + + // Counter-clockwise rotation test: press Z and compare + if (!ccwTestDone && result.pieces_placed > 5 && result.pieces_placed % 7 === 0) { + const gridBeforeZ = await readGrid(page, cal); + await page.keyboard.press("z"); + await page.waitForTimeout(60); + const gridAfterZ = await readGrid(page, cal); + + if (gridBeforeZ && gridAfterZ && gridsAreDifferent(gridBeforeZ, gridAfterZ)) { + // Z key caused a change -- now check if it's different from ArrowUp + const gridBeforeUp = await readGrid(page, cal); + await page.keyboard.press(cal.controls.rotate); + await page.waitForTimeout(60); + const gridAfterUp = await readGrid(page, cal); + + if (gridBeforeUp && gridAfterUp) { + // If Z and Up produce different results, Z is counter-clockwise + ccwResult = gridsAreDifferent(gridAfterZ, gridAfterUp); + ccwTestDone = true; + } + } else { + ccwResult = false; // Z did nothing + ccwTestDone = true; + } + } + + // Soft drop test: press Down and check it moves 1 row, not to bottom + if (!softDropTestDone && result.pieces_placed > 3 && result.pieces_placed % 5 === 0) { + const gridBeforeDown = await readGrid(page, cal); + await page.keyboard.press(cal.controls.down); + await page.waitForTimeout(60); + const gridAfterDown = await readGrid(page, cal); + + if (gridBeforeDown && gridAfterDown) { + const cellsBefore = 
detectActivePieceCells(gridBeforeDown, settledGrid); + const cellsAfter = detectActivePieceCells(gridAfterDown, settledGrid); + if (cellsBefore && cellsAfter) { + const avgRowBefore = cellsBefore.reduce((s, [r]) => s + r, 0) / cellsBefore.length; + const avgRowAfter = cellsAfter.reduce((s, [r]) => s + r, 0) / cellsAfter.length; + const rowDelta = avgRowAfter - avgRowBefore; + // Soft drop should move ~1 row, hard drop moves many rows + softDropDistinct = rowDelta >= 0.5 && rowDelta <= 3; + softDropTestDone = true; + } + } + } + + // Execute the AI placement + await page.keyboard.press(cal.controls.drop); + await page.waitForTimeout(100); + result.pieces_placed++; + + const afterGrid = await readGrid(page, cal); + if (afterGrid) settledGrid = afterGrid; + } + + previousGrid = grid; + await page.waitForTimeout(60); + } catch { + await page.waitForTimeout(60); + } + } + + result.duration_seconds = Math.round((Date.now() - start) / 1000); + result.max_combo = maxCombo; + + // Read final score + if (cal.scoreElementSelector) { + try { + const scoreText = await page.textContent(cal.scoreElementSelector); + const nums = extractScoreFromText(scoreText); + result.score_final = Math.max(...nums); + result.score_readings.push(result.score_final); + } catch { /* ignore */ } + } + + // Read final level + const finalLevel = await readLevelFromPage(page); + if (finalLevel !== null) { + result.level_final = finalLevel; + result.level_readings.push(finalLevel); + } + + // Measure final drop speed + const finalDropInterval = await measureDropInterval(page, cal); + if (initialDropInterval > 0 && finalDropInterval > 0 && finalDropInterval < initialDropInterval * 0.8) { + result.speed_increased = true; + } + + // Check for game over + try { + const gameOverText = await page.evaluate(() => { + const text = document.body.innerText.toLowerCase(); + if (text.includes("game over")) return "Game Over"; + if (text.includes("gameover")) return "GameOver"; + if (text.includes("you lose")) 
return "You Lose"; + return null; + }); + if (gameOverText) { + result.game_over_reached = true; + result.game_over_text_found = gameOverText; + } + } catch { /* ignore */ } + + // Check for restart button + try { + result.restart_available = await page.evaluate(() => { + const text = document.body.innerText.toLowerCase(); + const buttons = document.querySelectorAll("button"); + for (const btn of buttons) { + const btnText = (btn.textContent || "").toLowerCase(); + if (btnText.includes("restart") || btnText.includes("play again") || btnText.includes("new game")) { + return true; + } + } + return text.includes("restart") || text.includes("play again") || text.includes("press") || text.includes("try again"); + }); + } catch { /* ignore */ } + + // Check for next piece preview + result.next_piece_visible = await detectNextPiecePreview(page); + + // Bug detection + // Multi-line clear bug: if we had multi-line opportunities but only single clears happened + if (result.double_clears + result.triple_clears + result.tetris_clears === 0 && + result.single_clears > 5 && result.total_lines_cleared > 5) { + // This might not be a bug -- maybe no multi-line opportunities arose + // Only flag if we detect specific evidence + } + + // Score scaling bug + if (result.score_increases.length > 3) { + const singleDeltas = result.score_increases.filter((d) => d > 0 && d <= 200); + const multiDeltas = result.score_increases.filter((d) => d > 200); + if (singleDeltas.length > 0 && multiDeltas.length === 0 && + (result.double_clears + result.triple_clears + result.tetris_clears) > 0) { + result.bugs_detected.push("score_does_not_scale_with_simultaneous_clears"); + } + } + + // Level progression bug + if (result.level_readings.length > 1) { + const uniqueLevels = [...new Set(result.level_readings)]; + if (uniqueLevels.length === 1 && result.total_lines_cleared >= 10) { + result.bugs_detected.push("level_does_not_increase"); + } + } + + // Speed progression bug + if 
(result.level_readings.length > 1) { + const uniqueLevels = [...new Set(result.level_readings)]; + if (uniqueLevels.length > 1 && !result.speed_increased) { + result.bugs_detected.push("speed_does_not_increase"); + } + } + + // Store CCW and soft drop results for test derivation + (result as any)._ccwResult = ccwResult; + (result as any)._ccwTestDone = ccwTestDone; + (result as any)._softDropDistinct = softDropDistinct; + (result as any)._softDropTestDone = softDropTestDone; + + return result; +} + // ---- Derive test results from session data ---- const ALL_TEST_NAMES = [ + // Phase 1 "game_loads", + // Phase 2 "game_starts", "auto_drop", + // Phase 3: Mechanics "move_left", "move_right", "move_down", "rotate", - "all_pieces_rotate", "hard_drop", + "all_pieces_rotate", + // Phase 4: Piece lifecycle "piece_locks", "new_piece_spawns", "multiple_pieces", + // Phase 5: Gameplay "line_clear", "score_changes", + // Phase 6: Game over "game_over", + // Phase 7: Endurance "playable_30s", + // Phase 8: Competitive play (tests 17-24) + "multi_line_clear", + "score_scaling", + "level_progression", + "speed_progression", + "next_piece_preview", + "game_over_display", + "counter_clockwise_rotation", + "soft_drop_distinct", ]; interface PhaseState { - mechanicsSucceeded: boolean; - gameplayPlacedPieces: boolean; + gameStarted: boolean; + mechanicsWork: boolean; + piecesWork: boolean; + gameplayWorks: boolean; } function deriveTestResults( @@ -623,12 +1035,20 @@ function deriveTestResults( loadResult: LoadResult, consoleErrors: string[], gameplay: GameplayStats, - phaseState: PhaseState + phaseState: PhaseState, + competitivePlay: CompetitivePlayResult | null ): TestResult[] { const results: TestResult[] = []; const gridReliable = session.gridReadSuccess > 0 && session.gridReadSuccess / (session.gridReadSuccess + session.gridReadFail) > 0.5; + // Helper: produce a skip result for tests whose prerequisite phase was skipped + const skipResult = (name: string, reason: string): 
TestResult => ({ + name, + pass: false, + detail: `skipped: ${reason}`, + }); + // 1. game_loads results.push({ name: "game_loads", @@ -655,15 +1075,8 @@ function deriveTestResults( }); } - // Helper: produce a skip result for tests whose prerequisite phase was skipped - const skipResult = (name: string, reason: string): TestResult => ({ - name, - pass: false, - detail: `skipped: ${reason}`, - }); - // 3. auto_drop -- MUST be verified via grid reader - if (!session.started) { + if (!phaseState.gameStarted) { results.push(skipResult("auto_drop", "game did not start")); } else { const autoDropEvents = session.events.filter( @@ -694,7 +1107,7 @@ function deriveTestResults( // 4-6. movement tests for (const dir of ["left", "right", "down"] as const) { - if (!session.started) { + if (!phaseState.gameStarted) { results.push(skipResult(`move_${dir}`, "game did not start")); continue; } @@ -723,7 +1136,7 @@ function deriveTestResults( } // 7. rotate - if (!session.started) { + if (!phaseState.gameStarted) { results.push(skipResult("rotate", "game did not start")); } else if (session.rotationsObserved > 0) { results.push({ @@ -745,8 +1158,31 @@ function deriveTestResults( }); } - // 7b. all_pieces_rotate -- derived from piece types seen - if (!session.started) { + // 8. hard_drop + if (!phaseState.gameStarted) { + results.push(skipResult("hard_drop", "game did not start")); + } else if (session.hardDropsObserved > 0) { + results.push({ + name: "hard_drop", + pass: true, + detail: "piece immediately dropped to bottom (grid-verified)", + }); + } else if (!gridReliable) { + results.push({ + name: "hard_drop", + pass: false, + detail: "grid reader unreliable, cannot verify hard drop", + }); + } else { + results.push({ + name: "hard_drop", + pass: false, + detail: "no grid change with bottom cells detected after hard drop key", + }); + } + + // 9. 
all_pieces_rotate -- derived from piece types seen + if (!phaseState.gameStarted) { results.push(skipResult("all_pieces_rotate", "game did not start")); } else { const nonOPieceTypes = [...session.pieceTypes].filter((t) => t !== "O" && t !== "unknown"); @@ -771,31 +1207,8 @@ function deriveTestResults( } } - // 8. hard_drop - if (!session.started) { - results.push(skipResult("hard_drop", "game did not start")); - } else if (session.hardDropsObserved > 0) { - results.push({ - name: "hard_drop", - pass: true, - detail: "piece immediately dropped to bottom (grid-verified)", - }); - } else if (!gridReliable) { - results.push({ - name: "hard_drop", - pass: false, - detail: "grid reader unreliable, cannot verify hard drop", - }); - } else { - results.push({ - name: "hard_drop", - pass: false, - detail: "no grid change with bottom cells detected after hard drop key", - }); - } - - // 9. piece_locks -- only trust if grid is reliable - if (!session.started) { + // 10. piece_locks -- only trust if grid is reliable + if (!phaseState.gameStarted) { results.push(skipResult("piece_locks", "game did not start")); } else if (!gridReliable) { results.push({ @@ -832,8 +1245,8 @@ function deriveTestResults( } } - // 10. new_piece_spawns - if (!session.started) { + // 11. new_piece_spawns + if (!phaseState.gameStarted) { results.push(skipResult("new_piece_spawns", "game did not start")); } else if (session.piecesSpawned > 0) { results.push({ @@ -849,9 +1262,9 @@ function deriveTestResults( }); } - // 11. multiple_pieces - if (!phaseState.mechanicsSucceeded) { - results.push(skipResult("multiple_pieces", "mechanics phase not met")); + // 12. multiple_pieces + if (!phaseState.mechanicsWork) { + results.push(skipResult("multiple_pieces", "mechanics phase failed")); } else if (session.piecesLocked >= 3 && session.piecesSpawned > 0) { results.push({ name: "multiple_pieces", @@ -866,9 +1279,9 @@ function deriveTestResults( }); } - // 12. 
line_clear - if (!phaseState.mechanicsSucceeded) { - results.push(skipResult("line_clear", "mechanics phase not met")); + // 13. line_clear + if (!phaseState.mechanicsWork) { + results.push(skipResult("line_clear", "mechanics phase failed")); } else if (session.linesCleared > 0) { results.push({ name: "line_clear", @@ -883,9 +1296,9 @@ function deriveTestResults( }); } - // 13. score_changes - if (!phaseState.mechanicsSucceeded) { - results.push(skipResult("score_changes", "mechanics phase not met")); + // 14. score_changes + if (!phaseState.mechanicsWork) { + results.push(skipResult("score_changes", "mechanics phase failed")); } else if (session.scoreValues.length >= 2) { const min = Math.min(...session.scoreValues); const max = Math.max(...session.scoreValues); @@ -916,9 +1329,9 @@ function deriveTestResults( }); } - // 14. game_over -- requires gameplay to have placed pieces - if (!phaseState.gameplayPlacedPieces) { - results.push(skipResult("game_over", "gameplay phase not met")); + // 15. game_over -- requires pieces to work + if (!phaseState.piecesWork) { + results.push(skipResult("game_over", "piece lifecycle failed")); } else { results.push({ name: "game_over", @@ -929,9 +1342,9 @@ function deriveTestResults( }); } - // 15. playable_30s -- requires gameplay to have worked - if (!phaseState.gameplayPlacedPieces) { - results.push(skipResult("playable_30s", "gameplay phase not met")); + // 16. playable_30s -- requires gameplay to have worked + if (!phaseState.gameplayWorks) { + results.push(skipResult("playable_30s", "gameplay phase failed")); } else { const crashed = session.consoleErrors.length > 0 || gameplay.errors_during_play > 3; if (!crashed && gameplay.play_duration_seconds >= 10) { @@ -955,6 +1368,146 @@ function deriveTestResults( } } + // ---- Phase 8: Competitive play tests (17-24) ---- + + // 17. 
multi_line_clear + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("multi_line_clear", "competitive play phase did not run")); + } else if (competitivePlay.double_clears + competitivePlay.triple_clears + competitivePlay.tetris_clears > 0) { + const hasMultiLineBug = competitivePlay.bugs_detected.includes("multi_line_clear_only_removes_one_row"); + results.push({ + name: "multi_line_clear", + pass: !hasMultiLineBug, + detail: hasMultiLineBug + ? "multi-line clear detected but only 1 row was removed" + : `multi-line clears work: ${competitivePlay.double_clears}x double, ${competitivePlay.triple_clears}x triple, ${competitivePlay.tetris_clears}x tetris`, + }); + } else { + results.push(skipResult("multi_line_clear", "no multi-line clear opportunity occurred during play")); + } + + // 18. score_scaling + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("score_scaling", "competitive play phase did not run")); + } else if (competitivePlay.double_clears + competitivePlay.triple_clears + competitivePlay.tetris_clears > 0) { + const hasBug = competitivePlay.bugs_detected.includes("score_does_not_scale_with_simultaneous_clears"); + results.push({ + name: "score_scaling", + pass: !hasBug, + detail: hasBug + ? "multi-line clears give same points as single clears" + : `score scales with clear type (${competitivePlay.score_increases.length} score changes observed)`, + }); + } else { + results.push(skipResult("score_scaling", "no multi-line clear occurred to test scaling")); + } + + // 19. 
level_progression + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("level_progression", "competitive play phase did not run")); + } else if (competitivePlay.total_lines_cleared < 10) { + results.push(skipResult("level_progression", `only ${competitivePlay.total_lines_cleared} lines cleared (need 10+)`)); + } else { + const hasBug = competitivePlay.bugs_detected.includes("level_does_not_increase"); + if (competitivePlay.level_readings.length < 2) { + results.push(skipResult("level_progression", "could not read level display")); + } else { + results.push({ + name: "level_progression", + pass: !hasBug, + detail: hasBug + ? `level stayed at ${competitivePlay.level_readings[0]} despite ${competitivePlay.total_lines_cleared} lines cleared` + : `level progressed from ${competitivePlay.level_readings[0]} to ${competitivePlay.level_final}`, + }); + } + } + + // 20. speed_progression + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("speed_progression", "competitive play phase did not run")); + } else if (competitivePlay.level_readings.length < 2 || new Set(competitivePlay.level_readings).size <= 1) { + results.push(skipResult("speed_progression", "level did not increase, cannot test speed change")); + } else { + const hasBug = competitivePlay.bugs_detected.includes("speed_does_not_increase"); + results.push({ + name: "speed_progression", + pass: !hasBug && competitivePlay.speed_increased, + detail: competitivePlay.speed_increased + ? "drop speed increased with level" + : "drop speed did not change after level increased", + }); + } + + // 21. next_piece_preview + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("next_piece_preview", "competitive play phase did not run")); + } else { + results.push({ + name: "next_piece_preview", + pass: competitivePlay.next_piece_visible, + detail: competitivePlay.next_piece_visible + ? 
"next piece preview display found" + : "no next piece preview found", + }); + } + + // 22. game_over_display + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("game_over_display", "competitive play phase did not run")); + } else if (!competitivePlay.game_over_reached && !session.gameOverDetected) { + results.push(skipResult("game_over_display", "game over not reached during play")); + } else { + const hasText = competitivePlay.game_over_text_found !== null; + const hasRestart = competitivePlay.restart_available; + results.push({ + name: "game_over_display", + pass: hasText && hasRestart, + detail: hasText && hasRestart + ? `game over display: "${competitivePlay.game_over_text_found}", restart available` + : `missing: ${!hasText ? "game over text" : ""}${!hasText && !hasRestart ? " and " : ""}${!hasRestart ? "restart option" : ""}`, + }); + } + + // 23. counter_clockwise_rotation + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("counter_clockwise_rotation", "competitive play phase did not run")); + } else { + const ccwTestDone = (competitivePlay as any)._ccwTestDone === true; + const ccwResult = (competitivePlay as any)._ccwResult; + if (!ccwTestDone) { + results.push(skipResult("counter_clockwise_rotation", "could not test rotation direction")); + } else { + results.push({ + name: "counter_clockwise_rotation", + pass: ccwResult === true, + detail: ccwResult === true + ? "Z key rotates opposite direction from Up arrow" + : ccwResult === false + ? "Z key does same as Up arrow or does not rotate" + : "could not determine rotation direction", + }); + } + } + + // 24. 
soft_drop_distinct + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("soft_drop_distinct", "competitive play phase did not run")); + } else { + const softDropTestDone = (competitivePlay as any)._softDropTestDone === true; + const softDropDistinct = (competitivePlay as any)._softDropDistinct; + if (!softDropTestDone) { + results.push(skipResult("soft_drop_distinct", "could not test soft drop behavior")); + } else { + results.push({ + name: "soft_drop_distinct", + pass: softDropDistinct === true, + detail: softDropDistinct === true + ? "Down arrow moves piece 1 row (distinct from hard drop)" + : "Down arrow acts like hard drop (drops to bottom)", + }); + } + } + return results; } @@ -994,31 +1547,14 @@ function extractScoreFromText(text: string | null): number[] { } async function loadGamePage(page: Page, serverUrl: string): Promise<void> { - // Try root first (serve SPA mode redirects /index.html to /) - const candidates = [ - "", - "index.html", - "dist/index.html", - "public/index.html", - "build/index.html", - ]; - - for (const candidate of candidates) { - try { - const url = candidate ? `${serverUrl}/${candidate}` : `${serverUrl}/`; - const response = await page.goto(url, { - timeout: 15000, - waitUntil: "commit", - }); - if (response && response.ok()) { - // Give the page a moment to render after commit - await page.waitForTimeout(2000); - return; - } - } catch { - continue; - } + const response = await page.goto(serverUrl, { + timeout: 15000, + waitUntil: "networkidle", + }); + if (!response || !response.ok()) { + throw new Error(`Failed to load ${serverUrl}: ${response?.status()}`); } + await page.waitForTimeout(1000); } function emptyCalibration(consoleErrors: string[]): CalibrationResult { @@ -1042,3 +1578,125 @@ function emptyCalibration(consoleErrors: string[]): CalibrationResult { gridConfidence: 0, }; } + +/** + * Read the level display from the page. 
/**
 * Read the level display from the page (best effort).
 *
 * Walks every element in the document. For each element whose lowercased
 * `innerText` contains "level" and that has fewer than 5 element children,
 * it tries, in order:
 *   1. a "level" label followed by an optional `:`, `-` or `=` and digits;
 *   2. the first descendant `span/div/p/td/strong/em/b` whose trimmed
 *      `textContent` consists only of digits;
 *   3. the next element sibling, if its trimmed text consists only of digits.
 *
 * @param page Page whose DOM is inspected via `page.evaluate`.
 * @returns The parsed level, or `null` when nothing matched or the
 *          evaluation threw.
 */
async function readLevelFromPage(page: Page): Promise<number | null> {
  try {
    return await page.evaluate(() => {
      const allElements = document.querySelectorAll("*");
      for (const el of allElements) {
        const text = ((el as HTMLElement).innerText || "").toLowerCase();
        if (text.includes("level") && el.children.length < 5) {
          // Preferred: label and number in the same text run ("Level: 3").
          const match = text.match(/level\s*[:\-=]?\s*(\d+)/i);
          if (match) return parseInt(match[1], 10);

          // Check child elements for a standalone number.
          // NOTE(review): this runs on the first element that mentions
          // "level", which may be a large ancestor; the first all-digit
          // descendant could then be an unrelated counter (e.g. the score)
          // -- confirm against real pages.
          const children = el.querySelectorAll("span, div, p, td, strong, em, b");
          for (const child of children) {
            const childText = (child.textContent || "").trim();
            if (/^\d+$/.test(childText)) return parseInt(childText, 10);
          }

          // Check next sibling (label element followed by a value element).
          const next = el.nextElementSibling;
          if (next) {
            const nextText = (next.textContent || "").trim();
            if (/^\d+$/.test(nextText)) return parseInt(nextText, 10);
          }
        }
      }
      return null;
    });
  } catch {
    // Deliberate best effort: a failed evaluation means "level unknown".
    return null;
  }
}

/**
 * Measure the auto-drop interval by watching for grid changes without input.
 *
 * The grid is sampled `samples` times, `sampleDelayMs` apart. Only spans
 * between two *observed* grid changes are recorded: the time from the start
 * of sampling to the first change is a partial period and would bias the
 * average low, so it is discarded.
 *
 * With the defaults (10 samples, 100 ms apart) two changes must be observed
 * inside roughly one second, so slow drop speeds yield 0; pass a larger
 * window through `opts` to measure them.
 *
 * @param page Page the game is running in.
 * @param cal  Calibration passed through to `readGrid`.
 * @param opts Optional sampling overrides; omitted fields keep the defaults
 *             (`samples` = 10, `sampleDelayMs` = 100).
 * @returns The average change-to-change interval in ms, or 0 if unable to
 *          measure (fewer than two changes observed, or an error occurred).
 */
async function measureDropInterval(
  page: Page,
  cal: CalibrationResult,
  opts: { samples?: number; sampleDelayMs?: number } = {}
): Promise<number> {
  const samples = opts.samples ?? 10;
  const sampleDelayMs = opts.sampleDelayMs ?? 100;
  // Intervals outside this range are treated as noise and ignored.
  const minIntervalMs = 50;
  const maxIntervalMs = 3000;

  try {
    const intervals: number[] = [];
    // Time of the most recent observed change; null until the first one.
    let lastChangeTime: number | null = null;
    let prevGrid = await readGrid(page, cal);

    for (let i = 0; i < samples; i++) {
      await page.waitForTimeout(sampleDelayMs);
      const grid = await readGrid(page, cal);
      if (!grid) continue;

      if (!prevGrid) {
        // The baseline read failed earlier; adopt the first good read so a
        // single failed read does not disable the whole measurement.
        prevGrid = grid;
        continue;
      }

      if (gridsAreDifferent(grid, prevGrid)) {
        const now = Date.now();
        if (lastChangeTime !== null) {
          const interval = now - lastChangeTime;
          if (interval > minIntervalMs && interval < maxIntervalMs) {
            intervals.push(interval);
          }
        }
        lastChangeTime = now;
        prevGrid = grid;
      }
    }

    // One full change-to-change interval needs two observed changes.
    if (intervals.length >= 1) {
      return intervals.reduce((a, b) => a + b, 0) / intervals.length;
    }
  } catch { /* ignore: best-effort measurement, 0 means "unknown" */ }
  return 0;
}

/**
 * Detect if there's a next piece preview display on the page.
+ */ +async function detectNextPiecePreview(page: Page): Promise<boolean> { + try { + return await page.evaluate(() => { + // Check for text mentioning "next" + const allElements = document.querySelectorAll("*"); + for (const el of allElements) { + const text = ((el as HTMLElement).innerText || "").toLowerCase(); + if (text.includes("next") && el.children.length < 10) { + // Check for a canvas or grid-like element nearby + const rect = (el as HTMLElement).getBoundingClientRect(); + if (rect.width > 20 && rect.height > 20) { + return true; + } + } + } + + // Check for secondary canvases (common next piece implementation) + const canvases = document.querySelectorAll("canvas"); + if (canvases.length >= 2) { + // Multiple canvases -- one might be the next piece preview + const mainCanvas = canvases[0]; + const mainRect = mainCanvas.getBoundingClientRect(); + for (let i = 1; i < canvases.length; i++) { + const rect = canvases[i].getBoundingClientRect(); + // Next piece preview is typically smaller than the main grid + if (rect.width < mainRect.width * 0.5 && rect.height < mainRect.height * 0.5 && + rect.width > 20 && rect.height > 20) { + return true; + } + } + } + + // Check for a small div/container with "next" in class/id + const nextContainers = document.querySelectorAll( + '[class*="next"], [id*="next"], [class*="preview"], [id*="preview"]' + ); + for (const container of nextContainers) { + const rect = (container as HTMLElement).getBoundingClientRect(); + if (rect.width > 20 && rect.height > 20) { + return true; + } + } + + return false; + }); + } catch { + return false; + } +} diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts @@ -33,6 +33,38 @@ export type StartMechanism = | "anykey" | "unknown"; +/** Pre-test survey data collected before any tests run. 
*/ +export interface SurveyData { + has_overlay: boolean; + has_canvas: boolean; + has_dom_grid: boolean; + visible_text: string[]; + clickable_elements: number; +} + +/** Competitive play results (Phase 8). */ +export interface CompetitivePlayResult { + duration_seconds: number; + pieces_placed: number; + total_lines_cleared: number; + single_clears: number; + double_clears: number; + triple_clears: number; + tetris_clears: number; + max_combo: number; + score_readings: number[]; + score_final: number; + score_increases: number[]; + level_readings: number[]; + level_final: number; + game_over_reached: boolean; + game_over_text_found: string | null; + restart_available: boolean; + next_piece_visible: boolean; + speed_increased: boolean; + bugs_detected: string[]; +} + /** Result of the calibration phase. */ export interface CalibrationResult { renderer: RendererType; @@ -108,6 +140,8 @@ export interface GameSession { gridReadFail: number; frames: number; events: GridEvent[]; + /** Phases that were skipped and why. */ + skippedPhases: string[]; } /** Gameplay statistics gathered during the play phase. */ @@ -129,15 +163,18 @@ export interface BotReport { start_mechanism: string; score_element_found: boolean; grid_confidence: number; + survey: SurveyData; }; tests: Array<{ name: string; pass: boolean; detail: string }>; summary: { total: number; passed: number; failed: number; + skipped: number; score: number; }; gameplay: GameplayStats; + competitive_play: CompetitivePlayResult | null; session: { frames: number; events_count: number;

Impressum · Datenschutz