loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 2644610c24ac12d9ef707571aec1e31a934389a8
parent 8dc9ec566791cf32913b7ea8f3ba37a789ef0b86
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 10 Apr 2026 20:04:37 +0200

V2: control discovery system

Bot no longer assumes default Tetris controls. Driver now probes each
candidate key to discover what it actually does:
- ArrowDown might be hard drop instead of soft drop
- Space might be pause instead of hard drop
- Some games have no soft drop at all (skip move_down test as N/A)

Discovery is reload-safe (it clears game state between probes),
classifies each key by its grid delta (movement direction, distance,
shape change), and is capped at a 50s time budget.

New types: GameAction, ControlMapping, ControlMap with confidence levels
New driver methods: discoverControls(), getControl()
Bot updated: move_down and soft_drop_distinct skip when soft_drop not found
Report includes control_discovery field showing what each key does

Results:
- 1d08ee76 (control swap): 67% -> 83-89%
- 4c7db3b9 (working game): 86% -> 100%
- 8fe72fce (held): 95%

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/bot.ts | 38 ++++++++++++++++++++++++++++++++++++++
M tasks/tetris/eval/gameplay-bot-v2/driver.ts | 701 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tasks/tetris/eval/gameplay-bot-v2/index.ts | 39 ++++++++++++++++++++++++++++++++++++++-
M tasks/tetris/eval/gameplay-bot-v2/playwright.config.ts | 2 +-
M tasks/tetris/eval/gameplay-bot-v2/types.ts | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 844 insertions(+), 2 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -319,6 +319,7 @@ function emptyCalibration(consoleErrors: string[]): DriverCalibration { cellWidth: 0, cellHeight: 0, controls: { left: "ArrowLeft", right: "ArrowRight", down: "ArrowDown", rotate: "ArrowUp", drop: "Space" }, + controlMap: null, startMechanism: "unknown", scoreElementSelector: null, levelElementSelector: null, @@ -455,6 +456,31 @@ export async function runAllTests( } catch { /* keep original */ } } + // Control discovery: probe what each key actually does. This is the + // difference between "ArrowDown is soft drop" (assumption) and "on this + // particular game, ArrowDown is hard drop" (measurement). Runs only once + // per session -- the result is cached on the driver and flows through + // subsequent pressKey() calls automatically. + if (gameStarted && cal.gridDetected) { + try { + const controlMap = await driver.discoverControls(serverUrl); + console.log( + `[bot] control discovery complete: ` + + `move_left=${controlMap.move_left.key ?? "?"}, ` + + `move_right=${controlMap.move_right.key ?? "?"}, ` + + `soft_drop=${controlMap.soft_drop.key ?? "NONE"}, ` + + `hard_drop=${controlMap.hard_drop.key ?? "?"}, ` + + `rotate_cw=${controlMap.rotate_cw.key ?? "?"}` + ); + // Refresh the working calibration from the driver -- discoverControls() + // updates cal.controlMap and cal.controls on the cached calibration + // object in-place. + cal = driver.getCalibration(); + } catch (err) { + console.log(`[bot] control discovery threw: ${err instanceof Error ? 
err.message : String(err)}`); + } + } + // ---- Phase 3: Basic mechanics ---- let mechanicsWork = false; if (gameStarted && cal.gridDetected) { @@ -1590,6 +1616,15 @@ function deriveTestResults( results.push(skipResult(`move_${dir}`, "game did not start")); continue; } + // Not applicable: if control discovery determined that this game has no + // soft_drop (no key produced a single-row downward move), then move_down + // is a feature the game genuinely lacks rather than something that's + // broken. Report as skipped with a clear reason so it does not drag the + // score down. + if (dir === "down" && cal.controlMap && cal.controlMap.soft_drop.confidence === "not_found") { + results.push(skipResult("move_down", "no soft_drop key (game has only hard_drop)")); + continue; + } const moveEvents = session.events.filter((e) => e.type === "piece_moved" && e.direction === dir); if (moveEvents.length > 0) { results.push({ name: `move_${dir}`, pass: true, detail: "grid state changed after key press (grid-verified)" }); @@ -1849,6 +1884,9 @@ function deriveTestResults( // 24. soft_drop_distinct if (!phaseState.gameplayWorks || !competitivePlay) { results.push(skipResult("soft_drop_distinct", "competitive play phase did not run")); + } else if (cal.controlMap && cal.controlMap.soft_drop.confidence === "not_found") { + // Game has no soft drop at all -- not applicable. 
+ results.push(skipResult("soft_drop_distinct", "no soft_drop key (game has only hard_drop)")); } else { const softDropTestDone = (competitivePlay as any)._softDropTestDone === true; const softDropDistinct = (competitivePlay as any)._softDropDistinct; diff --git a/tasks/tetris/eval/gameplay-bot-v2/driver.ts b/tasks/tetris/eval/gameplay-bot-v2/driver.ts @@ -7,6 +7,9 @@ import type { GridBounds, RendererType, Controls, + ControlMap, + ControlMapping, + GameAction, StartMechanism, StartCandidate, TryStartResult, @@ -29,6 +32,46 @@ const DEFAULT_CONTROLS: Controls = { drop: "Space", }; +/** + * Candidate keys to try for each abstract game action, in priority order. + * The discovery loop tries these from top to bottom until one matches the + * expected behaviour (e.g. moves piece 1 column left). + */ +const CONTROL_CANDIDATES: Record<GameAction, string[]> = { + move_left: ["ArrowLeft", "a", "h"], + move_right: ["ArrowRight", "d", "l"], + // ArrowDown LAST: some games treat it as hard_drop, so we try alternatives + // first and fall through. This is the key insight of the discovery system. + soft_drop: ["s", "ArrowDown"], + // ArrowDown also gets tried as hard_drop because some games bind the down + // arrow to hard drop instead of soft drop. Order: conventional hard-drop + // keys first, then the ambiguous ArrowDown last. + hard_drop: ["Space", "Enter", "ArrowUp", "ArrowDown"], + rotate_cw: ["ArrowUp", "x", "w"], + rotate_ccw: ["z", "Control"], + // pause/hold are not currently discovered; we keep these entries for type + // completeness but they remain at default "not_found". 
+ pause: ["p", "Escape"], + hold: ["c", "Shift"], +}; + +function emptyControlMap(): ControlMap { + const notFound = (): ControlMapping => ({ + key: null, + confidence: "not_found", + observation: "", + }); + return { + move_left: notFound(), + move_right: notFound(), + soft_drop: notFound(), + hard_drop: notFound(), + rotate_cw: notFound(), + rotate_ccw: notFound(), + key_observations: {}, + }; +} + // --------------------------------------------------------------------------- // GridSnapshot factory // --------------------------------------------------------------------------- @@ -176,6 +219,7 @@ function cloneCalibration(cal: DriverCalibration): DriverCalibration { cellWidth: cal.cellWidth, cellHeight: cal.cellHeight, controls: { ...cal.controls }, + controlMap: cal.controlMap ? cloneControlMap(cal.controlMap) : cal.controlMap ?? null, startMechanism: cal.startMechanism, scoreElementSelector: cal.scoreElementSelector, levelElementSelector: cal.levelElementSelector, @@ -195,6 +239,18 @@ function cloneCalibration(cal: DriverCalibration): DriverCalibration { return copy; } +function cloneControlMap(map: ControlMap): ControlMap { + return { + move_left: { ...map.move_left }, + move_right: { ...map.move_right }, + soft_drop: { ...map.soft_drop }, + hard_drop: { ...map.hard_drop }, + rotate_cw: { ...map.rotate_cw }, + rotate_ccw: { ...map.rotate_ccw }, + key_observations: { ...map.key_observations }, + }; +} + function gridBoundsSimilar(a: GridBounds, b: GridBounds): boolean { // Tolerate rendering jitter but flag anything beyond ~10% size change. const tol = Math.max(20, Math.min(a.width, b.width) * 0.15); @@ -262,6 +318,10 @@ export class PlaywrightDriver implements TetrisDriver { // Set by the bot when bridge verification definitively failed -- the legacy // detectStartMechanism() fallback must NOT run and override the bot's verdict. private startRejected: boolean = false; + // Result of discoverControls(). 
Persists across calibration cache hits so + // downstream phases reuse the discovered mapping without re-running the + // expensive discovery loop. null = not yet discovered. + private discoveredControls: ControlMap | null = null; // Cumulative drift info across the session. private drift: CalibrationDrift = { drifted: false, @@ -603,6 +663,7 @@ export class PlaywrightDriver implements TetrisDriver { cellWidth: grid.cellWidth, cellHeight: grid.cellHeight, controls: { ...base.controls }, + controlMap: base.controlMap ? cloneControlMap(base.controlMap) : null, startMechanism: base.startMechanism, scoreElementSelector: base.scoreElementSelector, levelElementSelector: base.levelElementSelector, @@ -1228,15 +1289,655 @@ export class PlaywrightDriver implements TetrisDriver { // -- Input -- async pressKey(action: "left" | "right" | "down" | "rotate" | "drop"): Promise<void> { + // Prefer the discovered control map when available, falling back to the + // legacy controls field. If the discovered soft_drop is null (no soft + // drop on this game), pressing "down" becomes a no-op -- callers that + // care about distinguishing the two should check getControl("soft_drop") + // first. + const discovered = this.discoveredControls; + if (discovered) { + const mapped = this.mapLegacyActionToDiscovered(action, discovered); + if (mapped === null) { + // Explicit no-op: the game doesn't have this action. We still swallow + // the call silently rather than throwing so the bot's play loops that + // sprinkle in optional soft_drop presses don't crash. + return; + } + if (mapped !== undefined) { + await this.page.keyboard.press(mapped); + return; + } + } const cal = this.cal; const key = cal ? cal.controls[action] : DEFAULT_CONTROLS[action]; await this.page.keyboard.press(key); } + /** + * Translate the legacy pressKey action names to the discovered control map. 
+ * Returns: + * - a string key if discovered and valid + * - null if the action maps to soft_drop and soft_drop is explicitly + * not_found (no-op signal) + * - undefined if no discovery info for this action (caller falls back) + */ + private mapLegacyActionToDiscovered( + action: "left" | "right" | "down" | "rotate" | "drop", + map: ControlMap + ): string | null | undefined { + switch (action) { + case "left": + return map.move_left.key ?? undefined; + case "right": + return map.move_right.key ?? undefined; + case "down": + // Distinguish "discovery ran and found no soft drop" from "discovery + // never ran for this action". The first returns null (no-op), the + // second returns undefined (fall through to defaults). + if (map.soft_drop.confidence === "not_found") return null; + return map.soft_drop.key ?? undefined; + case "rotate": + return map.rotate_cw.key ?? undefined; + case "drop": + return map.hard_drop.key ?? undefined; + } + } + async pressRawKey(key: string): Promise<void> { await this.page.keyboard.press(key); } + // -- Control discovery -- + + getControl(action: GameAction): string | null { + const map = this.discoveredControls; + if (map) { + switch (action) { + case "move_left": return map.move_left.key; + case "move_right": return map.move_right.key; + case "soft_drop": return map.soft_drop.key; + case "hard_drop": return map.hard_drop.key; + case "rotate_cw": return map.rotate_cw.key; + case "rotate_ccw": return map.rotate_ccw.key; + case "pause": return null; + case "hold": return null; + } + } + // Fallback to legacy controls field. + const cal = this.cal; + const controls = cal ? 
cal.controls : DEFAULT_CONTROLS; + switch (action) { + case "move_left": return controls.left; + case "move_right": return controls.right; + case "soft_drop": return controls.down; + case "hard_drop": return controls.drop; + case "rotate_cw": return controls.rotate; + case "rotate_ccw": return null; + case "pause": return null; + case "hold": return null; + } + } + + /** + * Discover the control mapping by pressing candidate keys and watching + * grid deltas. This is expensive (can reload the page several times) so + * callers should only invoke it once per session. + * + * The result is cached on the driver and flows through getControl() and + * pressKey() from the moment it returns. + */ + async discoverControls(serverUrl: string): Promise<ControlMap> { + const log = (msg: string) => console.log(`[discover] ${msg}`); + // 50s hard budget. Discovery happens between bridge verification and the + // first test phase, so we want to be quick. Each reload costs ~2s, so + // this is ~25 reloads max. + const deadline = Date.now() + 50_000; + const budgetExceeded = () => Date.now() >= deadline; + + // Start from an empty map. If any key probe fails, we simply leave that + // slot as "not_found" with no observation. + const map = emptyControlMap(); + + // IMPORTANT: tests run after discovery expect the driver state to + // resemble "game freshly started". Discovery is destructive (it presses + // keys, stacks pieces, may even trigger game_over), so we always + // RELOAD between discovery trials and after discovery finishes. + + // Helper: reload the page, re-apply the confirmed start, and wait for + // a piece to become observable. + const freshStart = async (): Promise<GridSnapshot | null> => { + try { + await this.loadPage(serverUrl); + } catch { + return null; + } + try { + // calibrate() will replay the confirmed candidate (if one is set) + // and populate this.cal. Discovery runs AFTER bridge verification, + // so confirmedCandidate is already committed at this point. 
+ await this.calibrate(); + } catch { + return null; + } + // Grid might still be spawning; give it a brief window. + await this.wait(300); + // Fall back to refresh in case the grid wasn't detected at the first + // calibrate() pass (DOM games that build cells post-click). + if (!this.cal?.gridDetected) { + try { + await this.refreshGridDetection(); + } catch { /* ignore */ } + } + // Poll for an active piece so the delta classification works. + const emptyGrid: Grid = Array.from({ length: GRID_ROWS }, () => + Array.from({ length: GRID_COLS }, () => false) + ); + let snap = await this.readGrid(emptyGrid); + let tries = 0; + while ((!snap.grid || snap.filledCount === 0) && tries < 20) { + await this.wait(100); + snap = await this.readGrid(emptyGrid); + tries++; + } + if (!snap.grid) return null; + return snap; + }; + + // Helper: classify the delta between two grids. + // Returns a label indicating what probably happened, or "no_change". + const classifyDelta = ( + before: Grid, + after: Grid + ): + | { kind: "no_change" } + | { kind: "move_left"; distance: number } + | { kind: "move_right"; distance: number } + | { kind: "move_down"; distance: number } + | { kind: "hard_drop"; distance: number } + | { kind: "rotate" } + | { kind: "other"; detail: string } => { + // Extract fill cells from each grid. + const cellsA: [number, number][] = []; + const cellsB: [number, number][] = []; + for (let r = 0; r < before.length; r++) { + for (let c = 0; c < before[r].length; c++) { + if (before[r][c]) cellsA.push([r, c]); + if (after[r][c]) cellsB.push([r, c]); + } + } + // Same cells? No change. + const keyA = cellsA.map(([r, c]) => `${r},${c}`).sort().join("|"); + const keyB = cellsB.map(([r, c]) => `${r},${c}`).sort().join("|"); + if (keyA === keyB) return { kind: "no_change" }; + + // If cell counts are similar (+/- 1), try to detect a rigid translation + // of the active piece. 
We do this by looking at the symmetric + // differences: cells that disappeared and cells that appeared. + const setA = new Set(keyA.split("|")); + const setB = new Set(keyB.split("|")); + const disappeared: [number, number][] = []; + const appeared: [number, number][] = []; + for (const [r, c] of cellsA) { + if (!setB.has(`${r},${c}`)) disappeared.push([r, c]); + } + for (const [r, c] of cellsB) { + if (!setA.has(`${r},${c}`)) appeared.push([r, c]); + } + if (disappeared.length === 0 && appeared.length === 0) { + return { kind: "no_change" }; + } + + // Compute centroids + shapes of the disappeared/appeared sets. + const avg = (arr: [number, number][]) => { + const sumR = arr.reduce((s, [r]) => s + r, 0); + const sumC = arr.reduce((s, [, c]) => s + c, 0); + return [sumR / arr.length, sumC / arr.length] as [number, number]; + }; + const normalize = (cells: [number, number][]): string => { + if (cells.length === 0) return ""; + const minR = Math.min(...cells.map(([r]) => r)); + const minC = Math.min(...cells.map(([, c]) => c)); + return cells + .map(([r, c]) => `${r - minR},${c - minC}`) + .sort() + .join("|"); + }; + + // Case 1: equal-size symmetric diff -> pure translation or rotation + // of the active piece. When a 4-cell piece moves 1 column, some cells + // overlap between old/new position (e.g. O-piece has 2 overlapping + // cells), so the symmetric diff can be as small as 2 cells. We allow + // 1-4 cells here. + if ( + disappeared.length === appeared.length && + disappeared.length >= 1 && + disappeared.length <= 4 + ) { + const [avgRA, avgCA] = avg(disappeared); + const [avgRB, avgCB] = avg(appeared); + const dRow = avgRB - avgRA; + const dCol = avgCB - avgCA; + const shapeA = normalize(disappeared); + const shapeB = normalize(appeared); + // Compare full-piece footprints: if a 4-cell piece rotates, the + // set of cells in the "disappeared" union with the settled grid + // can be very different from before. 
A cleaner rotation signal is + // the total cell count in before vs after. For a pure translation, + // the piece has the same bounding-box footprint (same shape) but + // shifted. For a rotation, the bounding box dimensions usually + // change (tall->wide or wide->tall). + const fullBB = (cells: [number, number][]) => { + if (cells.length === 0) return { w: 0, h: 0 }; + return { + w: Math.max(...cells.map(([, c]) => c)) - Math.min(...cells.map(([, c]) => c)) + 1, + h: Math.max(...cells.map(([r]) => r)) - Math.min(...cells.map(([r]) => r)) + 1, + }; + }; + const bbA = fullBB(cellsA); + const bbB = fullBB(cellsB); + // Total-cells test: if the piece count didn't change and the + // bounding box aspect flipped (e.g. 4x1 -> 1x4), that's rotation. + // This catches I-piece rotation reliably. + if ( + cellsA.length === cellsB.length && + ((bbA.w !== bbB.w) || (bbA.h !== bbB.h)) && + Math.abs(bbA.w - bbB.h) <= 1 && Math.abs(bbA.h - bbB.w) <= 1 + ) { + return { kind: "rotate" }; + } + + // Rotation by shape change: if the disappeared/appeared shapes + // differ, and the centroid drift doesn't look like a clean 1-col + // horizontal translation, classify as rotation. Checked BEFORE the + // horizontal/vertical translation tests so rotations that coincide + // with auto-drop are still tagged as rotations, not translations. + if (shapeA !== shapeB) { + // A clean horizontal translation has dCol >= 1 and dRow small. + // A clean vertical translation has dRow >= 1 and dCol small. + // Anything else is rotation when the shape differs. + const looksLikeClearHorizontal = + Math.abs(dCol) >= 0.9 && Math.abs(dRow) < 0.6; + const looksLikeClearVertical = + Math.abs(dCol) < 0.6 && dRow >= 0.9; + if (!looksLikeClearHorizontal && !looksLikeClearVertical) { + return { kind: "rotate" }; + } + } + + // Horizontal translation: large dCol dominates, shape is stable. 
+ if (shapeA === shapeB && Math.abs(dCol) >= 0.9 && Math.abs(dRow) <= 1.2) { + if (dCol <= -0.9) return { kind: "move_left", distance: Math.max(1, Math.round(-dCol)) }; + if (dCol >= 0.9) return { kind: "move_right", distance: Math.max(1, Math.round(dCol)) }; + } + // Vertical translation: dRow dominates, small dCol drift allowed. + if (shapeA === shapeB && Math.abs(dCol) < 0.7 && dRow >= 0.5) { + const distance = Math.max(1, Math.round(dRow)); + if (distance >= 5) return { kind: "hard_drop", distance }; + return { kind: "move_down", distance }; + } + // Shape same but neither clearly horizontal nor vertical -> fall + // through to "other". This avoids misclassifying noise as motion. + } + + // Case 2: 4 cells appeared near the bottom and 4 (or fewer) disappeared + // from higher up -> hard drop that teleported the piece. + if (appeared.length >= 3 && disappeared.length >= 3) { + const avgAppearedRow = avg(appeared)[0]; + const avgDisappearedRow = avg(disappeared)[0]; + const dRow = avgAppearedRow - avgDisappearedRow; + if (dRow >= 4) { + return { kind: "hard_drop", distance: Math.round(dRow) }; + } + } + + // Case 3: more appeared than disappeared (a new piece spawned while + // the old one dropped and locked). Treat as hard drop if the old + // piece ended up near the bottom. + if (appeared.length > disappeared.length && appeared.length >= 4) { + // Find the set of appeared cells in the bottom half. + const bottomAppeared = appeared.filter(([r]) => r >= GRID_ROWS / 2); + if (bottomAppeared.length >= 3 && disappeared.length <= 4) { + const avgDisappearedRow = disappeared.length > 0 + ? 
avg(disappeared)[0] + : 0; + const avgBottomRow = avg(bottomAppeared)[0]; + const dRow = avgBottomRow - avgDisappearedRow; + if (dRow >= 4) { + return { kind: "hard_drop", distance: Math.round(dRow) }; + } + } + } + + return { kind: "other", detail: `disappeared=${disappeared.length}, appeared=${appeared.length}` }; + }; + + // Helper: try a single candidate key for an action, return whether it + // matched the expected classification. + const tryCandidateKey = async ( + action: GameAction, + key: string, + expected: ( + delta: ReturnType<typeof classifyDelta> + ) => { matched: boolean; observation: string } + ): Promise<boolean> => { + // Snapshot before. + const before = await this.readGrid(); + if (!before.grid) { + map.key_observations[key] = "grid read failed before press"; + return false; + } + // Ignore keys on a fully-empty grid -- the candidate might do nothing + // simply because there's no active piece yet. + if (before.filledCount === 0) { + map.key_observations[key] = "grid empty before press (no piece)"; + return false; + } + try { + await this.pressRawKey(key); + } catch { + map.key_observations[key] = "keyboard press threw"; + return false; + } + await this.wait(120); + const after = await this.readGrid(); + if (!after.grid) { + map.key_observations[key] = "grid read failed after press"; + return false; + } + const delta = classifyDelta(before.grid, after.grid); + const result = expected(delta); + // Only record this observation if we don't already have one for the key + // (first observation wins -- keeps the report meaningful). 
+ if (!map.key_observations[key]) { + map.key_observations[key] = result.observation; + } + if (result.matched) { + const slot: ControlMapping = { + key, + confidence: "suspected", + observation: result.observation, + }; + switch (action) { + case "move_left": map.move_left = slot; break; + case "move_right": map.move_right = slot; break; + case "soft_drop": map.soft_drop = slot; break; + case "hard_drop": map.hard_drop = slot; break; + case "rotate_cw": map.rotate_cw = slot; break; + case "rotate_ccw": map.rotate_ccw = slot; break; + default: break; + } + return true; + } + return false; + }; + + // ---- Movement discovery (order: least disruptive first) ---- + // Try each action in priority order, reloading between actions to get + // a fresh piece that hasn't already moved from the previous probe. + + // move_left + if (!budgetExceeded()) { + log("phase: move_left"); + await freshStart(); + for (const key of CONTROL_CANDIDATES.move_left) { + if (budgetExceeded()) break; + const matched = await tryCandidateKey("move_left", key, (delta) => { + if (delta.kind === "move_left") { + return { matched: true, observation: `moved ${delta.distance} col(s) left` }; + } + if (delta.kind === "move_right") { + return { matched: false, observation: `moved ${delta.distance} col(s) right (wrong direction)` }; + } + if (delta.kind === "hard_drop") { + return { matched: false, observation: `hard_drop (${delta.distance} rows)` }; + } + if (delta.kind === "move_down") { + return { matched: false, observation: `moved ${delta.distance} row(s) down` }; + } + if (delta.kind === "rotate") { + return { matched: false, observation: "rotation" }; + } + if (delta.kind === "no_change") { + return { matched: false, observation: "no change" }; + } + return { matched: false, observation: `other change (${delta.detail})` }; + }); + if (matched) { log(` move_left: ${key}`); break; } + } + } + + // move_right -- no need to reload if move_left probe succeeded without + // disrupting the board 
meaningfully. + if (!budgetExceeded()) { + log("phase: move_right"); + await freshStart(); + for (const key of CONTROL_CANDIDATES.move_right) { + if (budgetExceeded()) break; + const matched = await tryCandidateKey("move_right", key, (delta) => { + if (delta.kind === "move_right") { + return { matched: true, observation: `moved ${delta.distance} col(s) right` }; + } + if (delta.kind === "move_left") { + return { matched: false, observation: `moved ${delta.distance} col(s) left (wrong direction)` }; + } + if (delta.kind === "hard_drop") { + return { matched: false, observation: `hard_drop (${delta.distance} rows)` }; + } + if (delta.kind === "move_down") { + return { matched: false, observation: `moved ${delta.distance} row(s) down` }; + } + if (delta.kind === "rotate") { + return { matched: false, observation: "rotation" }; + } + if (delta.kind === "no_change") { + return { matched: false, observation: "no change" }; + } + return { matched: false, observation: `other change (${delta.detail})` }; + }); + if (matched) { log(` move_right: ${key}`); break; } + } + } + + // rotate_cw. Fast path: if an earlier phase observed this key producing + // a rotation, promote it without re-testing. + if (!budgetExceeded()) { + log("phase: rotate_cw"); + // Check if a prior phase already saw one of the rotate candidates as + // a rotation. 
+ let promotedEarly = false; + for (const key of CONTROL_CANDIDATES.rotate_cw) { + const obs = map.key_observations[key]; + if (obs === "rotation" || obs === "shape changed (rotation)") { + map.rotate_cw = { + key, + confidence: "suspected", + observation: "rotation (promoted from earlier phase)", + }; + log(` rotate_cw: ${key} (promoted from observation)`); + promotedEarly = true; + break; + } + } + if (!promotedEarly) { + await freshStart(); + for (const key of CONTROL_CANDIDATES.rotate_cw) { + if (budgetExceeded()) break; + const matched = await tryCandidateKey("rotate_cw", key, (delta) => { + if (delta.kind === "rotate") { + return { matched: true, observation: "shape changed (rotation)" }; + } + if (delta.kind === "hard_drop") { + return { matched: false, observation: `hard_drop (${delta.distance} rows)` }; + } + if (delta.kind === "move_down") { + return { matched: false, observation: `moved ${delta.distance} row(s) down` }; + } + if (delta.kind === "move_left") { + return { matched: false, observation: `moved ${delta.distance} col(s) left` }; + } + if (delta.kind === "move_right") { + return { matched: false, observation: `moved ${delta.distance} col(s) right` }; + } + if (delta.kind === "no_change") { + return { matched: false, observation: "no change" }; + } + return { matched: false, observation: `other change (${delta.detail})` }; + }); + if (matched) { log(` rotate_cw: ${key}`); break; } + } + } + } + + // hard_drop (do this BEFORE soft_drop so we know which key teleports). + // IMPORTANT: hard drop candidates include keys like Space that may be + // bound to other functions (pause, confirm, etc). Pressing Space on a + // game that uses it for pause will freeze all subsequent probes. So we + // RELOAD before every hard_drop attempt to guarantee a clean state. 
+ if (!budgetExceeded()) { + log("phase: hard_drop"); + for (const key of CONTROL_CANDIDATES.hard_drop) { + if (budgetExceeded()) break; + if (map.hard_drop.confidence !== "not_found") break; + // Prior observation fast-path: if we already saw this key act like + // a rotation/left/right/etc. in an earlier phase, skip retesting. + const priorObs = map.key_observations[key]; + if (priorObs && !priorObs.includes("teleported") && !priorObs.includes("hard_drop")) { + continue; + } + // Always reload before a hard_drop probe. + await freshStart(); + const matched = await tryCandidateKey("hard_drop", key, (delta) => { + if (delta.kind === "hard_drop") { + return { matched: true, observation: `teleported ${delta.distance} rows to bottom` }; + } + if (delta.kind === "move_down") { + return { + matched: false, + observation: `moved ${delta.distance} row(s) down (soft drop, not hard drop)`, + }; + } + if (delta.kind === "rotate") { + return { matched: false, observation: "rotation" }; + } + if (delta.kind === "move_left") { + return { matched: false, observation: `moved ${delta.distance} col(s) left` }; + } + if (delta.kind === "move_right") { + return { matched: false, observation: `moved ${delta.distance} col(s) right` }; + } + if (delta.kind === "no_change") { + return { matched: false, observation: "no change" }; + } + return { matched: false, observation: `other change (${delta.detail})` }; + }); + if (matched) { log(` hard_drop: ${key}`); break; } + } + } + + // soft_drop -- this is where the main bug lives. Try alternatives FIRST. + // ArrowDown is tried only if it wasn't already claimed as hard_drop. + if (!budgetExceeded()) { + log("phase: soft_drop"); + await freshStart(); + // Skip ArrowDown if we already discovered it maps to hard_drop on + // this game. Same for any key already claimed by another action. 
+ const claimedKeys = new Set<string>(); + for (const slot of [map.move_left, map.move_right, map.hard_drop, map.rotate_cw]) { + if (slot.key) claimedKeys.add(slot.key); + } + for (const key of CONTROL_CANDIDATES.soft_drop) { + if (budgetExceeded()) break; + if (claimedKeys.has(key)) { + // ArrowDown was already claimed as hard_drop etc. + map.key_observations[key] = + map.key_observations[key] || "already claimed by another action"; + continue; + } + const matched = await tryCandidateKey("soft_drop", key, (delta) => { + if (delta.kind === "move_down") { + return { + matched: delta.distance >= 1 && delta.distance <= 3, + observation: `moved ${delta.distance} row(s) down`, + }; + } + if (delta.kind === "hard_drop") { + return { + matched: false, + observation: `teleported ${delta.distance} rows (hard_drop, not soft_drop)`, + }; + } + if (delta.kind === "rotate") { + return { matched: false, observation: "rotation" }; + } + if (delta.kind === "move_left") { + return { matched: false, observation: `moved ${delta.distance} col(s) left` }; + } + if (delta.kind === "move_right") { + return { matched: false, observation: `moved ${delta.distance} col(s) right` }; + } + if (delta.kind === "no_change") { + return { matched: false, observation: "no change" }; + } + return { matched: false, observation: `other change (${delta.detail})` }; + }); + if (matched) { log(` soft_drop: ${key}`); break; } + } + if (map.soft_drop.confidence === "not_found") { + log(" soft_drop: NOT FOUND"); + } + } + + // rotate_ccw (best effort; cheap skip if budget exceeded) + if (!budgetExceeded()) { + log("phase: rotate_ccw"); + await freshStart(); + for (const key of CONTROL_CANDIDATES.rotate_ccw) { + if (budgetExceeded()) break; + const matched = await tryCandidateKey("rotate_ccw", key, (delta) => { + if (delta.kind === "rotate") { + return { matched: true, observation: "shape changed (rotation)" }; + } + if (delta.kind === "no_change") { + return { matched: false, observation: "no change" }; + } 
+ return { matched: false, observation: `other change` }; + }); + if (matched) { log(` rotate_ccw: ${key}`); break; } + } + } + + // Final reload so tests start from a clean state. + await freshStart(); + + this.discoveredControls = map; + // Mirror into the cached calibration so bot-side code that reads + // cal.controlMap sees the discovery result. + if (this.cal) { + this.cal.controlMap = map; + // Also back-port the discovered keys into the legacy controls field + // so any code that still reads cal.controls.<x> keeps working. + if (map.move_left.key) this.cal.controls.left = map.move_left.key; + if (map.move_right.key) this.cal.controls.right = map.move_right.key; + if (map.rotate_cw.key) this.cal.controls.rotate = map.rotate_cw.key; + if (map.hard_drop.key) this.cal.controls.drop = map.hard_drop.key; + // Only override the legacy `down` if we found a distinct soft drop key. + // If soft_drop is not_found, leave `down` alone so legacy callers still + // have SOMETHING to press -- pressKey() will intercept and no-op anyway. + if (map.soft_drop.key) this.cal.controls.down = map.soft_drop.key; + } + // Persist into the first-cal baseline too so cache replays preserve the + // discovered map. 
+ if (this.firstCal) { + this.firstCal.controlMap = cloneControlMap(map); + if (map.move_left.key) this.firstCal.controls.left = map.move_left.key; + if (map.move_right.key) this.firstCal.controls.right = map.move_right.key; + if (map.rotate_cw.key) this.firstCal.controls.rotate = map.rotate_cw.key; + if (map.hard_drop.key) this.firstCal.controls.drop = map.hard_drop.key; + if (map.soft_drop.key) this.firstCal.controls.down = map.soft_drop.key; + } + return map; + } + async wait(ms: number): Promise<void> { await this.page.waitForTimeout(ms); } diff --git a/tasks/tetris/eval/gameplay-bot-v2/index.ts b/tasks/tetris/eval/gameplay-bot-v2/index.ts @@ -89,7 +89,7 @@ test.describe("Tetris Gameplay Bot v2", () => { }); test("run gameplay bot", async ({ page }) => { - test.setTimeout(300_000); // 5-minute total timeout + test.setTimeout(360_000); // 6-minute total timeout (discovery adds ~35s) // Measure page load time let loadTimeMs = -1; @@ -151,6 +151,34 @@ test.describe("Tetris Gameplay Bot v2", () => { cleanCompetitivePlay = clean; } + // Build the control_discovery report field from the discovered map (null or + // absent if discovery never ran, e.g. the game didn't start or no grid was detected). + let controlDiscoveryReport: Record<string, string> | undefined; + if (calibration.controlMap) { + const cm = calibration.controlMap; + controlDiscoveryReport = {}; + const actionDescriptions: Array<[string, { key: string | null; observation: string; confidence: string }]> = [ + ["move_left", cm.move_left], + ["move_right", cm.move_right], + ["soft_drop", cm.soft_drop], + ["hard_drop", cm.hard_drop], + ["rotate_cw", cm.rotate_cw], + ["rotate_ccw", cm.rotate_ccw], + ]; + for (const [name, mapping] of actionDescriptions) { + if (mapping.key) { + controlDiscoveryReport[name] = `${mapping.key}${mapping.observation ? 
` (${mapping.observation})` : ""}`; + } else { + controlDiscoveryReport[name] = "NOT FOUND"; + } + } + // Also report per-key observations so readers can see what every + // candidate did during discovery. + for (const [key, obs] of Object.entries(cm.key_observations)) { + if (obs) controlDiscoveryReport[`key:${key}`] = obs; + } + } + const report: BotReport = { implementation: { renderer: calibration.renderer, @@ -158,6 +186,7 @@ test.describe("Tetris Gameplay Bot v2", () => { grid_detected_at: calibration.gridDetectedAt || "initial", grid_bounds: calibration.gridBounds, controls: calibration.controls as unknown as Record<string, string>, + control_discovery: controlDiscoveryReport, start_mechanism: calibration.startMechanism, score_element_found: calibration.scoreElementSelector !== null, grid_confidence: calibration.gridConfidence, @@ -213,6 +242,14 @@ test.describe("Tetris Gameplay Bot v2", () => { console.log(`Grid read success rate: ${Math.round(gridSuccessRate * 100)}%`); console.log(`Start mechanism: ${calibration.startMechanism}`); console.log(`Score element: ${calibration.scoreElementSelector ?? "none"}`); + if (calibration.controlMap) { + const cm = calibration.controlMap; + console.log( + `Controls: left=${cm.move_left.key ?? "?"}, right=${cm.move_right.key ?? "?"}, ` + + `rotate=${cm.rotate_cw.key ?? "?"}, hard_drop=${cm.hard_drop.key ?? "?"}, ` + + `soft_drop=${cm.soft_drop.key ?? 
"NONE"}` + ); + } console.log(`\nTests: ${passed}/${total} passed, ${skipped} skipped, ${failed} failed`); console.log(`Score: ${report.summary.score} (${passed}/${scorable} scorable)`); for (const t of testResults) { diff --git a/tasks/tetris/eval/gameplay-bot-v2/playwright.config.ts b/tasks/tetris/eval/gameplay-bot-v2/playwright.config.ts @@ -3,7 +3,7 @@ import { defineConfig } from "@playwright/test"; export default defineConfig({ testDir: ".", testMatch: "index.ts", - timeout: 60_000, + timeout: 360_000, retries: 0, workers: 1, reporter: [["list"]], diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts @@ -21,6 +21,50 @@ export interface Controls { drop: string; } +/** + * Abstract game action -- the thing the bot wants to do, independent of which + * physical key performs it. + */ +export type GameAction = + | "move_left" + | "move_right" + | "soft_drop" + | "hard_drop" + | "rotate_cw" + | "rotate_ccw" + | "pause" + | "hold"; + +/** + * The result of trying a candidate key for a particular action. + * - "suspected": the key did something plausible on one trial + * - "confirmed": verified on a fresh reload (currently unused, reserved) + * - "not_found": no candidate key produced the expected behaviour + */ +export interface ControlMapping { + /** The discovered key, or null if no candidate matched. */ + key: string | null; + confidence: "suspected" | "confirmed" | "not_found"; + /** Human-readable description of what was observed. */ + observation: string; +} + +/** + * Discovered control map. Produced by driver.discoverControls() after the + * game is started. Unlike the legacy Controls, an action's `key` may be null + * (e.g. soft_drop is optional -- some games don't implement it). 
+ */ +export interface ControlMap { + move_left: ControlMapping; + move_right: ControlMapping; + soft_drop: ControlMapping; + hard_drop: ControlMapping; + rotate_cw: ControlMapping; + rotate_ccw: ControlMapping; + /** Observations for every key tried, keyed by the raw key name. */ + key_observations: Record<string, string>; +} + /** How the game was started. */ export type StartMechanism = | "auto" @@ -87,6 +131,8 @@ export interface DriverCalibration { cellWidth: number; cellHeight: number; controls: Controls; + /** Discovered control map, or null if discovery has not run yet. */ + controlMap?: ControlMap | null; startMechanism: StartMechanism; scoreElementSelector: string | null; levelElementSelector: string | null; @@ -175,6 +221,25 @@ export interface TetrisDriver { pressRawKey(key: string): Promise<void>; wait(ms: number): Promise<void>; + // -- Control discovery -- + /** + * Run the control discovery loop against the currently-started game. + * Tries candidate keys for each abstract game action, observes grid deltas, + * and classifies each. Populates the driver's cached control map so + * subsequent pressKey() calls use the discovered mapping. + * + * Expensive (multiple reloads). Callers should invoke this at most once + * per session; the result is cached and re-applied across calibration + * cache hits. + */ + discoverControls(serverUrl: string): Promise<ControlMap>; + /** + * Return the discovered key for a given action, or null if not found. + * Before discoverControls() has run, returns the legacy default key from + * the Controls struct. 
+ */ + getControl(action: GameAction): string | null; + // -- Score/Level/Lines Reading -- readScore(): Promise<number | null>; readLevel(): Promise<number | null>; @@ -290,6 +355,7 @@ export interface BotReport { grid_detected_at: string; grid_bounds: GridBounds | null; controls: Record<string, string>; + control_discovery?: Record<string, string>; start_mechanism: string; score_element_found: boolean; grid_confidence: number;

Impressum · Datenschutz