loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 3bde26d36a17e8b79525bbe582d3ab13b8d8387b
parent d162c5ba603ac08e3db2a7fe0919dd0494c4f14d
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 10 Apr 2026 14:36:48 +0200

V2 bot: caching, bot/driver bridge, fixed CCW rotation test

Three improvements merged:

1. Calibration caching (driver.ts): caches start mechanism, controls,
   grid bounds across reloads. Detects drift, flags conflicts. Eliminates
   timeouts from repeated full calibration.

2. Bot/driver bridge (bot.ts, driver.ts, types.ts): the bot verifies the game
   actually started before the driver commits to a mechanism. It checks that a
   grid is detected with a plausible fill level, that movement or auto-drop is
   observed, and that no game-over text is shown. Adds the driver methods
   discoverStartCandidates, tryStartMechanism, confirmStartMechanism, and
   rejectStartMechanism.

3. CCW rotation test (bot.ts): fixed the broken sequential test, which was
   tautologically true. It now reloads the page between the rotate-key and
   Z-key presses and compares the resulting rotation states (piece shapes)
   from the same fresh baseline.

Results vs human calibration (9 games):
- V1: 56/97 = 58% agreement
- V2: 80/98 = 82% agreement

Major wins: e2e04e75 (Spanish 18% -> 85%, perfect agreement),
4949d521 (trail bug 18% -> 67%), cbbff570 (18% -> 67%),
9805c24a (80% -> 95%), 7a348b81 (correctly finds working start button).

Known regression: 8fe72fce went 44% -> 0% because bridge's strict
verification rejects start mechanisms when benign startup console
errors occur. Needs follow-up: distinguish pre-start errors from
fatal errors.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/bot.ts | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M tasks/tetris/eval/gameplay-bot-v2/driver.ts | 737 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M tasks/tetris/eval/gameplay-bot-v2/index.ts | 8 +++++++-
M tasks/tetris/eval/gameplay-bot-v2/types.ts | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1219 insertions(+), 39 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -6,7 +6,9 @@ import type { PieceType, TetrisDriver, DriverCalibration, + CalibrationDrift, GridSnapshot, + StartCandidate, TestResult, GameplayStats, GameSession, @@ -341,6 +343,7 @@ export async function runAllTests( session: GameSession; survey: SurveyData; competitivePlay: CompetitivePlayResult | null; + calibrationDrift: CalibrationDrift; }> { const gameplay: GameplayStats = { pieces_placed: 0, @@ -391,15 +394,32 @@ export async function runAllTests( testResults: failedTests, calibration: emptyCalibration(driver.getConsoleErrors()), gameplay, session, survey, competitivePlay, + calibrationDrift: driver.getCalibrationDrift(), }; } // ---- Pre-test survey ---- survey = await driver.surveyPage(); - // ---- Phase 2: Calibrate + detect start ---- + // ---- Phase 2: Discover + verify start, then calibrate ---- + // Bridge flow: try each candidate, ask verifyGameStarted() to confirm, + // commit the first verified candidate. On false positive, reload and try + // the next. If nothing verifies, tell the driver to reject and skip the + // legacy fuzzy detector (which historically clicked Pause buttons etc.). let cal: DriverCalibration; + let verified: { candidate: StartCandidate } | null = null; try { + verified = await detectStartWithVerification(driver, serverUrl); + } catch (err) { + console.log(`[bot] bridge detection threw: ${err instanceof Error ? 
err.message : String(err)}`); + } + + try { + if (verified) { + driver.confirmStartMechanism(verified.candidate); + } else { + driver.rejectStartMechanism(); + } cal = await driver.calibrate(); session.started = cal.startMechanism !== "unknown"; session.startMechanism = cal.startMechanism; @@ -538,7 +558,7 @@ export async function runAllTests( } } catch { /* continue */ } - competitivePlay = await runCompetitivePlayPhase(driver, session, gameplay); + competitivePlay = await runCompetitivePlayPhase(driver, session, gameplay, serverUrl); } else if (!session.skippedPhases.some((p) => p.startsWith("competitive:"))) { session.skippedPhases.push("competitive: gameplay failed"); } @@ -549,7 +569,8 @@ export async function runAllTests( const phaseState = { gameStarted, mechanicsWork, piecesWork, gameplayWorks }; const testResults = deriveTestResults(session, cal, loadResult, driver.getConsoleErrors(), gameplay, phaseState, competitivePlay); - return { testResults, calibration: cal, gameplay, session, survey, competitivePlay }; + const calibrationDrift = driver.getCalibrationDrift(); + return { testResults, calibration: cal, gameplay, session, survey, competitivePlay, calibrationDrift }; } // --------------------------------------------------------------------------- @@ -858,11 +879,209 @@ async function runEndurancePhase( } } +/** + * Test whether the game supports counter-clockwise rotation. + * + * The naive "press Z then press rotate, compare" approach is broken because + * rotation state is ordinal: after Z (state 0->3) then rotate (3->0), the + * piece returns to the original state regardless of direction, so the two + * intermediate snapshots always differ. That test is a tautology. + * + * Instead we run each key against a FRESH baseline game by reloading the + * page between presses. Both presses are then measured from rotation + * state 0. If the two resulting grids match, both keys rotate in the same + * direction; if they differ, they rotate opposite. 
+ * + * Returns { done: false, ccw: null } when a reliable signal is unavailable + * (e.g. no active piece, O-piece baseline, rotate key doesn't rotate at all). + */ +async function testRotationDirection( + driver: TetrisDriver, + serverUrl: string +): Promise<{ done: boolean; ccw: boolean | null }> { + // Helper: reload, start game, wait for an active piece to be visible. + // On a fresh game the settled grid is empty, so we read WITHOUT passing a + // settled grid and let readGrid use its top-6-rows fallback to detect the + // active piece. + const freshBaseline = async (): Promise<{ + piece: PieceType | null; + grid: Grid | null; + settled: Grid | null; + }> => { + try { + const load = await driver.loadPage(serverUrl); + if (!load.loaded) return { piece: null, grid: null, settled: null }; + } catch { + return { piece: null, grid: null, settled: null }; + } + try { + await driver.calibrate(); + } catch { + return { piece: null, grid: null, settled: null }; + } + + // Use an empty grid as the "settled" reference so the active piece is + // detected as the full delta (any filled cell in current == active). + // This is robust even when the game hasn't yet spawned a piece at the + // moment of the first read. + const emptySettled: Grid = Array.from({ length: GRID_ROWS }, () => + Array.from({ length: GRID_COLS }, () => false) + ); + + // Poll up to ~3s for an active piece to appear. 
+ let snap = await driver.readGrid(emptySettled); + let attempts = 0; + while ( + (!snap.activePieceCells || snap.activePieceCells.length !== 4) && + attempts < 30 + ) { + await driver.wait(100); + snap = await driver.readGrid(emptySettled); + attempts++; + } + + if (!snap.activePieceCells || snap.activePieceCells.length !== 4) { + return { piece: null, grid: snap.grid, settled: emptySettled }; + } + return { + piece: snap.activePieceType, + grid: snap.grid, + settled: emptySettled, + }; + }; + + // Extract the active piece cells from a grid (assumes the grid contains + // only the active piece, which is the case on a fresh game where the + // settled grid is empty). Returns a position-normalized shape string so + // that comparisons ignore where on the board the piece sits. + const shapeKey = (grid: Grid): string | null => { + const cells: [number, number][] = []; + for (let r = 0; r < grid.length; r++) { + for (let c = 0; c < grid[r].length; c++) { + if (grid[r][c]) cells.push([r, c]); + } + } + if (cells.length !== 4) return null; + const minR = Math.min(...cells.map(([r]) => r)); + const minC = Math.min(...cells.map(([, c]) => c)); + return cells + .map(([r, c]) => `${r - minR},${c - minC}`) + .sort() + .join("|"); + }; + + // Helper: press a key, wait briefly for the game to process it, and + // return the post-press grid if the piece's SHAPE differs from baseline. + // We compare shape (not full grid) so that piece falling during the wait + // does not confound the rotation measurement. + const measureKeyShape = async ( + pressFn: () => Promise<void>, + baseShape: string + ): Promise<string | null> => { + await pressFn(); + await driver.wait(80); + const snap = await driver.readGrid(); + if (!snap.grid) return null; + const shape = shapeKey(snap.grid); + if (!shape) return null; + if (shape === baseShape) return null; + return shape; + }; + + // Only J, L, T pieces have 4 visually-distinct rotation states. 
I, S, Z + // have only 2 (rotating CW vs CCW from state 0 produces an identical + // visual). O is rotationally symmetric. So we can only distinguish + // rotation directions using J, L, or T pieces. + const DISTINGUISHABLE: Set<PieceType> = new Set<PieceType>(["J", "L", "T"]); + const log = (msg: string) => console.log(`[ccw] ${msg}`); + + // Per-piece-type sample: the shape after pressing the calibrated rotate + // key from a fresh-spawn baseline. Keyed by piece type so Trial 2 can + // match whatever piece type its fresh reload happens to produce. + const trial1Shapes = new Map<PieceType, string>(); + + // ----- Trial 1: collect rotate-key shape samples for several piece types ----- + for (let attempt = 0; attempt < 10; attempt++) { + const b = await freshBaseline(); + if (!b.grid || !b.piece) continue; + if (!DISTINGUISHABLE.has(b.piece)) continue; + if (trial1Shapes.has(b.piece)) continue; + const baseShape = shapeKey(b.grid); + if (!baseShape) continue; + const afterShape = await measureKeyShape( + () => driver.pressKey("rotate"), + baseShape + ); + if (afterShape) { + trial1Shapes.set(b.piece, afterShape); + log(`trial1: rotate changed ${b.piece} (samples: ${trial1Shapes.size})`); + if (trial1Shapes.size >= DISTINGUISHABLE.size) break; + } else { + log(`trial1: rotate did NOT change ${b.piece}`); + } + } + if (trial1Shapes.size === 0) { + log("could not establish any Trial 1 reference direction"); + return { done: false, ccw: null }; + } + + // ----- Trial 2: press the raw "z" key from a fresh baseline whose piece + // type matches one of our Trial 1 samples, and compare the resulting + // shape to the corresponding Trial 1 shape. 
----- + for (let attempt = 0; attempt < 10; attempt++) { + const b = await freshBaseline(); + if (!b.grid || !b.piece) continue; + const rotateShape = trial1Shapes.get(b.piece); + if (!rotateShape) continue; + const baseShape = shapeKey(b.grid); + if (!baseShape) continue; + const afterShape = await measureKeyShape( + () => driver.pressRawKey("z"), + baseShape + ); + if (!afterShape) { + log(`trial2: z caused no shape change on ${b.piece} -> CCW not supported`); + return { done: true, ccw: false }; + } + const opposite = afterShape !== rotateShape; + log( + `trial2: ${b.piece} rotate=${rotateShape} z=${afterShape} opposite=${opposite}` + ); + return { done: true, ccw: opposite }; + } + + log("could not find Trial 2 baseline with matching piece"); + return { done: false, ccw: null }; +} + async function runCompetitivePlayPhase( driver: TetrisDriver, session: GameSession, - gameplay: GameplayStats + gameplay: GameplayStats, + serverUrl: string ): Promise<CompetitivePlayResult> { + // Dedicated rotation-direction test (run BEFORE the main play loop so that + // each key press is measured from a fresh baseline game state). See + // testRotationDirection() for details. + let ccwTestDone = false; + let ccwResult: boolean | null = null; + try { + const rotResult = await testRotationDirection(driver, serverUrl); + ccwTestDone = rotResult.done; + ccwResult = rotResult.ccw; + } catch { + ccwTestDone = false; + ccwResult = null; + } + + // Reload once more so competitive play starts from a clean game state. 
+ try { + await driver.loadPage(serverUrl); + await driver.calibrate(); + } catch { + /* continue: play loop will still attempt to run */ + } + const start = Date.now(); const maxDuration = 60000; @@ -914,8 +1133,6 @@ async function runCompetitivePlayPhase( let pollCount = 0; let consecutiveClears = 0; let maxCombo = 0; - let ccwTestDone = false; - let ccwResult: boolean | null = null; let softDropTestDone = false; let softDropDistinct: boolean | null = null; @@ -979,29 +1196,6 @@ async function runCompetitivePlayPhase( const pieceType = snap.activePieceType || "unknown"; session.pieceTypes.add(pieceType); - // CCW rotation test - if (!ccwTestDone && result.pieces_placed > 5 && result.pieces_placed % 7 === 0) { - const gridBeforeZ = await driver.readGrid(settledGrid); - await driver.pressRawKey("z"); - await driver.wait(60); - const gridAfterZ = await driver.readGrid(settledGrid); - - if (gridBeforeZ.grid && gridAfterZ.grid && driver.gridsAreDifferent(gridBeforeZ.grid, gridAfterZ.grid)) { - const gridBeforeUp = await driver.readGrid(settledGrid); - await driver.pressKey("rotate"); - await driver.wait(60); - const gridAfterUp = await driver.readGrid(settledGrid); - - if (gridBeforeUp.grid && gridAfterUp.grid) { - ccwResult = driver.gridsAreDifferent(gridAfterZ.grid, gridAfterUp.grid); - ccwTestDone = true; - } - } else { - ccwResult = false; - ccwTestDone = true; - } - } - // Soft drop test if (!softDropTestDone && result.pieces_placed > 3 && result.pieces_placed % 5 === 0) { const snapBeforeDown = await driver.readGrid(settledGrid); @@ -1688,3 +1882,198 @@ function deriveTestResults( return results; } + +// --------------------------------------------------------------------------- +// Start-mechanism verification bridge +// --------------------------------------------------------------------------- + +/** + * The bot's gameplay-grounded check for "did the game really start?" + * + * This is the feedback channel the driver leans on. 
The driver can only see + * pixel and DOM deltas, so it can be fooled by Pause buttons, overlays, or + * spurious animations. The bot reads the grid, presses real gameplay keys, + * and watches for tetris-like behavior. + * + * Returns true only when the evidence clearly points to a started game: + * - grid detected and populated in a sane range (pieces, not chrome) + * - ArrowLeft causes a piece-like change, OR + * - waiting ~1s causes the grid to change (auto-drop), OR + * - the grid transitions in any measurable way that's not just chrome + * - no immediate game over text + */ +async function verifyGameStarted(driver: TetrisDriver): Promise<{ + ok: boolean; + reason: string; +}> { + // 1. Instant rejection: game over text visible means we started into a + // dead state, or clicked a Restart that then immediately ended again. + try { + const gameOverText = await driver.detectGameOverText(); + if (gameOverText) { + return { ok: false, reason: `immediate game over: "${gameOverText}"` }; + } + } catch { /* continue */ } + + // 2. tryStartMechanism() populated a minimal calibration for us. If it + // couldn't find a grid, the candidate is not a real start. + let cal; + try { + cal = driver.getCalibration(); + } catch { + cal = null; + } + if (!cal || !cal.gridDetected) { + return { ok: false, reason: "no grid detected after start attempt" }; + } + + // 3. Read the grid. Need a sane fill level (pieces, not chrome). + const snap = await driver.readGrid(); + if (!snap.grid) { + return { ok: false, reason: "grid read failed" }; + } + const totalCells = snap.grid.length * (snap.grid[0]?.length || 0); + if (totalCells === 0) { + return { ok: false, reason: "grid has zero cells" }; + } + const fillRatio = snap.filledCount / totalCells; + // A running game may legitimately start empty, so 0 cells is allowed. + // But >60% filled likely means we're reading chrome as cells. 
+ if (fillRatio > 0.6) { + return { + ok: false, + reason: `grid ${Math.round(fillRatio * 100)}% filled (likely reading chrome)`, + }; + } + + // 4. Evidence: press ArrowLeft and see if the grid changes (movement works). + let movementSeen = false; + try { + const before = await driver.readGrid(); + await driver.pressKey("left"); + await driver.wait(250); + const after = await driver.readGrid(); + if (before.grid && after.grid && driver.gridsAreDifferent(before.grid, after.grid)) { + movementSeen = true; + } + } catch { /* fall through to auto-drop check */ } + + // 5. Evidence: wait 1.1s and see if the grid changes on its own (auto-drop). + let autoDropSeen = false; + try { + const before = await driver.readGrid(); + await driver.wait(1100); + const after = await driver.readGrid(); + if (before.grid && after.grid && driver.gridsAreDifferent(before.grid, after.grid)) { + autoDropSeen = true; + } + } catch { /* fall through */ } + + // 6. Second chance at game-over after interaction. + try { + const gameOverText = await driver.detectGameOverText(); + if (gameOverText) { + return { ok: false, reason: `game over after interaction: "${gameOverText}"` }; + } + } catch { /* continue */ } + + if (movementSeen && autoDropSeen) { + return { ok: true, reason: "movement and auto-drop both observed" }; + } + if (movementSeen) { + return { ok: true, reason: "movement key changes the grid" }; + } + if (autoDropSeen) { + return { ok: true, reason: "grid changes on its own (auto-drop)" }; + } + + // 7. Weaker fallback: if the grid is populated in a plausible range + // (some pieces visible somewhere) and there's no game over, accept it + // provisionally. The downstream phases will weed out dead starts. 
+ if (snap.filledCount > 0 && snap.filledCount < totalCells * 0.5) { + return { + ok: false, + reason: `grid populated (${snap.filledCount} cells) but no movement or auto-drop observed`, + }; + } + + return { ok: false, reason: "no gameplay evidence detected" }; +} + +/** + * Full discovery loop: ask the driver for candidates, try each, verify with + * verifyGameStarted(), and return the first candidate the bot trusts. Reloads + * the page between candidates so each attempt starts from a clean state. + */ +async function detectStartWithVerification( + driver: TetrisDriver, + serverUrl: string +): Promise<{ candidate: StartCandidate } | null> { + const log = (msg: string) => console.log(`[bot:start] ${msg}`); + + const candidates = await driver.discoverStartCandidates(); + log(`discovered ${candidates.length} candidate(s)`); + + for (let i = 0; i < candidates.length; i++) { + const candidate = candidates[i]; + log(`(${i + 1}/${candidates.length}) trying: ${candidate.label}`); + + // Apply without committing. + let tryResult; + try { + tryResult = await driver.tryStartMechanism(candidate); + } catch (err) { + log(` tryStartMechanism threw: ${err instanceof Error ? err.message : String(err)}`); + await reloadAndClear(driver, serverUrl); + continue; + } + + // Skip candidates with no observable effect at all. + if (!tryResult.visualChanged && !tryResult.domChanged && candidate.mechanism !== "auto") { + log(` no visual/DOM change, skipping`); + continue; + } + if (tryResult.errorOccurred) { + log(` JS error fired during attempt, skipping`); + await reloadAndClear(driver, serverUrl); + continue; + } + + // Ask the bot's own verification. + let verification; + try { + verification = await verifyGameStarted(driver); + } catch (err) { + log(` verifyGameStarted threw: ${err instanceof Error ? 
err.message : String(err)}`); + await reloadAndClear(driver, serverUrl); + continue; + } + + if (verification.ok) { + log(` VERIFIED: ${verification.reason}`); + // Important: the page is already in a started state. We clear the + // driver's cached calibration (without reloading) so the follow-up + // calibrate() call will re-apply the candidate from scratch -- that + // way the phase separation (load -> apply -> verify) stays consistent + // across downstream phases that reload the page. + await reloadAndClear(driver, serverUrl); + return { candidate }; + } + + log(` REJECTED: ${verification.reason}`); + await reloadAndClear(driver, serverUrl); + } + + log("no candidate verified"); + return null; +} + +/** Reload the page and clear any in-flight confirmed candidate. */ +async function reloadAndClear(driver: TetrisDriver, serverUrl: string): Promise<void> { + try { + driver.clearConfirmedStartMechanism(); + } catch { /* ignore */ } + try { + await driver.loadPage(serverUrl); + } catch { /* ignore */ } +} diff --git a/tasks/tetris/eval/gameplay-bot-v2/driver.ts b/tasks/tetris/eval/gameplay-bot-v2/driver.ts @@ -8,9 +8,12 @@ import type { RendererType, Controls, StartMechanism, + StartCandidate, + TryStartResult, SurveyData, PieceType, DriverCalibration, + CalibrationDrift, GridSnapshot, TetrisDriver, } from "./types"; @@ -162,12 +165,111 @@ function identifyPieceType(cells: [number, number][]): PieceType { } // --------------------------------------------------------------------------- +// Calibration cache helpers +// --------------------------------------------------------------------------- + +function cloneCalibration(cal: DriverCalibration): DriverCalibration { + const copy: DriverCalibration = { + renderer: cal.renderer, + gridDetected: cal.gridDetected, + gridBounds: cal.gridBounds ? 
{ ...cal.gridBounds } : null, + cellWidth: cal.cellWidth, + cellHeight: cal.cellHeight, + controls: { ...cal.controls }, + startMechanism: cal.startMechanism, + scoreElementSelector: cal.scoreElementSelector, + levelElementSelector: cal.levelElementSelector, + backgroundColor: cal.backgroundColor ? [...cal.backgroundColor] as [number, number, number] : null, + consoleErrors: [...cal.consoleErrors], + gridConfidence: cal.gridConfidence, + gridDetectedAt: cal.gridDetectedAt, + }; + if (cal.startButton) { + copy.startButton = { + selector: cal.startButton.selector, + text: cal.startButton.text, + disappeared: cal.startButton.disappeared, + position: { ...cal.startButton.position }, + }; + } + return copy; +} + +function gridBoundsSimilar(a: GridBounds, b: GridBounds): boolean { + // Tolerate rendering jitter but flag anything beyond ~10% size change. + const tol = Math.max(20, Math.min(a.width, b.width) * 0.15); + return ( + Math.abs(a.x - b.x) < tol && + Math.abs(a.y - b.y) < tol && + Math.abs(a.width - b.width) < tol && + Math.abs(a.height - b.height) < tol + ); +} + +/** + * Returns a list of field names that differ between the baseline calibration + * and a fresh one. Empty list means no drift detected. + */ +function diffCalibrations(baseline: DriverCalibration, fresh: DriverCalibration): string[] { + const changes: string[] = []; + + if (baseline.startMechanism !== fresh.startMechanism) { + changes.push("start_mechanism"); + } + const baseSel = baseline.startButton?.selector ?? null; + const freshSel = fresh.startButton?.selector ?? 
null; + if (baseSel !== freshSel) changes.push("start_button_selector"); + + if (baseline.renderer !== fresh.renderer) changes.push("renderer"); + + if (!!baseline.gridBounds !== !!fresh.gridBounds) { + changes.push("grid_bounds"); + } else if (baseline.gridBounds && fresh.gridBounds) { + if (!gridBoundsSimilar(baseline.gridBounds, fresh.gridBounds)) { + changes.push("grid_bounds"); + } + } + + const bc = baseline.controls; + const fc = fresh.controls; + if (bc.left !== fc.left || bc.right !== fc.right || bc.down !== fc.down || + bc.rotate !== fc.rotate || bc.drop !== fc.drop) { + changes.push("controls"); + } + + if (baseline.scoreElementSelector !== fresh.scoreElementSelector) { + changes.push("score_element"); + } + if (baseline.levelElementSelector !== fresh.levelElementSelector) { + changes.push("level_element"); + } + + return changes; +} + +// --------------------------------------------------------------------------- // PlaywrightDriver // --------------------------------------------------------------------------- export class PlaywrightDriver implements TetrisDriver { private page: Page; private cal: DriverCalibration | null = null; + // First successful calibration, used as the cache baseline across reloads. + private firstCal: DriverCalibration | null = null; + // Candidate confirmed by the bot's verification bridge. When set, calibrate() + // replays this candidate instead of rediscovering the start mechanism. + private confirmedCandidate: StartCandidate | null = null; + // Set by the bot when bridge verification definitively failed -- the legacy + // detectStartMechanism() fallback must NOT run and override the bot's verdict. + private startRejected: boolean = false; + // Cumulative drift info across the session. 
+ private drift: CalibrationDrift = { + drifted: false, + changes: [], + recalibrations: 0, + cacheHits: 0, + cacheMisses: 0, + }; private consoleErrors: string[] = []; private log = (msg: string) => console.log(`[driver] ${msg}`); @@ -314,11 +416,83 @@ export class PlaywrightDriver implements TetrisDriver { } async calibrate(): Promise<DriverCalibration> { + // Fast path: try applying the cached calibration from a prior run. + if (this.firstCal) { + this.drift.recalibrations++; + const cached = await this.applyCachedCalibration(); + if (cached) { + this.drift.cacheHits++; + this.cal = cached; + this.log( + `[cache] hit: replayed start="${cached.startMechanism}" renderer=${cached.renderer} ` + + `(hits=${this.drift.cacheHits}, misses=${this.drift.cacheMisses})` + ); + return cached; + } + this.drift.cacheMisses++; + this.log( + `[cache] miss: cached calibration no longer works, doing full recalibration ` + + `(hits=${this.drift.cacheHits}, misses=${this.drift.cacheMisses})` + ); + } + + const fresh = await this.fullCalibrate(); + this.cal = fresh; + + if (!this.firstCal) { + // First time -- freeze a copy as the baseline for drift detection. + this.firstCal = cloneCalibration(fresh); + } else { + // Not the first time: compute drift vs baseline. + const changes = diffCalibrations(this.firstCal, fresh); + if (changes.length > 0) { + this.drift.drifted = true; + for (const c of changes) { + if (!this.drift.changes.includes(c)) this.drift.changes.push(c); + } + this.log(`CONFLICT: calibration drifted: [${changes.join(", ")}]`); + } + } + + return fresh; + } + + // Runs the full (expensive) calibration flow. Does not touch firstCal/drift. 
+ private async fullCalibrate(): Promise<DriverCalibration> { await this.page.waitForTimeout(2000); - let startResult = await this.detectStartMechanism(); - let startMechanism: StartMechanism = startResult.mechanism; - let startButton = startResult.startButton; + let startMechanism: StartMechanism; + let startButton: DriverCalibration["startButton"] | undefined; + + if (this.confirmedCandidate) { + // Bot already verified the start. Replay it instead of rediscovering. + this.log( + `[bridge] replaying confirmed candidate: ${this.confirmedCandidate.label}` + ); + const applied = await this.applyCandidate(this.confirmedCandidate); + startMechanism = applied.ok ? this.confirmedCandidate.mechanism : "unknown"; + if (applied.ok && (this.confirmedCandidate.mechanism === "button" || this.confirmedCandidate.mechanism === "click_canvas")) { + startButton = { + selector: this.confirmedCandidate.selector ?? "canvas", + text: this.confirmedCandidate.text ?? this.confirmedCandidate.label, + disappeared: false, + position: this.confirmedCandidate.position ?? { x: 0, y: 0 }, + }; + } + await this.page.waitForTimeout(this.confirmedCandidate.waitMs ?? 400); + } else if (this.startRejected) { + // Bot's bridge verification rejected every candidate. Do NOT run the + // legacy fallback; it has historically produced false positives + // (e.g. clicking Pause) that the bridge was designed to prevent. + this.log(`[bridge] start rejected by bot; skipping legacy detection`); + startMechanism = "unknown"; + startButton = undefined; + } else { + const startResult = await this.detectStartMechanism(); + startMechanism = startResult.mechanism; + startButton = startResult.startButton; + } + let gridDetection = await this.detectGrid(); let { renderer, gridBounds, cellWidth, cellHeight } = gridDetection; let backgroundColor = @@ -326,8 +500,8 @@ export class PlaywrightDriver implements TetrisDriver { ? 
await this.sampleBackgroundColor(gridBounds, cellWidth, cellHeight) : null; - // Re-calibration fallback - if (startMechanism === "unknown" || gridBounds === null) { + // Re-calibration fallback (skipped when bot already confirmed or rejected the start). + if (!this.confirmedCandidate && !this.startRejected && (startMechanism === "unknown" || gridBounds === null)) { const retry = await this.recalibrateWithRetry(startMechanism, gridBounds); if (retry.startMechanism !== "unknown") startMechanism = retry.startMechanism; if (retry.startButton) startButton = retry.startButton; @@ -354,7 +528,7 @@ export class PlaywrightDriver implements TetrisDriver { gridDetectedAt: "initial", }); - this.cal = { + const cal: DriverCalibration = { renderer, gridDetected: gridBounds !== null, gridBounds, @@ -370,11 +544,157 @@ export class PlaywrightDriver implements TetrisDriver { gridDetectedAt: "initial", }; - if (startButton) { - this.cal.startButton = startButton; + if (startButton) cal.startButton = startButton; + return cal; + } + + /** + * Attempt to replay the cached calibration on the current page. + * Returns a completed DriverCalibration on success, null on failure. + * On success, the game should be started and the grid detected. + */ + private async applyCachedCalibration(): Promise<DriverCalibration | null> { + const base = this.firstCal; + if (!base) return null; + + try { + // Small settle delay -- a freshly-loaded page may still be booting. + await this.page.waitForTimeout(800); + + // Step 1: re-apply the cached start mechanism. + const started = await this.replayStartMechanism(base); + if (!started) { + this.log( + `CONFLICT: cached start mechanism '${base.startMechanism}` + + (base.startButton ? ` ${base.startButton.selector}` : "") + + `' no longer works` + ); + return null; + } + + // Step 2: verify the grid is back (same renderer, similar bounds). 
+ await this.page.waitForTimeout(300); + const grid = await this.detectGrid(); + if (!grid.gridBounds) { + this.log("CONFLICT: cached start worked but no grid detected"); + return null; + } + if (base.gridBounds && !gridBoundsSimilar(base.gridBounds, grid.gridBounds)) { + this.log( + `CONFLICT: grid bounds changed significantly ` + + `(was ${JSON.stringify(base.gridBounds)}, now ${JSON.stringify(grid.gridBounds)})` + ); + return null; + } + if (base.renderer !== "unknown" && grid.renderer !== base.renderer) { + this.log(`CONFLICT: renderer changed from ${base.renderer} to ${grid.renderer}`); + return null; + } + + const backgroundColor = + grid.renderer === "canvas" && grid.gridBounds + ? await this.sampleBackgroundColor(grid.gridBounds, grid.cellWidth, grid.cellHeight) + : base.backgroundColor; + + const cal: DriverCalibration = { + renderer: grid.renderer, + gridDetected: true, + gridBounds: grid.gridBounds, + cellWidth: grid.cellWidth, + cellHeight: grid.cellHeight, + controls: { ...base.controls }, + startMechanism: base.startMechanism, + scoreElementSelector: base.scoreElementSelector, + levelElementSelector: base.levelElementSelector, + backgroundColor, + consoleErrors: [...this.consoleErrors], + gridConfidence: base.gridConfidence, + gridDetectedAt: "initial", + fromCache: true, + }; + if (base.startButton) cal.startButton = { ...base.startButton }; + return cal; + } catch (err) { + this.log( + `[cache] replay threw: ${err instanceof Error ? err.message : String(err)}` + ); + return null; } + } - return this.cal; + /** + * Perform the cached start action. Returns true if a visual change occurred. + */ + private async replayStartMechanism(base: DriverCalibration): Promise<boolean> { + try { + const before = await this.page.screenshot(); + + switch (base.startMechanism) { + case "auto": + // Nothing to replay -- game should already be running. 
+ await this.page.waitForTimeout(400); + break; + case "enter": + await this.page.keyboard.press("Enter"); + break; + case "space": + await this.page.keyboard.press("Space"); + break; + case "anykey": + await this.page.keyboard.press("ArrowDown"); + break; + case "click_canvas": { + const pos = base.startButton?.position; + if (pos) { + await this.page.mouse.click(pos.x, pos.y); + } else { + const canvas = this.page.locator("canvas").first(); + if ((await canvas.count()) > 0) await canvas.click(); + else return false; + } + break; + } + case "button": { + // Prefer the cached selector; fall back to coordinate click. + let clicked = false; + const sel = base.startButton?.selector; + if (sel) { + try { + const locator = this.page.locator(sel).first(); + const count = await locator.count(); + if (count > 0) { + await locator.click({ timeout: 2000 }); + clicked = true; + } + } catch { /* fall through to coordinate click */ } + } + if (!clicked && base.startButton?.position) { + const pos = base.startButton.position; + await this.page.mouse.click(pos.x, pos.y); + clicked = true; + } + if (!clicked) return false; + break; + } + default: + return false; + } + + await this.page.waitForTimeout(500); + + // For auto-start, we already have no input -- just verify something changed + // relative to the blank/initial page state. + if (base.startMechanism === "auto") { + const after = await this.page.screenshot(); + return !before.equals(after); + } + + // For the other mechanisms, a visual change after the action is the signal. 
+ const result = await this.detectVisualChange({ frames: 3, intervalMs: 100, before }); + return result.changed; + } catch { + return false; + } } async recalibrate(): Promise<DriverCalibration> { @@ -410,6 +730,405 @@ export class PlaywrightDriver implements TetrisDriver { return this.cal; } + getCalibrationDrift(): CalibrationDrift { + return { + drifted: this.drift.drifted, + changes: [...this.drift.changes], + recalibrations: this.drift.recalibrations, + cacheHits: this.drift.cacheHits, + cacheMisses: this.drift.cacheMisses, + }; + } + + // -- Start-mechanism verification bridge -- + // + // The bot drives start detection explicitly via this trio: + // 1. discoverStartCandidates() -- returns ordered list + // 2. tryStartMechanism(candidate) -- applies one, reports deltas + // 3. confirmStartMechanism(candidate) -- commits after bot verification + // + // Unlike detectStartMechanism(), tryStartMechanism() does NOT judge the + // outcome. It only reports observable deltas so the bot can run its own + // gameplay-based checks before committing. + + async discoverStartCandidates(): Promise<StartCandidate[]> { + const candidates: StartCandidate[] = []; + + // 1. Auto-start: no action, just wait briefly. + candidates.push({ + mechanism: "auto", + label: "auto-start (wait 1.2s)", + waitMs: 1200, + }); + + // 2. DOM buttons, sorted by prominence (start-ish first, disabled/pause-ish last). + try { + const buttons = await this.collectButtonCandidates(); + for (const b of buttons) { + // Skip disabled buttons -- they cannot start a game. + if (b.disabled) continue; + candidates.push({ + mechanism: "button", + label: `button "${b.text || b.selector}"`, + selector: b.selector, + text: b.text, + position: { x: b.x, y: b.y }, + }); + } + } catch { /* no buttons */ } + + // 3. Keyboard triggers. 
+ candidates.push({ mechanism: "enter", label: "key Enter", key: "Enter" }); + candidates.push({ mechanism: "space", label: "key Space", key: "Space" }); + candidates.push({ mechanism: "anykey", label: "key ArrowDown", key: "ArrowDown" }); + + // 4. Canvas clicks (if a canvas exists). + try { + const canvas = this.page.locator("canvas").first(); + if ((await canvas.count()) > 0) { + const box = await canvas.boundingBox(); + if (box) { + const cx = box.x + box.width / 2; + const cy = box.y + box.height / 2; + candidates.push({ + mechanism: "click_canvas", + label: "canvas click center", + position: { x: Math.round(cx), y: Math.round(cy) }, + }); + candidates.push({ + mechanism: "click_canvas", + label: "canvas click upper", + position: { x: Math.round(cx), y: Math.round(box.y + box.height * 0.25) }, + }); + candidates.push({ + mechanism: "click_canvas", + label: "canvas click lower", + position: { x: Math.round(cx), y: Math.round(box.y + box.height * 0.75) }, + }); + } + } + } catch { /* no canvas */ } + + return candidates; + } + + async tryStartMechanism(candidate: StartCandidate): Promise<TryStartResult> { + const errorsBefore = this.consoleErrors.length; + + let before: Buffer | null = null; + let domBefore = ""; + let clickableBefore = 0; + try { + before = await this.page.screenshot(); + const snap = await this.snapshotDomState(); + domBefore = snap.domKey; + clickableBefore = snap.clickableCount; + } catch { /* screenshot can fail on teardown */ } + + let applied = { ok: false }; + try { + applied = await this.applyCandidate(candidate); + } catch { /* treat as not applied */ } + + if (!applied.ok) { + return { + visualChanged: false, + domChanged: false, + errorOccurred: this.consoleErrors.length > errorsBefore, + newClickableElements: 0, + removedElements: 0, + }; + } + + // Give the game a moment to react. + await this.page.waitForTimeout(candidate.waitMs ?? 
300); + + let visualChanged = false; + let domChanged = false; + let newClickableElements = 0; + let removedElements = 0; + + try { + if (before) { + const after = await this.page.screenshot(); + visualChanged = !before.equals(after); + } + const snap = await this.snapshotDomState(); + domChanged = snap.domKey !== domBefore; + const delta = snap.clickableCount - clickableBefore; + if (delta > 0) newClickableElements = delta; + else if (delta < 0) removedElements = -delta; + } catch { /* report what we have */ } + + // Populate a minimal calibration so verifyGameStarted can call readGrid(). + // The bot may reject this candidate, in which case clearConfirmedStartMechanism() + // will wipe this.cal along with the rest of the bridge state. + try { + const gridDetection = await this.detectGrid(); + if (gridDetection.gridBounds) { + const backgroundColor = + gridDetection.renderer === "canvas" + ? await this.sampleBackgroundColor( + gridDetection.gridBounds, + gridDetection.cellWidth, + gridDetection.cellHeight + ) + : null; + this.cal = { + renderer: gridDetection.renderer, + gridDetected: true, + gridBounds: gridDetection.gridBounds, + cellWidth: gridDetection.cellWidth, + cellHeight: gridDetection.cellHeight, + controls: { ...DEFAULT_CONTROLS }, + startMechanism: candidate.mechanism, + scoreElementSelector: null, + levelElementSelector: null, + backgroundColor, + consoleErrors: [...this.consoleErrors], + gridConfidence: 0, + gridDetectedAt: "after_start", + }; + } + } catch { /* no grid detected yet */ } + + return { + visualChanged, + domChanged, + errorOccurred: this.consoleErrors.length > errorsBefore, + newClickableElements, + removedElements, + }; + } + + confirmStartMechanism(candidate: StartCandidate): void { + this.confirmedCandidate = candidate; + this.startRejected = false; + this.log(`[bridge] confirmed start candidate: ${candidate.label}`); + } + + clearConfirmedStartMechanism(): void { + if (this.confirmedCandidate) { + this.log(`[bridge] cleared 
confirmed start candidate`); + } + this.confirmedCandidate = null; + // Drop cached calibrations so a reload starts fresh. + this.firstCal = null; + this.cal = null; + } + + rejectStartMechanism(): void { + this.startRejected = true; + this.confirmedCandidate = null; + // Drop cached calibrations; subsequent calibrate() calls must run fresh + // but MUST NOT attempt any start detection. + this.firstCal = null; + this.cal = null; + this.log(`[bridge] start mechanism rejected by bot`); + } + + /** Shared helper: apply a candidate without judging the outcome. */ + private async applyCandidate(candidate: StartCandidate): Promise<{ ok: boolean }> { + try { + switch (candidate.mechanism) { + case "auto": + // Nothing to click/press; the wait happens in tryStartMechanism. + return { ok: true }; + case "enter": + case "space": + case "anykey": { + const key = candidate.key + ?? (candidate.mechanism === "enter" ? "Enter" + : candidate.mechanism === "space" ? "Space" + : "ArrowDown"); + await this.page.keyboard.press(key); + return { ok: true }; + } + case "button": { + const sel = candidate.selector; + let clicked = false; + if (sel) { + try { + const locator = this.page.locator(sel).first(); + if ((await locator.count()) > 0) { + await locator.click({ timeout: 2000 }); + clicked = true; + } + } catch { /* fall through */ } + } + if (!clicked && candidate.position) { + await this.page.mouse.click(candidate.position.x, candidate.position.y); + clicked = true; + } + return { ok: clicked }; + } + case "click_canvas": { + if (candidate.position) { + await this.page.mouse.click(candidate.position.x, candidate.position.y); + return { ok: true }; + } + const canvas = this.page.locator("canvas").first(); + if ((await canvas.count()) > 0) { + await canvas.click(); + return { ok: true }; + } + return { ok: false }; + } + default: + return { ok: false }; + } + } catch { + return { ok: false }; + } + } + + /** + * DOM snapshot used to cheaply detect whether tryStartMechanism() caused + 
* meaningful structural changes on the page. + */ + private async snapshotDomState(): Promise<{ domKey: string; clickableCount: number }> { + try { + return await this.page.evaluate(() => { + const clickableSelector = + 'button, a, [role="button"], [onclick], input[type="button"], input[type="submit"]'; + const clickable = document.querySelectorAll(clickableSelector); + const clickableCount = clickable.length; + + // Compact key describing the interactive skeleton. + const parts: string[] = []; + clickable.forEach((el, i) => { + if (i > 40) return; + const rect = (el as HTMLElement).getBoundingClientRect(); + parts.push( + `${el.tagName.toLowerCase()}:${(el.textContent || "").trim().slice(0, 20)}:${Math.round(rect.width)}x${Math.round(rect.height)}` + ); + }); + const canvasCount = document.querySelectorAll("canvas").length; + parts.push(`canvas=${canvasCount}`); + + // Also include a short excerpt of body text so things like "Paused" + // toggling to "Game Over" register as changes. + const bodyText = (document.body?.innerText || "") + .replace(/\s+/g, " ") + .trim() + .slice(0, 300); + parts.push(`body=${bodyText}`); + + return { domKey: parts.join("|"), clickableCount }; + }); + } catch { + return { domKey: "", clickableCount: 0 }; + } + } + + /** + * Return clickable elements sorted by prominence. Used by + * discoverStartCandidates(). Boosts "start"-like labels and demotes + * "pause"-like labels. 
+ */ + private async collectButtonCandidates(): Promise<Array<{ + text: string; selector: string; x: number; y: number; disabled: boolean; + }>> { + return await this.page.evaluate(() => { + const seen = new Set<Element>(); + const results: Array<{ + index: number; text: string; x: number; y: number; + width: number; height: number; area: number; centerDist: number; + selector: string; hasBackground: boolean; disabled: boolean; + }> = []; + + const clickableSelector = + 'button, a, [role="button"], [onclick], input[type="button"], input[type="submit"]'; + for (const el of document.querySelectorAll(clickableSelector)) { + if (!seen.has(el)) seen.add(el); + } + + const allEls = document.querySelectorAll("*"); + for (const el of allEls) { + if (seen.has(el)) continue; + try { + const style = window.getComputedStyle(el); + if (style.cursor === "pointer") seen.add(el); + } catch { /* skip */ } + } + + const pageW = window.innerWidth; + const pageH = window.innerHeight; + const pageCenterX = pageW / 2; + const pageCenterY = pageH / 2; + + let idx = 0; + for (const el of seen) { + const rect = el.getBoundingClientRect(); + if (rect.width < 5 || rect.height < 5) continue; + // Skip elements that are far outside the document (negative coords or + // > 3x the viewport) but allow buttons that are below the fold -- we + // use locator.click() which scrolls them into view. 
+ if (rect.top < -200 || rect.left < -200) continue; + if (rect.top > pageH * 3 || rect.left > pageW * 3) continue; + if (rect.width > pageW * 0.8 && rect.height > pageH * 0.8) continue; + + const cx = rect.left + rect.width / 2; + const cy = rect.top + rect.height / 2; + const centerDist = Math.sqrt((cx - pageCenterX) ** 2 + (cy - pageCenterY) ** 2); + + let hasBackground = false; + try { + const style = window.getComputedStyle(el as HTMLElement); + const bg = style.backgroundColor; + if (bg && bg !== "transparent" && bg !== "rgba(0, 0, 0, 0)") hasBackground = true; + } catch { /* skip */ } + + // Check for a stable id or class selector; disabled buttons are still + // surfaced so the bot can verify that clicking them doesn't start + // the game (fail fast on false positives). + const disabled = (el as HTMLInputElement).disabled === true + || el.getAttribute("aria-disabled") === "true"; + + let selector = ""; + if (el.id) selector = `#${el.id}`; + else if ((el as HTMLElement).className) { + const cls = (el as HTMLElement).className.toString().split(" ")[0]; + if (cls) selector = `${el.tagName.toLowerCase()}.${cls}`; + } + if (!selector) selector = `${el.tagName.toLowerCase()}:nth-of-type(${idx + 1})`; + + results.push({ + index: idx, text: (el.textContent || "").trim().slice(0, 50), + x: Math.round(cx), y: Math.round(cy), + width: rect.width, height: rect.height, + area: rect.width * rect.height, centerDist, selector, hasBackground, + disabled, + }); + idx++; + } + + // Sort: prefer "start"-like labels first, "pause"/"restart"-like last, + // disabled elements demoted, then prominence. 
+ const isStartish = (text: string): number => { + const t = text.toLowerCase(); + if (/\bstart\b|\bplay\b|\bbegin\b|\bgo\b|\binicio\b|\bjugar\b|\bempezar\b|\bcomenzar\b|\bnueva\b|new game/.test(t)) return 0; + if (/\brestart\b|\breset\b|\bplay again\b/.test(t)) return 2; + if (/\bpause\b|\bstop\b|\bquit\b|\bexit\b|\bpausa\b|\bsalir\b/.test(t)) return 3; + return 1; + }; + + results.sort((a, b) => { + const ai = isStartish(a.text); + const bi = isStartish(b.text); + if (ai !== bi) return ai - bi; + if (a.disabled !== b.disabled) return a.disabled ? 1 : -1; + if (a.hasBackground !== b.hasBackground) return a.hasBackground ? -1 : 1; + if (Math.abs(b.area - a.area) > 100) return b.area - a.area; + return a.centerDist - b.centerDist; + }); + + return results.map(r => ({ + text: r.text, selector: r.selector, x: r.x, y: r.y, disabled: r.disabled, + })); + }); + } + // -- Grid Reading -- async readGrid(settledGrid?: Grid | null): Promise<GridSnapshot> { diff --git a/tasks/tetris/eval/gameplay-bot-v2/index.ts b/tasks/tetris/eval/gameplay-bot-v2/index.ts @@ -106,7 +106,7 @@ test.describe("Tetris Gameplay Bot v2", () => { const driver = new PlaywrightDriver(page); // Create the Bot (which gets the Driver) and run everything - const { testResults, calibration, gameplay, session, survey, competitivePlay } = + const { testResults, calibration, gameplay, session, survey, competitivePlay, calibrationDrift } = await runAllTests(driver, serverUrl); // Accessibility check @@ -190,6 +190,7 @@ test.describe("Tetris Gameplay Bot v2", () => { issue_count: a11yIssues.length, pass: a11yIssues.length === 0, }, + calibration_drift: calibrationDrift, }; // Write report to file @@ -231,6 +232,11 @@ test.describe("Tetris Gameplay Bot v2", () => { console.log(` Bugs: [${competitivePlay.bugs_detected.join(", ")}]`); } } + console.log( + `\nCalibration cache: ${calibrationDrift.cacheHits} hits / ${calibrationDrift.cacheMisses} misses ` + + `(${calibrationDrift.recalibrations} recalibrations)` + + 
(calibrationDrift.drifted ? ` -- DRIFTED: [${calibrationDrift.changes.join(", ")}]` : "") + ); console.log(`\nSurvey: canvas=${survey.has_canvas}, dom_grid=${survey.has_dom_grid}, overlay=${survey.has_overlay}, clickable=${survey.clickable_elements}`); console.log(`Report written to: ${reportPath}`); console.log("==============================\n"); diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts @@ -31,6 +31,42 @@ export type StartMechanism = | "anykey" | "unknown"; +/** + * A candidate start mechanism discovered by the driver and verified by the bot. + * The bot iterates candidates, asks the driver to try each, then decides + * whether the result actually represents a started Tetris game. + */ +export interface StartCandidate { + /** Which mechanism type this candidate represents. */ + mechanism: StartMechanism; + /** Human-readable label for logs. */ + label: string; + /** CSS selector for buttons. */ + selector?: string; + /** Visible text for buttons. */ + text?: string; + /** Key to press for keyboard triggers. */ + key?: string; + /** Pixel position for clicks. */ + position?: { x: number; y: number }; + /** Milliseconds to wait before measuring (for auto-start). */ + waitMs?: number; +} + +/** What happened when a start mechanism was applied, without committing. */ +export interface TryStartResult { + /** Did the screenshot pixels change? */ + visualChanged: boolean; + /** Did the DOM snapshot change? */ + domChanged: boolean; + /** Was there a JS error during the attempt? */ + errorOccurred: boolean; + /** Clickable elements that appeared after applying. */ + newClickableElements: number; + /** Clickable elements that disappeared after applying. */ + removedElements: number; +} + /** Standard Tetris piece types. 
*/ export type PieceType = "I" | "O" | "T" | "S" | "Z" | "J" | "L" | "unknown"; @@ -64,6 +100,16 @@ export interface DriverCalibration { disappeared: boolean; position: { x: number; y: number }; }; + fromCache?: boolean; +} + +/** Summary of how much the latest calibration differs from the first one. */ +export interface CalibrationDrift { + drifted: boolean; + changes: string[]; + recalibrations: number; + cacheHits: number; + cacheMisses: number; } /** Grid snapshot: the grid state plus derived information the bot needs. */ @@ -93,6 +139,25 @@ export interface TetrisDriver { recalibrate(): Promise<DriverCalibration>; getCalibration(): DriverCalibration; + // -- Start mechanism discovery/verification bridge -- + /** Return candidate start mechanisms in priority order. Does not apply them. */ + discoverStartCandidates(): Promise<StartCandidate[]>; + /** Apply a candidate and report observable deltas. Does NOT commit. */ + tryStartMechanism(candidate: StartCandidate): Promise<TryStartResult>; + /** Commit a verified start mechanism so subsequent calibrations reuse it. */ + confirmStartMechanism( + candidate: StartCandidate + ): void; + /** Forget the confirmed mechanism (e.g. after reloading to try a different candidate). */ + clearConfirmedStartMechanism(): void; + /** + * Tell the driver the bot's bridge verification rejected every candidate. + * This prevents calibrate() from running the legacy fallback detector, + * which historically produced false positives like clicking Pause. + */ + rejectStartMechanism(): void; + getCalibrationDrift(): CalibrationDrift; + // -- Grid Reading -- readGrid(settledGrid?: Grid | null): Promise<GridSnapshot>; gridsAreDifferent(a: Grid | null, b: Grid | null): boolean; @@ -230,4 +295,5 @@ export interface BotReport { issue_count: number; pass: boolean; }; + calibration_drift?: CalibrationDrift; }

Impressum · Datenschutz