loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 13710ed75e85f22717f6463b1b3bc0b822a0a66b
parent d1b5c77738368fcf645c325b912383d5c69f22ed
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat, 11 Apr 2026 09:59:29 +0200

V2: partial landmarks work (agent hit limit)

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/types.ts | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+), 0 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/types.ts b/tasks/tetris/eval/gameplay-bot-v2/types.ts
@@ -123,6 +123,21 @@ export interface SurveyData {
   clickable_elements: number;
 }
 
+/**
+ * Landmarks detected on the loaded page to determine whether a game has
+ * actually rendered (as opposed to a blank 200 OK or an empty DOM). Used by
+ * the game_loads test. A game is considered "loaded" if the body has content
+ * and at least one game-shaped element is present.
+ */
+export interface GameLandmarks {
+  bodyHasContent: boolean;
+  hasCanvas: boolean;
+  hasDomGrid: boolean;
+  hasTetrisRatioElement: boolean;
+  hasManyCellsContainer: boolean;
+  landmarksFound: string[];
+}
+
 /** Configuration returned by calibration. */
 export interface DriverCalibration {
   renderer: RendererType;
@@ -181,6 +196,13 @@ export interface TetrisDriver {
   // -- Lifecycle --
   loadPage(url: string): Promise<{ loaded: boolean; detail: string; errorsOnLoad: number }>;
   surveyPage(): Promise<SurveyData>;
+  /**
+   * Detect high-level "game is present" landmarks on the loaded page. Used by
+   * the game_loads test to decide pass/fail based on what the user would see
+   * (canvas, DOM grid, tetris-ratio element, or a many-cells container) rather
+   * than on whether load-time console errors occurred.
+   */
+  detectGameLandmarks(): Promise<GameLandmarks>;
   calibrate(): Promise<DriverCalibration>;
   recalibrate(): Promise<DriverCalibration>;
   /**
@@ -342,6 +364,12 @@ export interface GameSession {
    * that multiple piece types can each rotate through 2+ distinct shapes.
    */
   rotationShapesByPiece: Map<string, Set<string>>;
+  /**
+   * Landmarks detected immediately after the initial page load. Used by the
+   * game_loads test to decide whether a game rendered at all, independent of
+   * any console errors emitted during load.
+   */
+  gameLoadLandmarks?: GameLandmarks;
 }
 
 /** An event observed during continuous grid scanning. */

Impressum · Datenschutz