loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 58c112b941608fed3c655c638c0e4c17daa5bb19
parent 67bd49c6e259f78aade3caeae40c3418dedf8071
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu,  9 Apr 2026 20:15:12 +0200

Add test #25 rendering_clean, update calibration data

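New test detects rendering trail bugs, where a falling piece leaves its
old positions colored. During competitive play it samples the filled-cell
count every ~10 pieces and flags a trail bug when the maximum sample
exceeds 8x the pieces placed, or exceeds 6x while 3+ samples never decrease.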

Updated calibration: 1d08ee76 (broken rotation, no soft drop),
4949d521 (trail rendering bug, lines never clear).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mtasks/tetris/eval/gameplay-bot/calibration/1d08ee76.json | 39+++++++++++++++++++--------------------
Mtasks/tetris/eval/gameplay-bot/calibration/4949d521.json | 37++++++++++++++++++-------------------
Mtasks/tetris/eval/gameplay-bot/tests.ts | 57+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtasks/tetris/eval/gameplay-bot/types.ts | 1+
4 files changed, 95 insertions(+), 39 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibration/1d08ee76.json b/tasks/tetris/eval/gameplay-bot/calibration/1d08ee76.json @@ -2,32 +2,32 @@ "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=en_lang=js_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run3", "short_id": "1d08ee76", "label": "DOM game (haiku-4.5, en)", - "notes": "", - "human_tested_at": "", + "notes": "Super weird play. L-shape rotation is broken: flips and rotates simultaneously (wrong rotation matrix). Down arrow is instant drop (no soft drop, acts as hard drop). Scores only increase on line clear, not on drops. Multi-line clear works.", + "human_tested_at": "2026-04-09", "human_tests": { - "game_loads": null, - "game_starts": null, - "auto_drop": null, - "move_left": null, - "move_right": null, - "move_down": null, + "game_loads": true, + "game_starts": true, + "auto_drop": true, + "move_left": true, + "move_right": true, + "move_down": false, "rotate": null, - "hard_drop": null, + "hard_drop": true, "all_pieces_rotate": null, - "piece_locks": null, - "new_piece_spawns": null, - "multiple_pieces": null, - "line_clear": null, - "score_changes": null, - "game_over": null, - "playable_30s": null, - "multi_line_clear": null, + "piece_locks": true, + "new_piece_spawns": true, + "multiple_pieces": true, + "line_clear": true, + "score_changes": true, + "game_over": true, + "playable_30s": true, + "multi_line_clear": true, "score_scaling": null, "level_progression": null, "speed_progression": null, "next_piece_preview": null, "game_over_display": null, "counter_clockwise_rotation": null, - "soft_drop_distinct": null + "soft_drop_distinct": false } -} -\ No newline at end of file +} diff --git a/tasks/tetris/eval/gameplay-bot/calibration/4949d521.json b/tasks/tetris/eval/gameplay-bot/calibration/4949d521.json @@ -2,25 +2,25 @@ "run_id": 
"tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=en_lang=ts_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=off_tgrep=on_tread=on_twrite=on_web=on_run1", "short_id": "4949d521", "label": "DOM game (haiku-4.5, en)", - "notes": "", - "human_tested_at": "", + "notes": "Rendering bug: falling piece leaves a trail behind it (previous position never repainted to black). Lines never clear despite filling a row. Score does increase though. Very confusing visually.", + "human_tested_at": "2026-04-09", "human_tests": { - "game_loads": null, - "game_starts": null, - "auto_drop": null, - "move_left": null, - "move_right": null, - "move_down": null, - "rotate": null, - "hard_drop": null, + "game_loads": true, + "game_starts": true, + "auto_drop": true, + "move_left": true, + "move_right": true, + "move_down": true, + "rotate": true, + "hard_drop": true, "all_pieces_rotate": null, - "piece_locks": null, - "new_piece_spawns": null, - "multiple_pieces": null, - "line_clear": null, - "score_changes": null, - "game_over": null, - "playable_30s": null, + "piece_locks": true, + "new_piece_spawns": true, + "multiple_pieces": true, + "line_clear": false, + "score_changes": true, + "game_over": true, + "playable_30s": true, "multi_line_clear": null, "score_scaling": null, "level_progression": null, @@ -30,4 +30,4 @@ "counter_clockwise_rotation": null, "soft_drop_distinct": null } -} -\ No newline at end of file +} diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -768,6 +768,10 @@ async function runCompetitivePlayPhase( let softDropTestDone = false; let softDropDistinct: boolean | null = null; + // Rendering trail detection: track filled cell growth vs pieces placed + let filledCellSamples: number[] = []; + let trailCheckPieceMark = 0; + while (Date.now() - start < maxDuration) { try { const grid = await readGrid(page, cal); @@ -882,6 +886,15 @@ async function 
runCompetitivePlayPhase( } } + // Rendering trail sampling: every ~10 pieces, snapshot filled count + if (result.pieces_placed > 0 && result.pieces_placed % 10 === 0 && result.pieces_placed !== trailCheckPieceMark) { + trailCheckPieceMark = result.pieces_placed; + const sampleGrid = await readGrid(page, cal); + if (sampleGrid) { + filledCellSamples.push(countFilled(sampleGrid)); + } + } + // Execute the AI placement await page.keyboard.press(cal.controls.drop); await page.waitForTimeout(100); @@ -991,6 +1004,33 @@ async function runCompetitivePlayPhase( } } + // Rendering trail detection: if filled cells grow much faster than pieces placed, + // the renderer is leaving trails (old piece positions not cleared) + if (result.pieces_placed >= 10 && filledCellSamples.length >= 2) { + // In a normal game, filled cells = locked cells - cleared cells. + // Each piece adds 4 cells, each line clear removes 10. + // With trails, filled cells grow unchecked because old positions stay colored. + // Heuristic: if filled count exceeds pieces_placed * 8, trails are likely. + // (Normal max without any clears would be pieces * 4; * 8 gives 2x headroom.) 
+ const maxFilled = Math.max(...filledCellSamples); + if (maxFilled > result.pieces_placed * 8) { + result.rendering_trail_detected = true; + result.bugs_detected.push("rendering_trail"); + } else { + // Also check if filled cells only ever increase across samples (never decrease + // from line clears) AND the latest sample is unreasonably high + const onlyIncreasing = filledCellSamples.every((v, i) => + i === 0 || v >= filledCellSamples[i - 1] + ); + if (onlyIncreasing && filledCellSamples.length >= 3 && maxFilled > result.pieces_placed * 6) { + result.rendering_trail_detected = true; + result.bugs_detected.push("rendering_trail"); + } else { + result.rendering_trail_detected = false; + } + } + } + // Store CCW and soft drop results for test derivation (result as any)._ccwResult = ccwResult; (result as any)._ccwTestDone = ccwTestDone; @@ -1035,6 +1075,8 @@ const ALL_TEST_NAMES = [ "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct", + // Phase 8 continued: rendering quality (test 25) + "rendering_clean", ]; interface PhaseState { @@ -1523,6 +1565,21 @@ function deriveTestResults( } } + // 25. rendering_clean + if (!phaseState.gameplayWorks || !competitivePlay) { + results.push(skipResult("rendering_clean", "competitive play phase did not run")); + } else if (competitivePlay.rendering_trail_detected === undefined) { + results.push(skipResult("rendering_clean", "not enough data to assess rendering trails")); + } else { + results.push({ + name: "rendering_clean", + pass: !competitivePlay.rendering_trail_detected, + detail: competitivePlay.rendering_trail_detected + ? 
"rendering trail bug: falling piece leaves old cells colored after moving" + : "piece movement clears old cells correctly", + }); + } + return results; } diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts @@ -63,6 +63,7 @@ export interface CompetitivePlayResult { next_piece_visible: boolean; speed_increased: boolean; bugs_detected: string[]; + rendering_trail_detected?: boolean; } /** Result of the calibration phase. */

Impressum · Datenschutz