loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 669aa68861617a446e5ae56aea0a462995d183f0
parent 71f0c4b7931a2d58ae370587a6a0f62ba352b716
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat, 11 Apr 2026 07:30:25 +0200

V2: game_over_display test passes on overlay OR restart presence

Previous check required BOTH a modal and a restart button. Now accepts
either signal because different games show game-over UI differently
(some have a full modal, some just show a restart-button overlay).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot-v2/bot.ts | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot-v2/bot.ts b/tasks/tetris/eval/gameplay-bot-v2/bot.ts @@ -1873,30 +1873,36 @@ function deriveTestResults( } // 22. game_over_display - // Prefer the Phase 6 capture (taken immediately after game over was - // triggered) because Phase 8's competitive-play check happens after a fresh - // page reload and rarely sees a real game-over screen. Fall back to the - // Phase 8 reading only if Phase 6 never reached game over. + // Verifies the game SHOWS a game-over UI after the internal game over + // state is reached. Two structural signals (no language matching): + // 1. A modal or overlay element appeared (position fixed/absolute, + // covering >15% viewport, visible). Captured in Phase 6 after + // game over was triggered via grid stacking. + // 2. A clickable element (restart) is visible inside that overlay. + // The test passes if EITHER a modal is present OR a restart option is + // visible, because different games show different UI styles. if (!phaseState.gameplayWorks || !competitivePlay) { results.push(skipResult("game_over_display", "competitive play phase did not run")); } else if (!competitivePlay.game_over_reached && !session.gameOverDetected) { results.push(skipResult("game_over_display", "game over not reached during play")); } else { const usePhase6 = session.gameOverDetected; - const text = usePhase6 - ? session.gameOverText ?? null - : competitivePlay.game_over_text_found; + const modalFound = usePhase6 + ? (session.gameOverText !== null && session.gameOverText !== undefined) + : (competitivePlay.game_over_text_found !== null); const hasRestart = usePhase6 ? session.gameOverRestartAvailable === true : competitivePlay.restart_available; - const hasText = text !== null && text !== undefined; const source = usePhase6 ? 
"phase6" : "phase8"; + const pass = modalFound || hasRestart; + const details: string[] = []; + if (modalFound) details.push("overlay detected"); + if (hasRestart) details.push("restart clickable present"); + if (!pass) details.push("no overlay or restart UI found"); results.push({ name: "game_over_display", - pass: hasText && hasRestart, - detail: hasText && hasRestart - ? `game over display: "${text}", restart available (${source})` - : `missing: ${!hasText ? "game over text" : ""}${!hasText && !hasRestart ? " and " : ""}${!hasRestart ? "restart option" : ""} (${source})`, + pass, + detail: `${details.join(", ")} (${source})`, }); }

Impressum · Datenschutz