loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit daf7a8d631b1f765ce5c665c04d380933ecdd207
parent c7d67a0208000a33a1957c4a7c045a94d3bf9427
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 14:02:48 +0200

Fix cell_id length, add SonarQube details, rebuild gameplay bot

- Abbreviate axis names in cell_id to stay under the ext4 255-byte filename
  limit (257 chars -> 181 max). Fixes Plackett-Burman runs failing on long configs.
- Add SonarQube detail card on run page (ratings, bugs, smells, complexity)
- Iframe: single iframe with sandbox="allow-scripts" instead of double
  iframe that caused CORS errors from null origin
- Gameplay bot: two-phase architecture (mechanics test, then play-to-win),
  60 pieces / 45s extended play (was 30/20s), integrated score tracking
  during play instead of separate 5-drop observation, 60ms polling (was 150ms)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/CellDetail.tsx | 4 ++--
M dashboard/src/components/RunDetail.tsx | 34 ++++++++++++++++++++++++++++++++--
M harness/lib/compute_grid.py | 24 ++++++++++++++++++++++--
M harness/lib/experiment_design.py | 3 ++-
M tasks/tetris/eval/gameplay-bot/player.ts | 39 +++++++++++++++++++++++++++++----------
M tasks/tetris/eval/gameplay-bot/tests.ts | 199 ++++++++++++++++++++++++++++---------------------------------------------------
6 files changed, 158 insertions(+), 145 deletions(-)

diff --git a/dashboard/src/components/CellDetail.tsx b/dashboard/src/components/CellDetail.tsx @@ -507,14 +507,14 @@ export default function CellDetail({ runs, axisValues }: CellDetailProps) { </span> </div> <iframe - srcDoc={`<!DOCTYPE html><html style="height:100%"><head><meta charset="UTF-8"></head><body style="margin:0;height:100%"><iframe src="${artifactUrl}" style="width:100%;height:100%;border:none" sandbox="allow-scripts"></iframe></body></html>`} + src={artifactUrl} style={{ width: "100%", height: "50vh", border: "none", background: "#fff", }} - sandbox="allow-scripts allow-same-origin" + sandbox="allow-scripts" title={`Run #${r.meta.run_number} preview`} /> </div> diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx @@ -378,6 +378,36 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon </div> )} + {/* SonarQube details */} + {(eval_results as Record<string, any>)?.sonarqube && !(eval_results as Record<string, any>).sonarqube.error && ( + <div className="card" style={{ padding: "16px" }}> + <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>SonarQube</h4> + {(() => { + const sq = (eval_results as Record<string, any>).sonarqube; + const ratingColor = (r: string) => r === "A" ? "var(--green)" : r === "B" ? "var(--yellow)" : "var(--red)"; + return ( + <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}> + <div style={{ display: "flex", gap: "12px", marginBottom: "4px" }}> + {["maintainability", "reliability", "security"].map((k) => sq[k] && ( + <div key={k} style={{ textAlign: "center" }}> + <div style={{ fontFamily: "var(--font-mono)", fontWeight: 700, fontSize: "1.1rem", color: ratingColor(sq[k]) }}>{sq[k]}</div> + <div style={{ fontSize: "0.55rem", color: "var(--text-muted)", textTransform: "capitalize" }}>{k}</div> + </div> + ))} + </div> + <Stat label="Bugs" value={sq.bugs ?? 
0} /> + <Stat label="Vulnerabilities" value={sq.vulnerabilities ?? 0} /> + <Stat label="Code smells" value={sq.code_smells ?? 0} /> + <Stat label="Cognitive complexity" value={sq.cognitive_complexity ?? "-"} /> + <Stat label="Duplication" value={`${sq.duplication_pct ?? 0}%`} /> + <Stat label="Tech debt" value={sq.tech_debt_minutes != null ? `${sq.tech_debt_minutes} min` : "-"} /> + <Stat label="Lines analyzed" value={sq.lines_of_code ?? "-"} /> + </div> + ); + })()} + </div> + )} + {/* Quality details */} {eval_results?.quality && ( <div className="card" style={{ padding: "16px" }}> @@ -484,14 +514,14 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon </a> </div> <iframe - srcDoc={`<!DOCTYPE html><html style="height:100%"><head><meta charset="UTF-8"></head><body style="margin:0;height:100%"><iframe src="${artifactUrl}" style="width:100%;height:100%;border:none" sandbox="allow-scripts"></iframe></body></html>`} + src={artifactUrl} style={{ width: "100%", height: "70vh", border: "none", background: "#fff", }} - sandbox="allow-scripts allow-same-origin" + sandbox="allow-scripts" title="Result preview" /> </div> diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py @@ -17,6 +17,26 @@ from itertools import product import yaml +# Short axis names for cell_id to avoid filesystem path length limits (ext4: 255 chars) +AXIS_ABBREV = { + "context_file": "ctx", + "effort": "eff", + "human_language": "hlang", + "language": "lang", + "linter": "lint", + "max_budget": "budget", + "model": "model", + "playwright": "pw", + "prompt_style": "prompt", + "sub_agents": "agents", + "tool_edit": "tedit", + "tool_glob": "tglob", + "tool_grep": "tgrep", + "tool_read": "tread", + "tool_write": "twrite", + "web_search": "web", +} + def load_grid(path): with open(path) as f: @@ -95,8 +115,8 @@ def compute_cells(grid, profile_name): if excluded: continue - # Build cell ID from task + all axis values (deterministic, filename-safe) - 
cell_id_parts = [task] + [f"{k}={cell[k]}" for k in axis_names] + # Build cell ID from task + abbreviated axis values (deterministic, filename-safe) + cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={cell[k]}" for k in axis_names] cell_id = "_".join(cell_id_parts) # Resolve budget value diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py @@ -444,8 +444,9 @@ def _is_excluded(cell, grid): def _build_cell(task, cell, defaults, grid): + from compute_grid import AXIS_ABBREV axis_names = sorted(cell.keys()) - cell_id_parts = [task] + [f"{k}={cell[k]}" for k in axis_names] + cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={cell[k]}" for k in axis_names] result = dict(cell) result["task"] = task diff --git a/tasks/tetris/eval/gameplay-bot/player.ts b/tasks/tetris/eval/gameplay-bot/player.ts @@ -85,10 +85,11 @@ interface Placement { export async function playGame( page: Page, cal: CalibrationResult, - options: { maxPieces?: number; maxDurationMs?: number } -): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number }> { + options: { maxPieces?: number; maxDurationMs?: number; scoreSelector?: string } +): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[] }> { const maxPieces = options.maxPieces ?? 100; const maxDuration = options.maxDurationMs ?? 30000; + const scoreSelector = options.scoreSelector ?? 
null; const start = Date.now(); let piecesPlaced = 0; let linesCleared = 0; @@ -96,6 +97,8 @@ export async function playGame( let gridReads = 0; let gridReadFails = 0; let consecutiveReadFails = 0; + const scoreValues: number[] = []; + let scorePollCounter = 0; let previousGrid: Grid | null = null; let settledGrid: Grid | null = null; @@ -115,13 +118,29 @@ export async function playGame( piecesPlaced += 3; break; } - await page.waitForTimeout(150); + await page.waitForTimeout(60); continue; } gridReads++; consecutiveReadFails = 0; + // Lightweight score tracking: read score every ~5 polls + if (scoreSelector) { + scorePollCounter++; + if (scorePollCounter % 5 === 0) { + try { + const scoreText = await page.textContent(scoreSelector); + if (scoreText) { + const nums = (scoreText.match(/\d+/g) || []).map(Number); + if (nums.length > 0) { + scoreValues.push(Math.max(...nums)); + } + } + } catch { /* ignore score read failures */ } + } + } + // Detect if anything changed if (previousGrid && !gridsAreDifferent(grid, previousGrid)) { // Nothing changed, wait and poll again @@ -131,7 +150,7 @@ export async function playGame( await page.keyboard.press(cal.controls.drop); lastPlacementTime = Date.now(); } - await page.waitForTimeout(150); + await page.waitForTimeout(60); continue; } @@ -142,7 +161,7 @@ export async function playGame( waitingForNewPiece = false; lastPlacementTime = Date.now(); previousGrid = grid; - await page.waitForTimeout(100); + await page.waitForTimeout(60); continue; } @@ -169,7 +188,7 @@ export async function playGame( } // Wait for the piece to lock and next piece to spawn - await page.waitForTimeout(200); + await page.waitForTimeout(100); // Read the settled state const afterGrid = await readGrid(page, cal); @@ -197,16 +216,16 @@ export async function playGame( } previousGrid = grid; - await page.waitForTimeout(150); + await page.waitForTimeout(60); } catch { errors++; await playRandomMove(page, cal); piecesPlaced++; - await 
page.waitForTimeout(100); + await page.waitForTimeout(60); } } - return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails }; + return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails, scoreValues }; } /** @@ -254,7 +273,7 @@ async function executePlacement( // Hard drop await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(100); + await page.waitForTimeout(60); } /** diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -107,28 +107,17 @@ export async function runAllTests( await runBasicMechanicsPhase(page, cal, session); } - // ---- Phase 4: Multi-piece play session ---- - // Reload for clean state + // ---- Phase 4: Reload + calibrate for gameplay ---- try { await loadGamePage(page, serverUrl); cal = await calibrate(page); session.started = session.started || cal.startMechanism !== "unknown"; } catch { /* continue with existing state */ } - await runPlayPhase(page, cal, session, gameplay); + // ---- Phase 5: Extended gameplay with integrated score tracking ---- + await runGameplayPhase(page, cal, session, gameplay); - // ---- Phase 5: Line clear attempts ---- - try { - await loadGamePage(page, serverUrl); - cal = await calibrate(page); - } catch { /* continue */ } - - await runLineClearPhase(page, cal, session, gameplay); - - // ---- Phase 6: Score observation ---- - await observeScore(page, cal, session, gameplay); - - // ---- Phase 7: Game over test ---- + // ---- Phase 6: Game over test ---- try { await loadGamePage(page, serverUrl); cal = await calibrate(page); @@ -136,7 +125,7 @@ export async function runAllTests( await runGameOverPhase(page, cal, session); - // ---- Phase 8: 30-second play test ---- + // ---- Phase 7: 30-second endurance play ---- try { await loadGamePage(page, serverUrl); cal = await calibrate(page); @@ -353,15 +342,16 @@ async function runBasicMechanicsPhase( } /** - * Play multiple pieces and track what happens. 
+ * Extended gameplay phase with integrated score tracking. + * Plays up to 60 pieces / 45 seconds using the AI, reads score on every + * poll cycle, and falls back to brute-force line clearing if needed. */ -async function runPlayPhase( +async function runGameplayPhase( page: Page, cal: CalibrationResult, session: GameSession, gameplay: GameplayStats ): Promise<void> { - // Drop 10 pieces to test multiple pieces mechanic const gridBefore = await readGrid(page, cal); const filledBefore = gridBefore ? countFilled(gridBefore) : 0; if (gridBefore) { @@ -371,69 +361,36 @@ async function runPlayPhase( } session.frames++; - let settledGrid = gridBefore; - - for (let i = 0; i < 10; i++) { - await hardDrop(page, cal); - await page.waitForTimeout(300); - gameplay.pieces_placed++; - session.piecesLocked++; - - const grid = await readGrid(page, cal); - if (grid) { - session.gridReadSuccess++; - session.frames++; - - // Detect piece type from diff - if (settledGrid) { - const cells = detectActivePieceCells(grid, settledGrid); - if (cells) { - const pt = identifyPieceType(cells); - session.pieceTypes.add(pt); - session.piecesSpawned++; - } - } - settledGrid = grid; - } else { - session.gridReadFail++; - session.frames++; - } - } - - const gridAfter = await readGrid(page, cal); - if (gridAfter) { - session.gridReadSuccess++; - session.frames++; - const filledAfter = countFilled(gridAfter); - if (filledAfter > filledBefore) { - session.events.push({ - type: "piece_locked", - frame: session.frames, - filledDelta: filledAfter - filledBefore, - }); - } + // Read initial score before play begins + if (cal.scoreElementSelector) { + try { + const scoreText = await page.textContent(cal.scoreElementSelector); + const nums = extractScoreFromText(scoreText); + const val = Math.max(...nums); + session.scoreValues.push(val); + } catch { /* ignore */ } } -} -/** - * Attempt to clear lines using AI play and brute-force methods. 
- */ -async function runLineClearPhase( - page: Page, - cal: CalibrationResult, - session: GameSession, - gameplay: GameplayStats -): Promise<void> { - const gridBefore = await readGrid(page, cal); - const filledBefore = gridBefore ? countFilled(gridBefore) : 0; - - // Play strategically using the AI - const result = await playGame(page, cal, { maxPieces: 30, maxDurationMs: 20000 }); + // Play strategically using the AI with integrated score tracking + const result = await playGame(page, cal, { + maxPieces: 60, + maxDurationMs: 45000, + scoreSelector: cal.scoreElementSelector ?? undefined, + }); gameplay.pieces_placed += result.piecesPlaced; gameplay.errors_during_play += result.errors; session.gridReadSuccess += result.gridReads; session.gridReadFail += result.gridReadFails; session.frames += result.gridReads + result.gridReadFails; + session.piecesLocked += result.piecesPlaced; + + // Merge score values collected during play + for (const sv of result.scoreValues) { + session.scoreValues.push(sv); + if (sv > gameplay.max_score_observed) { + gameplay.max_score_observed = sv; + } + } if (result.linesCleared > 0) { session.linesCleared += result.linesCleared; @@ -443,40 +400,21 @@ async function runLineClearPhase( } } - // If no lines cleared yet, try brute-force approach - if (session.linesCleared === 0) { - const cleared = await tryFillRow(page, cal, 10); - gameplay.pieces_placed += 10; - if (cleared) { - session.linesCleared++; - gameplay.lines_cleared++; - session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); - } - } - - // Check if total filled decreased (indicates clearing happened) - if (session.linesCleared === 0) { - const gridAfter = await readGrid(page, cal); - const filledAfter = gridAfter ? 
countFilled(gridAfter) : 0; - if (filledAfter < filledBefore && filledBefore > 0) { - session.linesCleared++; - gameplay.lines_cleared++; - session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); - } + // Read final score after play + if (cal.scoreElementSelector) { + try { + const scoreText = await page.textContent(cal.scoreElementSelector); + const nums = extractScoreFromText(scoreText); + const val = Math.max(...nums); + session.scoreValues.push(val); + if (val > gameplay.max_score_observed) { + gameplay.max_score_observed = val; + } + } catch { /* ignore */ } } -} -/** - * Observe the score element during gameplay. - */ -async function observeScore( - page: Page, - cal: CalibrationResult, - session: GameSession, - gameplay: GameplayStats -): Promise<void> { - if (!cal.scoreElementSelector) { - // Try to find any number on the page that changes + // If no score element found, try to detect changing numbers on page + if (!cal.scoreElementSelector && session.scoreValues.length === 0) { try { const textBefore = await page.evaluate(() => document.body.innerText); const numbersBefore = (textBefore.match(/\d+/g) || []).map(Number); @@ -497,33 +435,38 @@ async function observeScore( } } } catch { /* ignore */ } - return; } - try { - const scoreBefore = await page.textContent(cal.scoreElementSelector); - const numsBefore = extractScoreFromText(scoreBefore); - session.scoreValues.push(Math.max(...numsBefore)); + // Record pieces for multi-piece detection + if (result.piecesPlaced > 0) { + session.events.push({ + type: "piece_locked", + frame: session.frames, + filledDelta: result.piecesPlaced * 4, + }); + } - // Play a bit - for (let i = 0; i < 5; i++) { - await page.keyboard.press(cal.controls.drop); - await page.waitForTimeout(300); + // If no lines cleared by AI, try brute-force approach + if (session.linesCleared === 0) { + const cleared = await tryFillRow(page, cal, 10); + gameplay.pieces_placed += 10; + if (cleared) { + 
session.linesCleared++; + gameplay.lines_cleared++; + session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); } + } - // Poll for score change - for (let poll = 0; poll < 4; poll++) { - await page.waitForTimeout(500); - const scoreAfter = await page.textContent(cal.scoreElementSelector); - const numsAfter = extractScoreFromText(scoreAfter); - const maxAfter = Math.max(...numsAfter); - session.scoreValues.push(maxAfter); - if (maxAfter > gameplay.max_score_observed) { - gameplay.max_score_observed = maxAfter; - } - if (maxAfter > Math.max(...numsBefore)) break; + // Check if total filled decreased (indicates clearing happened) + if (session.linesCleared === 0) { + const gridAfter = await readGrid(page, cal); + const filledAfter = gridAfter ? countFilled(gridAfter) : 0; + if (filledAfter < filledBefore && filledBefore > 0) { + session.linesCleared++; + gameplay.lines_cleared++; + session.events.push({ type: "line_cleared", count: 1, frame: session.frames }); } - } catch { /* ignore */ } + } } /**

Impressum · Datenschutz