loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit b17300f58730efeae5ef71f559fe28e087deabfc
parent 53625f81965fd7797eaf3cabb4dd67b44a4a8fd5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:15:38 +0200

Fix bad run detection, wire gameplay bot, fix compare page, improve rotation test

Harness (run.py):
- Added is_valid_run() that catches all failure modes: timeout, null cost,
  1 turn, invalid API key, short transcript
- Resume now auto-deletes invalid runs instead of skipping them
- Gameplay bot integrated into evaluate(): runs Playwright bot for tasks
  that have it, captures the JSON report, includes score in overall eval

Dashboard:
- Compare page fixed: avg_time now reads from avg_wall_time correctly

Eval:
- Rotation test uses grid diffing to isolate active piece from settled cells
- Increased rotation test attempts from 40 to 60
- Scoring weights updated: gameplay_bot at 10%, functional reduced to 25%

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/pages/compare.astro | 2 +-
M harness/run.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M tasks/tetris/eval/gameplay-bot/tests.ts | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M tasks/tetris/scoring.yaml | 4 ++--
4 files changed, 221 insertions(+), 37 deletions(-)

diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro @@ -50,7 +50,7 @@ for (const axis of AXIS_NAMES) { stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-", avg_cost: stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-", avg_time: - stats.avg_time != null ? Math.round(stats.avg_time) + "s" : "-", + stats.avg_wall_time != null ? Math.round(stats.avg_wall_time) + "s" : "-", }); } } diff --git a/harness/run.py b/harness/run.py @@ -256,6 +256,7 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path): "quality": None, "code_analysis": None, "transcript_analysis": None, + "gameplay_bot": None, "score": None, } @@ -300,6 +301,60 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path): except Exception as e: results["transcript_analysis"] = {"error": str(e), "score": 0} + # Gameplay bot (Playwright-based interactive testing, e.g. Tetris) + gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts" + if gameplay_bot_entry.exists(): + report_path = run_dir / "gameplay-bot-report.json" + playwright_config = task_dir / "eval" / "playwright.config.ts" + try: + bot_env = os.environ.copy() + bot_env["WORKSPACE_PATH"] = str(workspace) + bot_env["REPORT_OUTPUT_PATH"] = str(report_path) + bot_result = subprocess.run( + ["npx", "playwright", "test", "--config", str(playwright_config)], + cwd=str(PROJECT_DIR), + capture_output=True, + text=True, + timeout=180, + env=bot_env, + ) + if report_path.exists(): + report_data = json.loads(report_path.read_text()) + summary = report_data.get("summary", {}) + results["gameplay_bot"] = { + "pass": summary.get("failed", 1) == 0, + "score": summary.get("score", 0), + "total": summary.get("total", 0), + "passed": summary.get("passed", 0), + "failed": summary.get("failed", 0), + "report": report_data, + } + else: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": f"Report file not created. 
Exit code: {bot_result.returncode}. " + f"stderr: {bot_result.stderr[:1000]}", + } + except FileNotFoundError: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test", + } + except subprocess.TimeoutExpired: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": "Gameplay bot timed out after 180 seconds", + } + except Exception as e: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": str(e), + } + # Compute weighted score from scoring.yaml try: scoring_file = task_dir / "scoring.yaml" @@ -376,6 +431,66 @@ def log(msg: str): print(msg, flush=True) +def is_valid_run(run_dir: Path) -> bool: + """Check whether a completed run directory contains valid results. + + Returns False (invalid) if any of these are true: + - claude_output.json has total_cost_usd of 0, null, or missing + - claude_output.json has num_turns of 1, null, or missing (real runs always have >1) + - meta.json has exit_code of 124 (timeout) + - claude_output.json contains "Invalid API key" in the result field + - transcript.jsonl has fewer than 5 lines (too short to be a real session) + """ + # Check meta.json for timeout + meta_path = run_dir / "meta.json" + if meta_path.exists(): + try: + meta = json.loads(meta_path.read_text()) + if meta.get("exit_code") == 124: + return False + except (json.JSONDecodeError, OSError): + return False + + # Check transcript.jsonl line count + transcript_path = run_dir / "transcript.jsonl" + if transcript_path.exists(): + try: + lines = transcript_path.read_text().strip().split("\n") + if len(lines) < 5: + return False + except OSError: + return False + else: + return False + + # Check claude_output.json + output_path = run_dir / "claude_output.json" + if output_path.exists(): + try: + output = json.loads(output_path.read_text()) + except (json.JSONDecodeError, OSError): + return False + + # total_cost_usd: 0, null, or missing + cost = 
output.get("total_cost_usd") + if not cost: # catches None, 0, 0.0, and missing (None from .get) + return False + + # num_turns: 1, null, or missing + num_turns = output.get("num_turns") + if num_turns is None or num_turns <= 1: + return False + + # "Invalid API key" in result field + result_text = output.get("result", "") + if isinstance(result_text, str) and "Invalid API key" in result_text: + return False + else: + return False + + return True + + def run_single( cell: dict, run_num: int, @@ -391,10 +506,14 @@ def run_single( run_id = f"{cell_id}_run{run_num}" run_dir = results_dir / "runs" / run_id - # Resume support + # Resume support: skip only if the run completed AND is valid if (run_dir / "eval_results.json").exists(): - log(f"SKIP: {run_id}") - return "skipped" + if is_valid_run(run_dir): + log(f"SKIP: {run_id}") + return "skipped" + else: + log(f"INVALID: {run_id} - deleting and re-running") + shutil.rmtree(run_dir) log(f"START: {task} | {model} | {prompt_style} | run{run_num}") diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -287,7 +287,8 @@ export async function runAllTests( try { if (cal.scoreElementSelector) { const scoreText = await page.textContent(cal.scoreElementSelector); - const score = parseInt(scoreText?.replace(/\D/g, "") || "0", 10); + const nums = extractScoreFromText(scoreText); + const score = Math.max(...nums); if (score > gameplay.max_score_observed) { gameplay.max_score_observed = score; } @@ -387,19 +388,35 @@ async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResul } /** - * Detect the active piece's shape from the grid by finding cells that are - * filled in the current grid but weren't in a "settled" snapshot. - * Returns the bounding box dimensions (width x height) or null. + * Detect the active piece's shape by diffing two grids: one taken before + * the piece spawned (or the settled state) and the current grid. 
+ * Cells present in `current` but absent in `settled` are the active piece. + * Falls back to scanning the top 6 rows if no settled grid is provided. */ -function detectPieceShape(grid: boolean[][] | null): { w: number; h: number; cells: number } | null { - if (!grid) return null; +function detectPieceShape( + current: boolean[][] | null, + settled?: boolean[][] | null, +): { w: number; h: number; cells: number } | null { + if (!current) return null; - // Find filled cells in the top 6 rows (where new pieces spawn/fall) const activeCells: Array<[number, number]> = []; - for (let row = 0; row < Math.min(6, grid.length); row++) { - for (let col = 0; col < grid[row].length; col++) { - if (grid[row][col]) { - activeCells.push([row, col]); + + if (settled && settled.length === current.length) { + // Diff approach: cells in current but not in settled = the active piece + for (let row = 0; row < current.length; row++) { + for (let col = 0; col < current[row].length; col++) { + if (current[row][col] && !settled[row][col]) { + activeCells.push([row, col]); + } + } + } + } else { + // Fallback: scan top 6 rows (original behavior, used when no settled grid) + for (let row = 0; row < Math.min(6, current.length); row++) { + for (let col = 0; col < current[row].length; col++) { + if (current[row][col]) { + activeCells.push([row, col]); + } } } } @@ -439,34 +456,39 @@ async function testAllPiecesRotate( await page.reload(); await page.waitForTimeout(1000); - // Start the game - if (cal.start_mechanism === "button") { + // Start the game (use camelCase startMechanism from CalibrationResult) + if (cal.startMechanism === "button") { const btn = page.locator("button").filter({ hasText: /start|play|begin|new/i }).first(); if (await btn.count() > 0) await btn.click(); - } else if (cal.start_mechanism === "space") { + } else if (cal.startMechanism === "space") { await page.keyboard.press("Space"); - } else if (cal.start_mechanism === "enter") { + } else if (cal.startMechanism === 
"enter") { await page.keyboard.press("Enter"); - } else if (cal.start_mechanism === "click") { + } else if (cal.startMechanism === "click_canvas") { await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true }); } await page.waitForTimeout(1000); const rotatedPieces = new Set<string>(); const failedPieces = new Set<string>(); - const maxAttempts = 40; // Play up to 40 pieces to find all types + const maxAttempts = 60; // Play up to 60 pieces to find all types + + // Capture the settled grid (state right after drop, before next piece spawns) + let settledGrid: boolean[][] | null = null; for (let attempt = 0; attempt < maxAttempts; attempt++) { await page.waitForTimeout(300); const gridBefore = await readGrid(page, cal); - const shapeBefore = detectPieceShape(gridBefore); + const shapeBefore = detectPieceShape(gridBefore, settledGrid); if (!shapeBefore) { // Can't read the piece, drop it and try the next one await page.keyboard.press(cal.controls.drop); gameplay.pieces_placed++; await page.waitForTimeout(500); + // Capture settled state right after a piece lands + settledGrid = await readGrid(page, cal); continue; } @@ -477,6 +499,7 @@ async function testAllPiecesRotate( await page.keyboard.press(cal.controls.drop); gameplay.pieces_placed++; await page.waitForTimeout(500); + settledGrid = await readGrid(page, cal); continue; } @@ -485,6 +508,7 @@ async function testAllPiecesRotate( await page.keyboard.press(cal.controls.drop); gameplay.pieces_placed++; await page.waitForTimeout(500); + settledGrid = await readGrid(page, cal); continue; } @@ -493,7 +517,7 @@ async function testAllPiecesRotate( await page.waitForTimeout(300); const gridAfter = await readGrid(page, cal); - const shapeAfter = detectPieceShape(gridAfter); + const shapeAfter = detectPieceShape(gridAfter, settledGrid); if (shapeAfter) { const changed = shapeBefore.w !== shapeAfter.w || shapeBefore.h !== shapeAfter.h; @@ -521,9 +545,11 @@ async function testAllPiecesRotate( 
gameplay.pieces_placed++; await page.waitForTimeout(500); + // Capture settled state right after a piece lands (before next piece spawns) + settledGrid = await readGrid(page, cal); + // Check if game is over - const currentGrid = await readGrid(page, cal); - if (currentGrid && hasFilledInTopRows(currentGrid, 2)) { + if (settledGrid && hasFilledInTopRows(settledGrid, 2)) { break; } } @@ -732,6 +758,25 @@ async function testLineClear( return { name: "line_clear", pass: false, detail: "could not trigger or detect a line clear" }; } +/** + * Extract the score number from potentially concatenated text. + * Handles cases like "Score: 100Level: 1Lines: 5" or "Score100Level1Lines5" + * by looking for a labeled "score" value, or falling back to the first number. + */ +function extractScoreFromText(text: string | null): number[] { + if (!text) return [0]; + + // Try labeled extraction: "Score: 100" or "Score100" or "score 100" + const labeledMatch = text.match(/score\s*[:\-=]?\s*(\d+)/i); + if (labeledMatch) { + return [parseInt(labeledMatch[1], 10)]; + } + + // Extract all individual numbers from the text + const allNumbers = (text.match(/\d+/g) || []).map(Number); + return allNumbers.length > 0 ? 
allNumbers : [0]; +} + async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<TestResult> { if (!cal.scoreElementSelector) { // Try to find any number on the page that changes @@ -756,7 +801,7 @@ async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<Tes try { const scoreBefore = await page.textContent(cal.scoreElementSelector); - const numBefore = parseInt(scoreBefore?.replace(/\D/g, "") || "0", 10); + const numsBefore = extractScoreFromText(scoreBefore); // Play a bit to change the score for (let i = 0; i < 5; i++) { @@ -764,20 +809,40 @@ async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<Tes await page.waitForTimeout(300); } - const scoreAfter = await page.textContent(cal.scoreElementSelector); - const numAfter = parseInt(scoreAfter?.replace(/\D/g, "") || "0", 10); + // Poll for score change: check multiple times over 2 seconds + for (let poll = 0; poll < 4; poll++) { + await page.waitForTimeout(500); - if (numAfter > numBefore) { - return { - name: "score_changes", - pass: true, - detail: `score changed from ${numBefore} to ${numAfter}`, - }; + const scoreAfter = await page.textContent(cal.scoreElementSelector); + const numsAfter = extractScoreFromText(scoreAfter); + + // Compare each extracted number: if any number increased, score changed + for (let i = 0; i < Math.min(numsBefore.length, numsAfter.length); i++) { + if (numsAfter[i] > numsBefore[i]) { + return { + name: "score_changes", + pass: true, + detail: `score changed from ${numsBefore[i]} to ${numsAfter[i]}`, + }; + } + } + + // Also check if any new number appeared that's larger than any before number + const maxBefore = Math.max(...numsBefore); + const maxAfter = Math.max(...numsAfter); + if (maxAfter > maxBefore) { + return { + name: "score_changes", + pass: true, + detail: `score changed: max value ${maxBefore} -> ${maxAfter}`, + }; + } } + return { name: "score_changes", pass: false, - detail: `score did not increase: 
${numBefore} -> ${numAfter}`, + detail: `score did not increase: [${numsBefore.join(", ")}] -> no change after polling`, }; } catch { return { name: "score_changes", pass: false, detail: "could not read score element" }; diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml @@ -1,7 +1,7 @@ weights: - functional: 0.35 + functional: 0.25 structural: 0.10 quality: 0.20 + gameplay_bot: 0.10 code_analysis: 0.15 transcript_analysis: 0.10 - # gameplay_bot will be added here once wired (0.10 from functional)

Impressum · Datenschutz