loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 7346c07832df9d13c0567d5c870406db23840ba4
parent 8ab7efe2a47ba0517f58bc80bfe9398d68d92abf
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 22:12:01 +0200

Fix inflated scores for empty/broken games

Code analysis:
- Score 0 if workspace has < 50 LOC (empty builds scored 100% before)
- 40-point penalty when the workspace has no HTML files (game can't be played)

Gameplay bot:
- Score 0 if workspace has no HTML files (skip Playwright entirely)
- Add all_pieces_rotate to the list of tests reported as failed when the page doesn't load

These fixes prevent budget-killed or failed runs from showing
artificially high quality scores.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py | 98 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M tasks/tetris/eval/code-analysis.py | 11 +++++++++++
M tasks/tetris/eval/gameplay-bot/tests.ts | 2 +-
3 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/harness/run.py b/harness/run.py @@ -319,56 +319,66 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path): # Gameplay bot (Playwright-based interactive testing, e.g. Tetris) gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts" if gameplay_bot_entry.exists(): - report_path = run_dir / "gameplay-bot-report.json" - playwright_config = task_dir / "eval" / "playwright.config.ts" - try: - bot_env = os.environ.copy() - bot_env["WORKSPACE_PATH"] = str(workspace) - bot_env["REPORT_OUTPUT_PATH"] = str(report_path) - bot_result = subprocess.run( - ["npx", "playwright", "test", "--config", str(playwright_config)], - cwd=str(PROJECT_DIR), - capture_output=True, - text=True, - timeout=180, - env=bot_env, - ) - if report_path.exists(): - report_data = json.loads(report_path.read_text()) - summary = report_data.get("summary", {}) + # Pre-check: is there an HTML file to test? + html_files = list(workspace.rglob("*.html")) + html_files = [f for f in html_files if "node_modules" not in str(f)] + if not html_files: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": "no HTML files in workspace - game was not built", + } + else: + report_path = run_dir / "gameplay-bot-report.json" + playwright_config = task_dir / "eval" / "playwright.config.ts" + try: + bot_env = os.environ.copy() + bot_env["WORKSPACE_PATH"] = str(workspace) + bot_env["REPORT_OUTPUT_PATH"] = str(report_path) + bot_result = subprocess.run( + ["npx", "playwright", "test", "--config", str(playwright_config)], + cwd=str(PROJECT_DIR), + capture_output=True, + text=True, + timeout=180, + env=bot_env, + ) + if report_path.exists(): + report_data = json.loads(report_path.read_text()) + summary = report_data.get("summary", {}) + results["gameplay_bot"] = { + "pass": summary.get("failed", 1) == 0, + "score": summary.get("score", 0), + "total": summary.get("total", 0), + "passed": summary.get("passed", 0), + "failed": summary.get("failed", 0), + "report": 
report_data, + } + else: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": f"Report file not created. Exit code: {bot_result.returncode}. " + f"stderr: {bot_result.stderr[:1000]}", + } + except FileNotFoundError: results["gameplay_bot"] = { - "pass": summary.get("failed", 1) == 0, - "score": summary.get("score", 0), - "total": summary.get("total", 0), - "passed": summary.get("passed", 0), - "failed": summary.get("failed", 0), - "report": report_data, + "pass": False, + "score": 0, + "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test", } - else: + except subprocess.TimeoutExpired: results["gameplay_bot"] = { "pass": False, "score": 0, - "error": f"Report file not created. Exit code: {bot_result.returncode}. " - f"stderr: {bot_result.stderr[:1000]}", + "error": "Gameplay bot timed out after 180 seconds", + } + except Exception as e: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": str(e), } - except FileNotFoundError: - results["gameplay_bot"] = { - "pass": False, - "score": 0, - "error": "Playwright (npx) not found. 
Install with: npm install -D @playwright/test", - } - except subprocess.TimeoutExpired: - results["gameplay_bot"] = { - "pass": False, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds", - } - except Exception as e: - results["gameplay_bot"] = { - "pass": False, - "score": 0, - "error": str(e), - } # Compute weighted score from scoring.yaml try: diff --git a/tasks/tetris/eval/code-analysis.py b/tasks/tetris/eval/code-analysis.py @@ -309,6 +309,17 @@ def main(): # ---- Compute score ---- score = 100 + # No code at all = 0 score (empty workspace or build failed) + if total_loc < 50: + results["score"] = 0.0 + results["score_reason"] = f"insufficient code ({total_loc} LOC, minimum 50)" + print(json.dumps(results, indent=2)) + return + + # No HTML entry point = major penalty (game can't be played) + if html_valid == "no_html": + score -= 40 + # Unnecessary files (-10 each, max -30) score -= min(len(unnecessary) * 10, 30) diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts @@ -50,7 +50,7 @@ export async function runAllTests( if (!pageLoaded) { const remainingTests = [ "game_starts", "auto_drop", "move_left", "move_right", "move_down", - "rotate", "hard_drop", "piece_locks", "new_piece_spawns", + "rotate", "all_pieces_rotate", "hard_drop", "piece_locks", "new_piece_spawns", "multiple_pieces", "line_clear", "score_changes", "game_over", "playable_30s", ];

Impressum · Datenschutz