Fix inflated scores for empty/broken games - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 7346c07832df9d13c0567d5c870406db23840ba4
parent 8ab7efe2a47ba0517f58bc80bfe9398d68d92abf
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 22:12:01 +0200

Fix inflated scores for empty/broken games

Code analysis:
- Score 0 if workspace has < 50 LOC (empty builds scored 100% before)
- 40-point penalty when the workspace has no HTML files (game can't be played)

Gameplay bot:
- Score 0 if workspace has no HTML files (skip Playwright entirely)
- Added all_pieces_rotate to the fail-all list when page doesn't load

These fixes prevent budget-killed or failed runs from showing
artificially high quality scores.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 98 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M tasks/tetris/eval/code-analysis.py  | 11 +++++++++++
M tasks/tetris/eval/gameplay-bot/tests.ts  | 2 +-

3 files changed, 66 insertions(+), 45 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -319,56 +319,66 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
     # Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
     gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts"
     if gameplay_bot_entry.exists():
-        report_path = run_dir / "gameplay-bot-report.json"
-        playwright_config = task_dir / "eval" / "playwright.config.ts"
-        try:
-            bot_env = os.environ.copy()
-            bot_env["WORKSPACE_PATH"] = str(workspace)
-            bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
-            bot_result = subprocess.run(
-                ["npx", "playwright", "test", "--config", str(playwright_config)],
-                cwd=str(PROJECT_DIR),
-                capture_output=True,
-                text=True,
-                timeout=180,
-                env=bot_env,
-            )
-            if report_path.exists():
-                report_data = json.loads(report_path.read_text())
-                summary = report_data.get("summary", {})
+        # Pre-check: is there an HTML file to test?
+        html_files = list(workspace.rglob("*.html"))
+        html_files = [f for f in html_files if "node_modules" not in str(f)]
+        if not html_files:
+            results["gameplay_bot"] = {
+                "pass": False,
+                "score": 0,
+                "error": "no HTML files in workspace - game was not built",
+            }
+        else:
+            report_path = run_dir / "gameplay-bot-report.json"
+            playwright_config = task_dir / "eval" / "playwright.config.ts"
+            try:
+                bot_env = os.environ.copy()
+                bot_env["WORKSPACE_PATH"] = str(workspace)
+                bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
+                bot_result = subprocess.run(
+                    ["npx", "playwright", "test", "--config", str(playwright_config)],
+                    cwd=str(PROJECT_DIR),
+                    capture_output=True,
+                    text=True,
+                    timeout=180,
+                    env=bot_env,
+                )
+                if report_path.exists():
+                    report_data = json.loads(report_path.read_text())
+                    summary = report_data.get("summary", {})
+                    results["gameplay_bot"] = {
+                        "pass": summary.get("failed", 1) == 0,
+                        "score": summary.get("score", 0),
+                        "total": summary.get("total", 0),
+                        "passed": summary.get("passed", 0),
+                        "failed": summary.get("failed", 0),
+                        "report": report_data,
+                    }
+                else:
+                    results["gameplay_bot"] = {
+                        "pass": False,
+                        "score": 0,
+                        "error": f"Report file not created. Exit code: {bot_result.returncode}. "
+                                 f"stderr: {bot_result.stderr[:1000]}",
+                    }
+            except FileNotFoundError:
                 results["gameplay_bot"] = {
-                    "pass": summary.get("failed", 1) == 0,
-                    "score": summary.get("score", 0),
-                    "total": summary.get("total", 0),
-                    "passed": summary.get("passed", 0),
-                    "failed": summary.get("failed", 0),
-                    "report": report_data,
+                    "pass": False,
+                    "score": 0,
+                    "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
                 }
-            else:
+            except subprocess.TimeoutExpired:
                 results["gameplay_bot"] = {
                     "pass": False,
                     "score": 0,
-                    "error": f"Report file not created. Exit code: {bot_result.returncode}. "
-                             f"stderr: {bot_result.stderr[:1000]}",
+                    "error": "Gameplay bot timed out after 180 seconds",
+                }
+            except Exception as e:
+                results["gameplay_bot"] = {
+                    "pass": False,
+                    "score": 0,
+                    "error": str(e),
                 }
-        except FileNotFoundError:
-            results["gameplay_bot"] = {
-                "pass": False,
-                "score": 0,
-                "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
-            }
-        except subprocess.TimeoutExpired:
-            results["gameplay_bot"] = {
-                "pass": False,
-                "score": 0,
-                "error": "Gameplay bot timed out after 180 seconds",
-            }
-        except Exception as e:
-            results["gameplay_bot"] = {
-                "pass": False,
-                "score": 0,
-                "error": str(e),
-            }
 
     # Compute weighted score from scoring.yaml
     try:
diff --git a/tasks/tetris/eval/code-analysis.py b/tasks/tetris/eval/code-analysis.py
@@ -309,6 +309,17 @@ def main():
     # ---- Compute score ----
     score = 100
 
+    # No code at all = 0 score (empty workspace or build failed)
+    if total_loc < 50:
+        results["score"] = 0.0
+        results["score_reason"] = f"insufficient code ({total_loc} LOC, minimum 50)"
+        print(json.dumps(results, indent=2))
+        return
+
+    # No HTML entry point = major penalty (game can't be played)
+    if html_valid == "no_html":
+        score -= 40
+
     # Unnecessary files (-10 each, max -30)
     score -= min(len(unnecessary) * 10, 30)
 
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -50,7 +50,7 @@ export async function runAllTests(
   if (!pageLoaded) {
     const remainingTests = [
       "game_starts", "auto_drop", "move_left", "move_right", "move_down",
-      "rotate", "hard_drop", "piece_locks", "new_piece_spawns",
+      "rotate", "all_pieces_rotate", "hard_drop", "piece_locks", "new_piece_spawns",
       "multiple_pieces", "line_clear", "score_changes", "game_over",
       "playable_30s",
     ];

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README

M	harness/run.py	|	98	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	tasks/tetris/eval/code-analysis.py	|	11	+++++++++++
M	tasks/tetris/eval/gameplay-bot/tests.ts	|	2	+-