commit 7346c07832df9d13c0567d5c870406db23840ba4
parent 8ab7efe2a47ba0517f58bc80bfe9398d68d92abf
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 22:12:01 +0200
Fix inflated scores for empty/broken games
Code analysis:
- Score 0 if workspace has < 50 LOC (empty builds scored 100% before)
- Apply a 40-point penalty when there are no HTML files (game can't be played)
Gameplay bot:
- Score 0 if workspace has no HTML files (skip Playwright entirely)
- Add all_pieces_rotate to the fail-all list used when the page doesn't load
These fixes prevent budget-killed or failed runs from showing
artificially high quality scores.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
3 files changed, 66 insertions(+), 45 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -319,56 +319,66 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
# Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts"
if gameplay_bot_entry.exists():
- report_path = run_dir / "gameplay-bot-report.json"
- playwright_config = task_dir / "eval" / "playwright.config.ts"
- try:
- bot_env = os.environ.copy()
- bot_env["WORKSPACE_PATH"] = str(workspace)
- bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
- bot_result = subprocess.run(
- ["npx", "playwright", "test", "--config", str(playwright_config)],
- cwd=str(PROJECT_DIR),
- capture_output=True,
- text=True,
- timeout=180,
- env=bot_env,
- )
- if report_path.exists():
- report_data = json.loads(report_path.read_text())
- summary = report_data.get("summary", {})
+ # Pre-check: is there an HTML file to test?
+ html_files = list(workspace.rglob("*.html"))
+ html_files = [f for f in html_files if "node_modules" not in str(f)]
+ if not html_files:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": "no HTML files in workspace - game was not built",
+ }
+ else:
+ report_path = run_dir / "gameplay-bot-report.json"
+ playwright_config = task_dir / "eval" / "playwright.config.ts"
+ try:
+ bot_env = os.environ.copy()
+ bot_env["WORKSPACE_PATH"] = str(workspace)
+ bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
+ bot_result = subprocess.run(
+ ["npx", "playwright", "test", "--config", str(playwright_config)],
+ cwd=str(PROJECT_DIR),
+ capture_output=True,
+ text=True,
+ timeout=180,
+ env=bot_env,
+ )
+ if report_path.exists():
+ report_data = json.loads(report_path.read_text())
+ summary = report_data.get("summary", {})
+ results["gameplay_bot"] = {
+ "pass": summary.get("failed", 1) == 0,
+ "score": summary.get("score", 0),
+ "total": summary.get("total", 0),
+ "passed": summary.get("passed", 0),
+ "failed": summary.get("failed", 0),
+ "report": report_data,
+ }
+ else:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": f"Report file not created. Exit code: {bot_result.returncode}. "
+ f"stderr: {bot_result.stderr[:1000]}",
+ }
+ except FileNotFoundError:
results["gameplay_bot"] = {
- "pass": summary.get("failed", 1) == 0,
- "score": summary.get("score", 0),
- "total": summary.get("total", 0),
- "passed": summary.get("passed", 0),
- "failed": summary.get("failed", 0),
- "report": report_data,
+ "pass": False,
+ "score": 0,
+ "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
}
- else:
+ except subprocess.TimeoutExpired:
results["gameplay_bot"] = {
"pass": False,
"score": 0,
- "error": f"Report file not created. Exit code: {bot_result.returncode}. "
- f"stderr: {bot_result.stderr[:1000]}",
+ "error": "Gameplay bot timed out after 180 seconds",
+ }
+ except Exception as e:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": str(e),
}
- except FileNotFoundError:
- results["gameplay_bot"] = {
- "pass": False,
- "score": 0,
- "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
- }
- except subprocess.TimeoutExpired:
- results["gameplay_bot"] = {
- "pass": False,
- "score": 0,
- "error": "Gameplay bot timed out after 180 seconds",
- }
- except Exception as e:
- results["gameplay_bot"] = {
- "pass": False,
- "score": 0,
- "error": str(e),
- }
# Compute weighted score from scoring.yaml
try:
diff --git a/tasks/tetris/eval/code-analysis.py b/tasks/tetris/eval/code-analysis.py
@@ -309,6 +309,17 @@ def main():
# ---- Compute score ----
score = 100
+ # No code at all = 0 score (empty workspace or build failed)
+ if total_loc < 50:
+ results["score"] = 0.0
+ results["score_reason"] = f"insufficient code ({total_loc} LOC, minimum 50)"
+ print(json.dumps(results, indent=2))
+ return
+
+ # No HTML entry point = major penalty (game can't be played)
+ if html_valid == "no_html":
+ score -= 40
+
# Unnecessary files (-10 each, max -30)
score -= min(len(unnecessary) * 10, 30)
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -50,7 +50,7 @@ export async function runAllTests(
if (!pageLoaded) {
const remainingTests = [
"game_starts", "auto_drop", "move_left", "move_right", "move_down",
- "rotate", "hard_drop", "piece_locks", "new_piece_spawns",
+ "rotate", "all_pieces_rotate", "hard_drop", "piece_locks", "new_piece_spawns",
"multiple_pieces", "line_clear", "score_changes", "game_over",
"playable_30s",
];