commit 821022cb2060158a630e184a67c95678e8dca7c7
parent 00055378a50253cc949795147e20b64ed2a2767f
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 12 Apr 2026 17:43:33 +0200
Switch production eval to V2 gameplay bot
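Harness now uses gameplay-bot-v2 (two-tier architecture) when available and
falls back to V1 otherwise. V2 has 95% agreement with human calibration
(vs V1's 58%). playwright.config.ts is now read from the selected bot's
directory instead of eval/, which also applies to the V1 fallback.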
Expect breakage on canvas games without a GPU (getImageData returns zeros).
DOM games should work well.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -442,7 +442,10 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
results["transcript_analysis"] = {"error": str(e), "score": 0}
# Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
- gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts"
+ # Use V2 bot (two-tier architecture) if available, fall back to V1
+ gameplay_bot_v2 = task_dir / "eval" / "gameplay-bot-v2" / "index.ts"
+ gameplay_bot_v1 = task_dir / "eval" / "gameplay-bot" / "index.ts"
+ gameplay_bot_entry = gameplay_bot_v2 if gameplay_bot_v2.exists() else gameplay_bot_v1
if gameplay_bot_entry.exists():
# Pre-check: is there an HTML file to test?
html_files = list(workspace.rglob("*.html"))
@@ -455,7 +458,8 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
}
else:
report_path = run_dir / "gameplay-bot-report.json"
- playwright_config = task_dir / "eval" / "playwright.config.ts"
+ bot_dir = gameplay_bot_entry.parent
+ playwright_config = bot_dir / "playwright.config.ts"
try:
bot_env = os.environ.copy()
bot_env["WORKSPACE_PATH"] = str(workspace)