commit b17300f58730efeae5ef71f559fe28e087deabfc
parent 53625f81965fd7797eaf3cabb4dd67b44a4a8fd5
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 08:15:38 +0200
Fix bad run detection, wire gameplay bot, fix compare page, improve rotation test
Harness (run.py):
- Added is_valid_run(), which flags a run as invalid on: timeout (exit code 124),
zero/null/missing cost, a single turn, an "Invalid API key" result, or a
short (< 5 lines) or missing transcript
- Resume now auto-deletes invalid runs instead of skipping them
- Gameplay bot integrated into evaluate(): runs Playwright bot for tasks
that have it, captures the JSON report, includes score in overall eval
Dashboard:
- Compare page fixed: avg_time now reads from avg_wall_time correctly
Eval:
- Rotation test uses grid diffing to isolate active piece from settled cells
- Increased rotation test attempts from 40 to 60
- Scoring weights updated: gameplay_bot at 10%, functional reduced to 25%
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 221 insertions(+), 37 deletions(-)
diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro
@@ -50,7 +50,7 @@ for (const axis of AXIS_NAMES) {
stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-",
avg_cost: stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-",
avg_time:
- stats.avg_time != null ? Math.round(stats.avg_time) + "s" : "-",
+ stats.avg_wall_time != null ? Math.round(stats.avg_wall_time) + "s" : "-",
});
}
}
diff --git a/harness/run.py b/harness/run.py
@@ -256,6 +256,7 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
"quality": None,
"code_analysis": None,
"transcript_analysis": None,
+ "gameplay_bot": None,
"score": None,
}
@@ -300,6 +301,60 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
except Exception as e:
results["transcript_analysis"] = {"error": str(e), "score": 0}
+ # Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
+ gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts"
+ if gameplay_bot_entry.exists():
+ report_path = run_dir / "gameplay-bot-report.json"
+ playwright_config = task_dir / "eval" / "playwright.config.ts"
+ try:
+ bot_env = os.environ.copy()
+ bot_env["WORKSPACE_PATH"] = str(workspace)
+ bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
+ bot_result = subprocess.run(
+ ["npx", "playwright", "test", "--config", str(playwright_config)],
+ cwd=str(PROJECT_DIR),
+ capture_output=True,
+ text=True,
+ timeout=180,
+ env=bot_env,
+ )
+ if report_path.exists():
+ report_data = json.loads(report_path.read_text())
+ summary = report_data.get("summary", {})
+ results["gameplay_bot"] = {
+ "pass": summary.get("failed", 1) == 0,
+ "score": summary.get("score", 0),
+ "total": summary.get("total", 0),
+ "passed": summary.get("passed", 0),
+ "failed": summary.get("failed", 0),
+ "report": report_data,
+ }
+ else:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": f"Report file not created. Exit code: {bot_result.returncode}. "
+ f"stderr: {bot_result.stderr[:1000]}",
+ }
+ except FileNotFoundError:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
+ }
+ except subprocess.TimeoutExpired:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": "Gameplay bot timed out after 180 seconds",
+ }
+ except Exception as e:
+ results["gameplay_bot"] = {
+ "pass": False,
+ "score": 0,
+ "error": str(e),
+ }
+
# Compute weighted score from scoring.yaml
try:
scoring_file = task_dir / "scoring.yaml"
@@ -376,6 +431,66 @@ def log(msg: str):
print(msg, flush=True)
+def is_valid_run(run_dir: Path) -> bool:
+ """Check whether a completed run directory contains valid results.
+
+ Returns False (invalid) if any of these are true:
+ - claude_output.json has total_cost_usd of 0, null, or missing
+ - claude_output.json has num_turns of 1, null, or missing (real runs always have >1)
+ - meta.json has exit_code of 124 (timeout)
+ - claude_output.json contains "Invalid API key" in the result field
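+ - transcript.jsonl has fewer than 5 lines (too short to be a real session)
+ - transcript.jsonl or claude_output.json is missing (a missing meta.json is tolerated)
+ - any of these files exists but cannot be read, or meta.json / claude_output.json
+ is not valid JSON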
+ """
+ # Check meta.json for timeout
+ meta_path = run_dir / "meta.json"
+ if meta_path.exists():
+ try:
+ meta = json.loads(meta_path.read_text())
+ if meta.get("exit_code") == 124:
+ return False
+ except (json.JSONDecodeError, OSError):
+ return False
+
+ # Check transcript.jsonl line count
+ transcript_path = run_dir / "transcript.jsonl"
+ if transcript_path.exists():
+ try:
+ lines = transcript_path.read_text().strip().split("\n")
+ if len(lines) < 5:
+ return False
+ except OSError:
+ return False
+ else:
+ return False
+
+ # Check claude_output.json
+ output_path = run_dir / "claude_output.json"
+ if output_path.exists():
+ try:
+ output = json.loads(output_path.read_text())
+ except (json.JSONDecodeError, OSError):
+ return False
+
+ # total_cost_usd: 0, null, or missing
+ cost = output.get("total_cost_usd")
+ if not cost: # catches None, 0, 0.0, and missing (None from .get)
+ return False
+
+ # num_turns: 1, null, or missing
+ num_turns = output.get("num_turns")
+ if num_turns is None or num_turns <= 1:
+ return False
+
+ # "Invalid API key" in result field
+ result_text = output.get("result", "")
+ if isinstance(result_text, str) and "Invalid API key" in result_text:
+ return False
+ else:
+ return False
+
+ return True
+
+
def run_single(
cell: dict,
run_num: int,
@@ -391,10 +506,14 @@ def run_single(
run_id = f"{cell_id}_run{run_num}"
run_dir = results_dir / "runs" / run_id
- # Resume support
+ # Resume support: skip only if the run completed AND is valid
if (run_dir / "eval_results.json").exists():
- log(f"SKIP: {run_id}")
- return "skipped"
+ if is_valid_run(run_dir):
+ log(f"SKIP: {run_id}")
+ return "skipped"
+ else:
+ log(f"INVALID: {run_id} - deleting and re-running")
+ shutil.rmtree(run_dir)
log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -287,7 +287,8 @@ export async function runAllTests(
try {
if (cal.scoreElementSelector) {
const scoreText = await page.textContent(cal.scoreElementSelector);
- const score = parseInt(scoreText?.replace(/\D/g, "") || "0", 10);
+ const nums = extractScoreFromText(scoreText);
+ const score = Math.max(...nums);
if (score > gameplay.max_score_observed) {
gameplay.max_score_observed = score;
}
@@ -387,19 +388,35 @@ async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResul
}
/**
- * Detect the active piece's shape from the grid by finding cells that are
- * filled in the current grid but weren't in a "settled" snapshot.
- * Returns the bounding box dimensions (width x height) or null.
+ * Detect the active piece's shape by diffing two grids: one taken before
+ * the piece spawned (or the settled state) and the current grid.
+ * Cells present in `current` but absent in `settled` are the active piece.
+ * Falls back to scanning the top 6 rows if no settled grid is provided or
+ * if its row count differs from that of `current`.
*/
-function detectPieceShape(grid: boolean[][] | null): { w: number; h: number; cells: number } | null {
- if (!grid) return null;
+function detectPieceShape(
+ current: boolean[][] | null,
+ settled?: boolean[][] | null,
+): { w: number; h: number; cells: number } | null {
+ if (!current) return null;
- // Find filled cells in the top 6 rows (where new pieces spawn/fall)
const activeCells: Array<[number, number]> = [];
- for (let row = 0; row < Math.min(6, grid.length); row++) {
- for (let col = 0; col < grid[row].length; col++) {
- if (grid[row][col]) {
- activeCells.push([row, col]);
+
+ if (settled && settled.length === current.length) {
+ // Diff approach: cells in current but not in settled = the active piece
+ for (let row = 0; row < current.length; row++) {
+ for (let col = 0; col < current[row].length; col++) {
+ if (current[row][col] && !settled[row][col]) {
+ activeCells.push([row, col]);
+ }
+ }
+ }
+ } else {
+ // Fallback: scan top 6 rows (original behavior; used when there is no settled grid or its row count does not match)
+ for (let row = 0; row < Math.min(6, current.length); row++) {
+ for (let col = 0; col < current[row].length; col++) {
+ if (current[row][col]) {
+ activeCells.push([row, col]);
+ }
}
}
}
@@ -439,34 +456,39 @@ async function testAllPiecesRotate(
await page.reload();
await page.waitForTimeout(1000);
- // Start the game
- if (cal.start_mechanism === "button") {
+ // Start the game (use camelCase startMechanism from CalibrationResult)
+ if (cal.startMechanism === "button") {
const btn = page.locator("button").filter({ hasText: /start|play|begin|new/i }).first();
if (await btn.count() > 0) await btn.click();
- } else if (cal.start_mechanism === "space") {
+ } else if (cal.startMechanism === "space") {
await page.keyboard.press("Space");
- } else if (cal.start_mechanism === "enter") {
+ } else if (cal.startMechanism === "enter") {
await page.keyboard.press("Enter");
- } else if (cal.start_mechanism === "click") {
+ } else if (cal.startMechanism === "click_canvas") {
await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true });
}
await page.waitForTimeout(1000);
const rotatedPieces = new Set<string>();
const failedPieces = new Set<string>();
- const maxAttempts = 40; // Play up to 40 pieces to find all types
+ const maxAttempts = 60; // Play up to 60 pieces to find all types
+
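+ // Settled grid: snapshot read after each drop, intended to capture the board before the next piece spawns.
+ // It starts as null, so shape detection for the first piece uses the top-rows fallback.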
+ let settledGrid: boolean[][] | null = null;
for (let attempt = 0; attempt < maxAttempts; attempt++) {
await page.waitForTimeout(300);
const gridBefore = await readGrid(page, cal);
- const shapeBefore = detectPieceShape(gridBefore);
+ const shapeBefore = detectPieceShape(gridBefore, settledGrid);
if (!shapeBefore) {
// Can't read the piece, drop it and try the next one
await page.keyboard.press(cal.controls.drop);
gameplay.pieces_placed++;
await page.waitForTimeout(500);
+ // Capture settled state right after a piece lands
+ settledGrid = await readGrid(page, cal);
continue;
}
@@ -477,6 +499,7 @@ async function testAllPiecesRotate(
await page.keyboard.press(cal.controls.drop);
gameplay.pieces_placed++;
await page.waitForTimeout(500);
+ settledGrid = await readGrid(page, cal);
continue;
}
@@ -485,6 +508,7 @@ async function testAllPiecesRotate(
await page.keyboard.press(cal.controls.drop);
gameplay.pieces_placed++;
await page.waitForTimeout(500);
+ settledGrid = await readGrid(page, cal);
continue;
}
@@ -493,7 +517,7 @@ async function testAllPiecesRotate(
await page.waitForTimeout(300);
const gridAfter = await readGrid(page, cal);
- const shapeAfter = detectPieceShape(gridAfter);
+ const shapeAfter = detectPieceShape(gridAfter, settledGrid);
if (shapeAfter) {
const changed = shapeBefore.w !== shapeAfter.w || shapeBefore.h !== shapeAfter.h;
@@ -521,9 +545,11 @@ async function testAllPiecesRotate(
gameplay.pieces_placed++;
await page.waitForTimeout(500);
+ // Capture settled state right after a piece lands (before next piece spawns)
+ settledGrid = await readGrid(page, cal);
+
// Check if game is over
- const currentGrid = await readGrid(page, cal);
- if (currentGrid && hasFilledInTopRows(currentGrid, 2)) {
+ if (settledGrid && hasFilledInTopRows(settledGrid, 2)) {
break;
}
}
@@ -732,6 +758,25 @@ async function testLineClear(
return { name: "line_clear", pass: false, detail: "could not trigger or detect a line clear" };
}
+/**
+ * Extract candidate score values from potentially concatenated text.
+ * Handles cases like "Score: 100Level: 1Lines: 5" or "Score100Level1Lines5":
+ * a labeled "score" value is returned as a single-element array; otherwise
+ * every number found in the text is returned (or [0] when there are none).
+ */
+function extractScoreFromText(text: string | null): number[] {
+ if (!text) return [0];
+
+ // Try labeled extraction: "Score: 100" or "Score100" or "score 100"
+ const labeledMatch = text.match(/score\s*[:\-=]?\s*(\d+)/i);
+ if (labeledMatch) {
+ return [parseInt(labeledMatch[1], 10)];
+ }
+
+ // Extract all individual numbers from the text
+ const allNumbers = (text.match(/\d+/g) || []).map(Number);
+ return allNumbers.length > 0 ? allNumbers : [0];
+}
+
async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<TestResult> {
if (!cal.scoreElementSelector) {
// Try to find any number on the page that changes
@@ -756,7 +801,7 @@ async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<Tes
try {
const scoreBefore = await page.textContent(cal.scoreElementSelector);
- const numBefore = parseInt(scoreBefore?.replace(/\D/g, "") || "0", 10);
+ const numsBefore = extractScoreFromText(scoreBefore);
// Play a bit to change the score
for (let i = 0; i < 5; i++) {
@@ -764,20 +809,40 @@ async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<Tes
await page.waitForTimeout(300);
}
- const scoreAfter = await page.textContent(cal.scoreElementSelector);
- const numAfter = parseInt(scoreAfter?.replace(/\D/g, "") || "0", 10);
+ // Poll for score change: check multiple times over 2 seconds
+ for (let poll = 0; poll < 4; poll++) {
+ await page.waitForTimeout(500);
- if (numAfter > numBefore) {
- return {
- name: "score_changes",
- pass: true,
- detail: `score changed from ${numBefore} to ${numAfter}`,
- };
+ const scoreAfter = await page.textContent(cal.scoreElementSelector);
+ const numsAfter = extractScoreFromText(scoreAfter);
+
+ // Compare each extracted number: if any number increased, score changed
+ for (let i = 0; i < Math.min(numsBefore.length, numsAfter.length); i++) {
+ if (numsAfter[i] > numsBefore[i]) {
+ return {
+ name: "score_changes",
+ pass: true,
+ detail: `score changed from ${numsBefore[i]} to ${numsAfter[i]}`,
+ };
+ }
+ }
+
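+ // Also pass if the largest number after exceeds the largest number before
+ // (covers reads that yield a different count of numbers, which the index-wise loop above truncates)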
+ const maxBefore = Math.max(...numsBefore);
+ const maxAfter = Math.max(...numsAfter);
+ if (maxAfter > maxBefore) {
+ return {
+ name: "score_changes",
+ pass: true,
+ detail: `score changed: max value ${maxBefore} -> ${maxAfter}`,
+ };
+ }
}
+
return {
name: "score_changes",
pass: false,
- detail: `score did not increase: ${numBefore} -> ${numAfter}`,
+ detail: `score did not increase: [${numsBefore.join(", ")}] -> no change after polling`,
};
} catch {
return { name: "score_changes", pass: false, detail: "could not read score element" };
diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml
@@ -1,7 +1,7 @@
weights:
- functional: 0.35
+ functional: 0.25
structural: 0.10
quality: 0.20
+ gameplay_bot: 0.10
code_analysis: 0.15
transcript_analysis: 0.10
- # gameplay_bot will be added here once wired (0.10 from functional)