Fix bad run detection, wire gameplay bot, fix compare page, improve rotation test - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit b17300f58730efeae5ef71f559fe28e087deabfc
parent 53625f81965fd7797eaf3cabb4dd67b44a4a8fd5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:15:38 +0200

Fix bad run detection, wire gameplay bot, fix compare page, improve rotation test

Harness (run.py):
- Added is_valid_run() that catches all failure modes: timeout, null cost,
  1 turn, invalid API key, short transcript
- Resume now auto-deletes invalid runs instead of skipping them
- Gameplay bot integrated into evaluate(): runs Playwright bot for tasks
  that have it, captures the JSON report, includes score in overall eval

Dashboard:
- Compare page fixed: avg_time now reads from avg_wall_time correctly

Eval:
- Rotation test uses grid diffing to isolate active piece from settled cells
- Increased rotation test attempts from 40 to 60
- Scoring weights updated: gameplay_bot at 10%, functional reduced to 25%

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/pages/compare.astro  | 2 +-
M harness/run.py  | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M tasks/tetris/eval/gameplay-bot/tests.ts  | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M tasks/tetris/scoring.yaml  | 4 ++--

4 files changed, 221 insertions(+), 37 deletions(-)
diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro
@@ -50,7 +50,7 @@ for (const axis of AXIS_NAMES) {
         stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-",
       avg_cost: stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-",
       avg_time:
-        stats.avg_time != null ? Math.round(stats.avg_time) + "s" : "-",
+        stats.avg_wall_time != null ? Math.round(stats.avg_wall_time) + "s" : "-",
     });
   }
 }
diff --git a/harness/run.py b/harness/run.py
@@ -256,6 +256,7 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
         "quality": None,
         "code_analysis": None,
         "transcript_analysis": None,
+        "gameplay_bot": None,
         "score": None,
     }
 
@@ -300,6 +301,60 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
         except Exception as e:
             results["transcript_analysis"] = {"error": str(e), "score": 0}
 
+    # Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
+    gameplay_bot_entry = task_dir / "eval" / "gameplay-bot" / "index.ts"
+    if gameplay_bot_entry.exists():
+        report_path = run_dir / "gameplay-bot-report.json"
+        playwright_config = task_dir / "eval" / "playwright.config.ts"
+        try:
+            bot_env = os.environ.copy()
+            bot_env["WORKSPACE_PATH"] = str(workspace)
+            bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
+            bot_result = subprocess.run(
+                ["npx", "playwright", "test", "--config", str(playwright_config)],
+                cwd=str(PROJECT_DIR),
+                capture_output=True,
+                text=True,
+                timeout=180,
+                env=bot_env,
+            )
+            if report_path.exists():
+                report_data = json.loads(report_path.read_text())
+                summary = report_data.get("summary", {})
+                results["gameplay_bot"] = {
+                    "pass": summary.get("failed", 1) == 0,
+                    "score": summary.get("score", 0),
+                    "total": summary.get("total", 0),
+                    "passed": summary.get("passed", 0),
+                    "failed": summary.get("failed", 0),
+                    "report": report_data,
+                }
+            else:
+                results["gameplay_bot"] = {
+                    "pass": False,
+                    "score": 0,
+                    "error": f"Report file not created. Exit code: {bot_result.returncode}. "
+                             f"stderr: {bot_result.stderr[:1000]}",
+                }
+        except FileNotFoundError:
+            results["gameplay_bot"] = {
+                "pass": False,
+                "score": 0,
+                "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
+            }
+        except subprocess.TimeoutExpired:
+            results["gameplay_bot"] = {
+                "pass": False,
+                "score": 0,
+                "error": "Gameplay bot timed out after 180 seconds",
+            }
+        except Exception as e:
+            results["gameplay_bot"] = {
+                "pass": False,
+                "score": 0,
+                "error": str(e),
+            }
+
     # Compute weighted score from scoring.yaml
     try:
         scoring_file = task_dir / "scoring.yaml"
@@ -376,6 +431,66 @@ def log(msg: str):
         print(msg, flush=True)
 
 
+def is_valid_run(run_dir: Path) -> bool:
+    """Check whether a completed run directory contains valid results.
+
+    Returns False (invalid) if any of these are true:
+    - claude_output.json is missing, unreadable, or unparseable, or has total_cost_usd of 0, null, or missing
+    - claude_output.json has num_turns of 1 or less, null, or missing (real runs always have >1)
+    - meta.json is unreadable/unparseable or has exit_code of 124 (timeout); a missing meta.json is tolerated
+    - claude_output.json contains "Invalid API key" in the result field
+    - transcript.jsonl is missing, unreadable, or has fewer than 5 lines (too short to be a real session)
+    """
+    # Check meta.json for timeout
+    meta_path = run_dir / "meta.json"
+    if meta_path.exists():
+        try:
+            meta = json.loads(meta_path.read_text())
+            if meta.get("exit_code") == 124:
+                return False
+        except (json.JSONDecodeError, OSError):
+            return False
+
+    # Check transcript.jsonl line count
+    transcript_path = run_dir / "transcript.jsonl"
+    if transcript_path.exists():
+        try:
+            lines = transcript_path.read_text().strip().split("\n")
+            if len(lines) < 5:
+                return False
+        except OSError:
+            return False
+    else:
+        return False
+
+    # Check claude_output.json
+    output_path = run_dir / "claude_output.json"
+    if output_path.exists():
+        try:
+            output = json.loads(output_path.read_text())
+        except (json.JSONDecodeError, OSError):
+            return False
+
+        # total_cost_usd: 0, null, or missing
+        cost = output.get("total_cost_usd")
+        if not cost:  # catches None, 0, 0.0, and missing (None from .get)
+            return False
+
+        # num_turns: 1, null, or missing
+        num_turns = output.get("num_turns")
+        if num_turns is None or num_turns <= 1:
+            return False
+
+        # "Invalid API key" in result field
+        result_text = output.get("result", "")
+        if isinstance(result_text, str) and "Invalid API key" in result_text:
+            return False
+    else:
+        return False
+
+    return True
+
+
 def run_single(
     cell: dict,
     run_num: int,
@@ -391,10 +506,14 @@ def run_single(
     run_id = f"{cell_id}_run{run_num}"
     run_dir = results_dir / "runs" / run_id
 
-    # Resume support
+    # Resume support: skip only if the run completed AND is valid
     if (run_dir / "eval_results.json").exists():
-        log(f"SKIP: {run_id}")
-        return "skipped"
+        if is_valid_run(run_dir):
+            log(f"SKIP: {run_id}")
+            return "skipped"
+        else:
+            log(f"INVALID: {run_id} - deleting and re-running")
+            shutil.rmtree(run_dir)
 
     log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
 
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -287,7 +287,8 @@ export async function runAllTests(
   try {
     if (cal.scoreElementSelector) {
       const scoreText = await page.textContent(cal.scoreElementSelector);
-      const score = parseInt(scoreText?.replace(/\D/g, "") || "0", 10);
+      const nums = extractScoreFromText(scoreText);
+      const score = Math.max(...nums);
       if (score > gameplay.max_score_observed) {
         gameplay.max_score_observed = score;
       }
@@ -387,19 +388,35 @@ async function testRotate(page: Page, cal: CalibrationResult): Promise<TestResul
 }
 
 /**
- * Detect the active piece's shape from the grid by finding cells that are
- * filled in the current grid but weren't in a "settled" snapshot.
- * Returns the bounding box dimensions (width x height) or null.
+ * Detect the active piece's shape by diffing two grids: one taken before
+ * the piece spawned (or the settled state) and the current grid.
+ * Cells present in `current` but absent in `settled` are the active piece.
+ * Falls back to scanning the top 6 rows if no settled grid is provided or its row count differs from `current`.
  */
-function detectPieceShape(grid: boolean[][] | null): { w: number; h: number; cells: number } | null {
-  if (!grid) return null;
+function detectPieceShape(
+  current: boolean[][] | null,
+  settled?: boolean[][] | null,
+): { w: number; h: number; cells: number } | null {
+  if (!current) return null;
 
-  // Find filled cells in the top 6 rows (where new pieces spawn/fall)
   const activeCells: Array<[number, number]> = [];
-  for (let row = 0; row < Math.min(6, grid.length); row++) {
-    for (let col = 0; col < grid[row].length; col++) {
-      if (grid[row][col]) {
-        activeCells.push([row, col]);
+
+  if (settled && settled.length === current.length) {
+    // Diff approach: cells in current but not in settled = the active piece
+    for (let row = 0; row < current.length; row++) {
+      for (let col = 0; col < current[row].length; col++) {
+        if (current[row][col] && !settled[row][col]) {
+          activeCells.push([row, col]);
+        }
+      }
+    }
+  } else {
+    // Fallback: scan top 6 rows (original behavior, used when no settled grid)
+    for (let row = 0; row < Math.min(6, current.length); row++) {
+      for (let col = 0; col < current[row].length; col++) {
+        if (current[row][col]) {
+          activeCells.push([row, col]);
+        }
       }
     }
   }
@@ -439,34 +456,39 @@ async function testAllPiecesRotate(
   await page.reload();
   await page.waitForTimeout(1000);
 
-  // Start the game
-  if (cal.start_mechanism === "button") {
+  // Start the game (use camelCase startMechanism from CalibrationResult)
+  if (cal.startMechanism === "button") {
     const btn = page.locator("button").filter({ hasText: /start|play|begin|new/i }).first();
     if (await btn.count() > 0) await btn.click();
-  } else if (cal.start_mechanism === "space") {
+  } else if (cal.startMechanism === "space") {
     await page.keyboard.press("Space");
-  } else if (cal.start_mechanism === "enter") {
+  } else if (cal.startMechanism === "enter") {
     await page.keyboard.press("Enter");
-  } else if (cal.start_mechanism === "click") {
+  } else if (cal.startMechanism === "click_canvas") {
     await page.locator("canvas, [class*='game'], [id*='game']").first().click({ force: true });
   }
   await page.waitForTimeout(1000);
 
   const rotatedPieces = new Set<string>();
   const failedPieces = new Set<string>();
-  const maxAttempts = 40; // Play up to 40 pieces to find all types
+  const maxAttempts = 60; // Play up to 60 pieces to find all types
+
+  // Capture the settled grid (state right after drop, before next piece spawns)
+  let settledGrid: boolean[][] | null = null;
 
   for (let attempt = 0; attempt < maxAttempts; attempt++) {
     await page.waitForTimeout(300);
 
     const gridBefore = await readGrid(page, cal);
-    const shapeBefore = detectPieceShape(gridBefore);
+    const shapeBefore = detectPieceShape(gridBefore, settledGrid);
 
     if (!shapeBefore) {
       // Can't read the piece, drop it and try the next one
       await page.keyboard.press(cal.controls.drop);
       gameplay.pieces_placed++;
       await page.waitForTimeout(500);
+      // Capture settled state right after a piece lands
+      settledGrid = await readGrid(page, cal);
       continue;
     }
 
@@ -477,6 +499,7 @@ async function testAllPiecesRotate(
       await page.keyboard.press(cal.controls.drop);
       gameplay.pieces_placed++;
       await page.waitForTimeout(500);
+      settledGrid = await readGrid(page, cal);
       continue;
     }
 
@@ -485,6 +508,7 @@ async function testAllPiecesRotate(
       await page.keyboard.press(cal.controls.drop);
       gameplay.pieces_placed++;
       await page.waitForTimeout(500);
+      settledGrid = await readGrid(page, cal);
       continue;
     }
 
@@ -493,7 +517,7 @@ async function testAllPiecesRotate(
     await page.waitForTimeout(300);
 
     const gridAfter = await readGrid(page, cal);
-    const shapeAfter = detectPieceShape(gridAfter);
+    const shapeAfter = detectPieceShape(gridAfter, settledGrid);
 
     if (shapeAfter) {
       const changed = shapeBefore.w !== shapeAfter.w || shapeBefore.h !== shapeAfter.h;
@@ -521,9 +545,11 @@ async function testAllPiecesRotate(
     gameplay.pieces_placed++;
     await page.waitForTimeout(500);
 
+    // Capture settled state right after a piece lands (before next piece spawns)
+    settledGrid = await readGrid(page, cal);
+
     // Check if game is over
-    const currentGrid = await readGrid(page, cal);
-    if (currentGrid && hasFilledInTopRows(currentGrid, 2)) {
+    if (settledGrid && hasFilledInTopRows(settledGrid, 2)) {
       break;
     }
   }
@@ -732,6 +758,25 @@ async function testLineClear(
   return { name: "line_clear", pass: false, detail: "could not trigger or detect a line clear" };
 }
 
+/**
+ * Extract candidate score numbers from potentially concatenated text.
+ * Handles cases like "Score: 100Level: 1Lines: 5" or "Score100Level1Lines5"
+ * by returning a labeled "score" value alone, or else every number found in the text ([0] if none).
+ */
+function extractScoreFromText(text: string | null): number[] {
+  if (!text) return [0];
+
+  // Try labeled extraction: "Score: 100" or "Score100" or "score 100"
+  const labeledMatch = text.match(/score\s*[:\-=]?\s*(\d+)/i);
+  if (labeledMatch) {
+    return [parseInt(labeledMatch[1], 10)];
+  }
+
+  // Extract all individual numbers from the text
+  const allNumbers = (text.match(/\d+/g) || []).map(Number);
+  return allNumbers.length > 0 ? allNumbers : [0];
+}
+
 async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<TestResult> {
   if (!cal.scoreElementSelector) {
     // Try to find any number on the page that changes
@@ -756,7 +801,7 @@ async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<Tes
 
   try {
     const scoreBefore = await page.textContent(cal.scoreElementSelector);
-    const numBefore = parseInt(scoreBefore?.replace(/\D/g, "") || "0", 10);
+    const numsBefore = extractScoreFromText(scoreBefore);
 
     // Play a bit to change the score
     for (let i = 0; i < 5; i++) {
@@ -764,20 +809,40 @@ async function testScoreChanges(page: Page, cal: CalibrationResult): Promise<Tes
       await page.waitForTimeout(300);
     }
 
-    const scoreAfter = await page.textContent(cal.scoreElementSelector);
-    const numAfter = parseInt(scoreAfter?.replace(/\D/g, "") || "0", 10);
+    // Poll for score change: check multiple times over 2 seconds
+    for (let poll = 0; poll < 4; poll++) {
+      await page.waitForTimeout(500);
 
-    if (numAfter > numBefore) {
-      return {
-        name: "score_changes",
-        pass: true,
-        detail: `score changed from ${numBefore} to ${numAfter}`,
-      };
+      const scoreAfter = await page.textContent(cal.scoreElementSelector);
+      const numsAfter = extractScoreFromText(scoreAfter);
+
+      // Compare each extracted number: if any number increased, score changed
+      for (let i = 0; i < Math.min(numsBefore.length, numsAfter.length); i++) {
+        if (numsAfter[i] > numsBefore[i]) {
+          return {
+            name: "score_changes",
+            pass: true,
+            detail: `score changed from ${numsBefore[i]} to ${numsAfter[i]}`,
+          };
+        }
+      }
+
+      // Also check if any new number appeared that's larger than any before number
+      const maxBefore = Math.max(...numsBefore);
+      const maxAfter = Math.max(...numsAfter);
+      if (maxAfter > maxBefore) {
+        return {
+          name: "score_changes",
+          pass: true,
+          detail: `score changed: max value ${maxBefore} -> ${maxAfter}`,
+        };
+      }
     }
+
     return {
       name: "score_changes",
       pass: false,
-      detail: `score did not increase: ${numBefore} -> ${numAfter}`,
+      detail: `score did not increase: [${numsBefore.join(", ")}] -> no change after polling`,
     };
   } catch {
     return { name: "score_changes", pass: false, detail: "could not read score element" };
diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml
@@ -1,7 +1,7 @@
 weights:
-  functional: 0.35
+  functional: 0.25
   structural: 0.10
   quality: 0.20
+  gameplay_bot: 0.10
   code_analysis: 0.15
   transcript_analysis: 0.10
-  # gameplay_bot will be added here once wired (0.10 from functional)

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/pages/compare.astro	\|	2	+-
M	harness/run.py	\|	125	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	tasks/tetris/eval/gameplay-bot/tests.ts	\|	127	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M	tasks/tetris/scoring.yaml	\|	4	++--