commit d748de6f4a388178c427cd55d11db0149a9d0d5b
parent dcef6a4928511792f670d74ad63b8e1b9a7bde45
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Thu, 9 Apr 2026 07:41:41 +0200
Add bot calibration page with human vs bot comparison
Hidden /calibrate page showing hand-picked games with human test results
side-by-side with bot results. Data is JSON-powered (one file per game in
calibration/), references canonical bot results from eval_results.json.
Initial 5 entries from manual testing of DOM-rendered games:
- 2 games match well (80-85% bot vs human "playable")
- 1 false negative (bot 18%, human says playable -- likely GPU issue)
- 1 overlay bug correctly identified by human, bot confused
- 1 genuinely broken game (both agree: won't start)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
6 files changed, 332 insertions(+), 0 deletions(-)
diff --git a/dashboard/src/pages/calibrate.astro b/dashboard/src/pages/calibrate.astro
@@ -0,0 +1,167 @@
+---
+import Base from "../layouts/Base.astro";
+import fs from "node:fs";
+import path from "node:path";
+import { loadAllRuns } from "../lib/data";
+
+// Load calibration data: the JSON files live in this directory, which is resolved against the process working directory (not this file's location)
+const calibrationDir = path.resolve(process.cwd(), "../tasks/tetris/eval/gameplay-bot/calibration");
+interface CalibrationEntry { // shape of one calibration JSON file as this page reads it
+ run_id: string; // looked up against run.meta.run_id to find the bot results; also embedded in the artifact URL
+ short_id: string; // used to build the /r/<short_id> run-detail link
+ label: string; // rendered as the card heading
+ notes: string; // free text; the notes box is rendered only when this is non-empty
+ human_tested_at: string; // printed verbatim after "Tested"
+ human_tests: Record<string, boolean | null>; // test name -> human verdict: true = yes, false = no, null = not answered
+}
+
+const entries: CalibrationEntry[] = []; // one element per parseable *.json file found in calibrationDir
+if (fs.existsSync(calibrationDir)) { // a missing directory leaves entries empty, which renders the empty-state card
+  for (const file of fs.readdirSync(calibrationDir).filter(name => name.endsWith(".json"))) {
+    try {
+      entries.push(JSON.parse(fs.readFileSync(path.join(calibrationDir, file), "utf-8")));
+    } catch (err) { // unreadable or malformed file: still skipped so the page builds, but no longer silently
+      console.warn(`[calibrate] skipping ${file}:`, err);
+    }
+  }
+}
+
+// Index the loaded bot runs by run_id so calibration entries can be joined to them
+const allRuns = loadAllRuns();
+const runsByRunId = new Map(allRuns.map(loaded => [loaded.meta.run_id, loaded]));
+
+// For every entry: the matching run (undefined if none) plus a test-name -> bot-test lookup
+const comparisons = entries.map(entry => {
+  const run = runsByRunId.get(entry.run_id);
+  const reported = run?.eval_results?.gameplay_bot?.report?.tests as Array<{name: string; pass: boolean; detail: string}> | undefined;
+  const botByName = new Map((reported ?? []).map(test => [test.name, test]));
+  return { entry, run, botByName };
+});
+
+const allTestNames = [ // tests listed on every card, in table row order; agreement is computed over these names only
+ "game_loads", "game_starts", "auto_drop",
+ "move_left", "move_right", "move_down", "rotate", "hard_drop", "all_pieces_rotate",
+ "piece_locks", "new_piece_spawns", "multiple_pieces",
+ "line_clear", "score_changes",
+ "game_over", "playable_30s",
+ "multi_line_clear", "score_scaling", "level_progression", "speed_progression",
+ "next_piece_preview", "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct",
+];
+---
+
+<Base title="Bot Calibration">
+ <h1 style="margin-bottom: 8px;">Bot Calibration</h1>
+ <p style="color: var(--text-muted); margin-bottom: 32px; font-size: 0.875rem;">
+ Hand-picked games with human test results compared to bot results. Used to identify false positives and false negatives.
+ </p>
+
+ {comparisons.map(({ entry, run, botByName }) => {
+ // Headline numbers for this card. botScore is undefined when no loaded run matches entry.run_id.
+ const botScore = run?.eval_results?.gameplay_bot?.score;
+ const humanVerdicts = Object.values(entry.human_tests);
+ const humanPass = humanVerdicts.filter(v => v === true).length;
+ const humanFail = humanVerdicts.filter(v => v === false).length;
+ const artifactUrl = `/artifacts/${entry.run_id}/index.html`;
+
+ // Count agreements/disagreements over the tests that BOTH sides answered.
+ // Everything else (human-only, bot-only, neither) is left out of the ratio.
+ let agree = 0, disagree = 0;
+ for (const name of allTestNames) {
+   // Human side: null means "not answered". A key missing from the JSON reads as
+   // undefined and is folded into null so it is not scored as a disagreement.
+   const human = entry.human_tests[name] ?? null;
+   if (human === null) continue;
+
+   // Bot side: no result for this test, or a detail starting with "skipped:", means no verdict.
+   const bot = botByName.get(name);
+   if (!bot || bot.detail?.startsWith("skipped:")) continue;
+
+   if (human === bot.pass) agree++;
+   else disagree++;
+ }
+
+ return (
+ <div class="card" style="padding: 20px; margin-bottom: 20px;">
+ <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 12px;">
+ <div>
+ <h3 style="margin: 0 0 4px 0; font-size: 1rem;">{entry.label}</h3>
+ <div style="font-size: 0.7rem; color: var(--text-muted);">
+ <a href={artifactUrl} target="_blank" style="color: var(--accent);">Play game</a>
+ {" | "}
+ <a href={`/r/${entry.short_id}`} style="color: var(--accent);">Run detail</a>
+ {" | "}
+ Tested {entry.human_tested_at}
+ </div>
+ </div>
+ <div style="display: flex; gap: 16px; font-size: 0.75rem; font-family: var(--font-mono);">{/* three stats: human pass ratio, bot score, agreement ratio */}
+   <div style="text-align: center;">
+     <div style="font-weight: 700; font-size: 1.1rem;">{humanPass}/{humanPass + humanFail}</div>
+     <div style="color: var(--text-muted); font-size: 0.6rem;">HUMAN</div>
+   </div>
+   <div style="text-align: center;">
+     <div style="font-weight: 700; font-size: 1.1rem;">{botScore != null ? `${Math.round(botScore * 100)}%` : "-"}</div>
+     <div style="color: var(--text-muted); font-size: 0.6rem;">BOT</div>
+   </div>
+   <div style="text-align: center;">{/* red as soon as one test disagrees; style must be an expression or the ternary is emitted as literal CSS text */}
+     <div style={`font-weight: 700; font-size: 1.1rem; color: ${disagree > 0 ? "var(--red)" : "var(--green)"};`}>{agree}/{agree + disagree}</div>
+     <div style="color: var(--text-muted); font-size: 0.6rem;">AGREE</div>
+   </div>
+ </div>
+ </div>
+
+ {entry.notes && (
+ <div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 12px; padding: 8px; background: hsl(var(--bg-secondary)); border-radius: 4px;">
+ {entry.notes}
+ </div>
+ )}
+
+ <table style="width: 100%; font-size: 0.7rem; border-collapse: collapse;">
+ <thead>
+ <tr style="border-bottom: 1px solid var(--border);">
+ <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Test</th>
+ <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Human</th>
+ <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Bot</th>
+ <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 40px;"></th>
+ <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Bot Detail</th>
+ </tr>
+ </thead>
+ <tbody>
+ {allTestNames.map(name => {
+   const human = entry.human_tests[name] ?? null; // a key missing from the JSON counts as null, i.e. not answered
+   const bot = botByName.get(name);
+   const botSkip = bot?.detail?.startsWith("skipped:");
+   const humanStr = human === true ? "yes" : human === false ? "no" : "-";
+   const humanColor = human === true ? "var(--green)" : human === false ? "var(--red)" : "var(--text-muted)";
+   const botStr = bot ? (botSkip ? "skip" : bot.pass ? "yes" : "no") : "-";
+   const botColor = bot ? (botSkip ? "var(--text-muted)" : bot.pass ? "var(--green)" : "var(--red)") : "var(--text-muted)";
+
+   // Agreement indicator: "=" or "!" only when both human and bot gave a verdict, otherwise left blank
+   let matchIcon = "";
+   let matchColor = "var(--text-muted)";
+   if (human !== null && bot && !botSkip) {
+     if (human === bot.pass) { matchIcon = "="; matchColor = "var(--green)"; }
+     else { matchIcon = "!"; matchColor = "var(--red)"; }
+   }
+
+   return (
+     <tr style="border-bottom: 1px solid hsl(var(--border) / 0.3);">
+       <td style="padding: 3px 8px; font-family: var(--font-mono);">{name}</td>
+       <td style={`text-align: center; padding: 3px 8px; color: ${humanColor}; font-weight: 600;`}>{humanStr}</td>
+       <td style={`text-align: center; padding: 3px 8px; color: ${botColor}; font-weight: 600;`}>{botStr}</td>
+       <td style={`text-align: center; padding: 3px 8px; color: ${matchColor}; font-weight: 700;`}>{matchIcon}</td>
+       <td style="padding: 3px 8px; color: var(--text-muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; max-width: 300px;">{bot?.detail || ""}</td>
+     </tr>
+   );
+ })}
+ </tbody>
+ </table>
+ </div>
+ );
+ })}
+
+ {entries.length === 0 && (
+ <div class="card" style="padding: 32px; text-align: center; color: var(--text-muted);">
+ No calibration entries. Add JSON files to tasks/tetris/eval/gameplay-bot/calibration/
+ </div>
+ )}
+</Base>
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json b/tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json
@@ -0,0 +1,33 @@
+{
+ "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=es_lang=ts_lint=on_budget=low_model=haiku45_pw=avail_prompt=detailed_rndr=none_strat=none_tst=none_tedit=off_tglob=off_tgrep=on_tread=off_twrite=on_web=off_run2",
+ "short_id": "4c7db3b9",
+ "label": "Spanish looks ok",
+ "notes": "Spanish game. Looks ok, playable.",
+ "human_tested_at": "2026-04-09",
+ "human_tests": {
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": true,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": null,
+ "all_pieces_rotate": null,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": null,
+ "score_changes": null,
+ "game_over": null,
+ "playable_30s": true,
+ "multi_line_clear": null,
+ "score_scaling": null,
+ "level_progression": null,
+ "speed_progression": null,
+ "next_piece_preview": null,
+ "game_over_display": null,
+ "counter_clockwise_rotation": null,
+ "soft_drop_distinct": null
+ }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json b/tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json
@@ -0,0 +1,33 @@
+{
+ "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=en_lang=uns_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run1",
+ "short_id": "7a348b81",
+ "label": "Start button broken",
+ "notes": "Clicking start game never does anything. Game is genuinely broken -- not a bot issue.",
+ "human_tested_at": "2026-04-09",
+ "human_tests": {
+ "game_loads": true,
+ "game_starts": false,
+ "auto_drop": null,
+ "move_left": null,
+ "move_right": null,
+ "move_down": null,
+ "rotate": null,
+ "hard_drop": null,
+ "all_pieces_rotate": null,
+ "piece_locks": null,
+ "new_piece_spawns": null,
+ "multiple_pieces": null,
+ "line_clear": null,
+ "score_changes": null,
+ "game_over": null,
+ "playable_30s": null,
+ "multi_line_clear": null,
+ "score_scaling": null,
+ "level_progression": null,
+ "speed_progression": null,
+ "next_piece_preview": null,
+ "game_over_display": null,
+ "counter_clockwise_rotation": null,
+ "soft_drop_distinct": null
+ }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json b/tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json
@@ -0,0 +1,33 @@
+{
+ "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=en_lang=uns_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run2",
+ "short_id": "8fe72fce",
+ "label": "English playable",
+ "notes": "Playable English game.",
+ "human_tested_at": "2026-04-09",
+ "human_tests": {
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": true,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": null,
+ "all_pieces_rotate": null,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": null,
+ "score_changes": null,
+ "game_over": null,
+ "playable_30s": true,
+ "multi_line_clear": null,
+ "score_scaling": null,
+ "level_progression": null,
+ "speed_progression": null,
+ "next_piece_preview": null,
+ "game_over_display": null,
+ "counter_clockwise_rotation": null,
+ "soft_drop_distinct": null
+ }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json b/tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json
@@ -0,0 +1,33 @@
+{
+ "run_id": "tetris_arch=none_ctx=provided_noise=clean_dsgn=none_eff=high_echk=none_hlang=es_lang=ts_lint=off_budget=low_model=haiku45_pw=avail_prompt=detailed_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=off_tread=off_twrite=off_web=off_run3",
+ "short_id": "93e8feea",
+ "label": "Spanish overlay bug",
+ "notes": "Spanish game with full-screen overlay. Has a start button but clicking never dismisses the overlay. Game is visible behind the transparency and the next block changes on click, but game is unplayable.",
+ "human_tested_at": "2026-04-09",
+ "human_tests": {
+ "game_loads": true,
+ "game_starts": false,
+ "auto_drop": null,
+ "move_left": null,
+ "move_right": null,
+ "move_down": null,
+ "rotate": null,
+ "hard_drop": null,
+ "all_pieces_rotate": null,
+ "piece_locks": null,
+ "new_piece_spawns": null,
+ "multiple_pieces": null,
+ "line_clear": null,
+ "score_changes": null,
+ "game_over": null,
+ "playable_30s": null,
+ "multi_line_clear": null,
+ "score_scaling": null,
+ "level_progression": null,
+ "speed_progression": null,
+ "next_piece_preview": null,
+ "game_over_display": null,
+ "counter_clockwise_rotation": null,
+ "soft_drop_distinct": null
+ }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json b/tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json
@@ -0,0 +1,33 @@
+{
+ "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=es_lang=ts_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run1",
+ "short_id": "e2e04e75",
+ "label": "Spanish basic play",
+ "notes": "Spanish game. Basic play works fine.",
+ "human_tested_at": "2026-04-09",
+ "human_tests": {
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": true,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": null,
+ "all_pieces_rotate": null,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": null,
+ "score_changes": null,
+ "game_over": null,
+ "playable_30s": true,
+ "multi_line_clear": null,
+ "score_scaling": null,
+ "level_progression": null,
+ "speed_progression": null,
+ "next_piece_preview": null,
+ "game_over_display": null,
+ "counter_clockwise_rotation": null,
+ "soft_drop_distinct": null
+ }
+}