loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit d748de6f4a388178c427cd55d11db0149a9d0d5b
parent dcef6a4928511792f670d74ad63b8e1b9a7bde45
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu,  9 Apr 2026 07:41:41 +0200

Add bot calibration page with human vs bot comparison

Hidden /calibrate page showing hand-picked games with human test results
side-by-side with bot results. Data is JSON-powered (one file per game in
calibration/), references canonical bot results from eval_results.json.

Initial 5 entries from manual testing of DOM-rendered games:
- 2 games match well (80-85% bot vs human "playable")
- 1 false negative (bot 18%, human says playable -- likely GPU issue)
- 1 overlay bug correctly identified by human, bot confused
- 1 genuinely broken game (both agree: won't start)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/pages/calibrate.astro | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json | 33 +++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json | 33 +++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json | 33 +++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json | 33 +++++++++++++++++++++++++++++++++
A tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json | 33 +++++++++++++++++++++++++++++++++
6 files changed, 332 insertions(+), 0 deletions(-)

diff --git a/dashboard/src/pages/calibrate.astro b/dashboard/src/pages/calibrate.astro
@@ -0,0 +1,167 @@
+---
+import Base from "../layouts/Base.astro";
+import fs from "node:fs";
+import path from "node:path";
+import { loadAllRuns } from "../lib/data";
+
+// Calibration page: shows hand-recorded human test results next to the gameplay bot's results.
+// Calibration data: one JSON file per game, in a directory resolved relative to process.cwd().
+const calibrationDir = path.resolve(process.cwd(), "../tasks/tetris/eval/gameplay-bot/calibration");
+/** One calibration JSON file. human_tests maps test name -> true ("yes"), false ("no") or null (unanswered). */
+interface CalibrationEntry {
+  run_id: string;
+  short_id: string;
+  label: string;
+  notes: string;
+  human_tested_at: string;
+  human_tests: Record<string, boolean | null>;
+}
+
+const entries: CalibrationEntry[] = [];
+if (fs.existsSync(calibrationDir)) {
+  // Sort so the card order does not depend on the filesystem's directory order.
+  for (const file of fs.readdirSync(calibrationDir).sort()) {
+    if (!file.endsWith(".json")) continue;
+    try {
+      const data = JSON.parse(fs.readFileSync(path.join(calibrationDir, file), "utf-8"));
+      entries.push(data);
+    } catch (err) {
+      // Best-effort: an unreadable or malformed file is skipped, but reported instead of hidden.
+      console.warn(`calibrate: skipping ${file}:`, err);
+    }
+  }
+}
+
+// Load bot results for these runs, indexed by run_id for lookup per calibration entry
+const allRuns = loadAllRuns();
+const runsByRunId = new Map(allRuns.map(r => [r.meta.run_id, r]));
+
+// Build comparison data: pair each calibration entry with its run and the bot's tests keyed by name
+const comparisons = entries.map(entry => {
+  const run = runsByRunId.get(entry.run_id);
+  const botTests = run?.eval_results?.gameplay_bot?.report?.tests as Array<{name: string; pass: boolean; detail: string}> | undefined;
+  const botByName = new Map(botTests?.map(t => [t.name, t]) ?? []);
+  return { entry, run, botByName };
+});
+
+// Test names rendered as table rows, in display order; also the set used for the agreement count.
+const allTestNames = [
+  "game_loads", "game_starts", "auto_drop",
+  "move_left", "move_right", "move_down", "rotate", "hard_drop", "all_pieces_rotate",
+  "piece_locks", "new_piece_spawns", "multiple_pieces",
+  "line_clear", "score_changes",
+  "game_over", "playable_30s",
+  "multi_line_clear", "score_scaling", "level_progression", "speed_progression",
+  "next_piece_preview", "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct",
+];
+---
+
+<Base title="Bot Calibration">
+  <h1 style="margin-bottom: 8px;">Bot Calibration</h1>
+  <p style="color: var(--text-muted); margin-bottom: 32px; font-size: 0.875rem;">
+    Hand-picked games with human test results compared to bot results. Used to identify false positives and false negatives.
+  </p>
+
+  {comparisons.map(({ entry, run, botByName }) => {
+    // Summary numbers for the card header.
+    const botScore = run?.eval_results?.gameplay_bot?.score;
+    const humanPass = Object.values(entry.human_tests).filter(v => v === true).length;
+    const humanFail = Object.values(entry.human_tests).filter(v => v === false).length;
+    const artifactUrl = `/artifacts/${entry.run_id}/index.html`;
+
+    // Count agreements/disagreements over tests that both the human and the bot answered.
+    // `== null` also covers test names absent from the JSON (undefined), not only explicit null.
+    let agree = 0, disagree = 0;
+    for (const name of allTestNames) {
+      const human = entry.human_tests[name];
+      const bot = botByName.get(name);
+      if (human == null || !bot || bot.detail?.startsWith("skipped:")) continue;
+      if (human === bot.pass) agree++;
+      else disagree++;
+    }
+
+    return (
+      <div class="card" style="padding: 20px; margin-bottom: 20px;">
+        <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 12px;">
+          <div>
+            <h3 style="margin: 0 0 4px 0; font-size: 1rem;">{entry.label}</h3>
+            <div style="font-size: 0.7rem; color: var(--text-muted);">
+              <a href={artifactUrl} target="_blank" rel="noopener noreferrer" style="color: var(--accent);">Play game</a>
+              {" | "}
+              <a href={`/r/${entry.short_id}`} style="color: var(--accent);">Run detail</a>
+              {" | "}
+              Tested {entry.human_tested_at}
+            </div>
+          </div>
+          <div style="display: flex; gap: 16px; font-size: 0.75rem; font-family: var(--font-mono);">
+            <div style="text-align: center;">
+              <div style="font-weight: 700; font-size: 1.1rem;">{humanPass}/{humanPass + humanFail}</div>
+              <div style="color: var(--text-muted); font-size: 0.6rem;">HUMAN</div>
+            </div>
+            <div style="text-align: center;">
+              <div style="font-weight: 700; font-size: 1.1rem;">{botScore != null ? `${Math.round(botScore * 100)}%` : "-"}</div>
+              <div style="color: var(--text-muted); font-size: 0.6rem;">BOT</div>
+            </div>
+            <div style="text-align: center;">
+              <div style={`font-weight: 700; font-size: 1.1rem; color: ${disagree > 0 ? "var(--red)" : "var(--green)"};`}>{agree}/{agree + disagree}</div>
+              <div style="color: var(--text-muted); font-size: 0.6rem;">AGREE</div>
+            </div>
+          </div>
+        </div>
+
+        {entry.notes && (
+          <div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 12px; padding: 8px; background: hsl(var(--bg-secondary)); border-radius: 4px;">
+            {entry.notes}
+          </div>
+        )}
+
+        <table style="width: 100%; font-size: 0.7rem; border-collapse: collapse;">
+          <thead>
+            <tr style="border-bottom: 1px solid var(--border);">
+              <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Test</th>
+              <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Human</th>
+              <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Bot</th>
+              <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 40px;"></th>
+              <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Bot Detail</th>
+            </tr>
+          </thead>
+          <tbody>
+            {allTestNames.map(name => {
+              const human = entry.human_tests[name];
+              const bot = botByName.get(name);
+              const botSkip = bot?.detail?.startsWith("skipped:");
+              const humanStr = human === true ? "yes" : human === false ? "no" : "-";
+              const humanColor = human === true ? "var(--green)" : human === false ? "var(--red)" : "var(--text-muted)";
+              const botStr = bot ? (botSkip ? "skip" : bot.pass ? "yes" : "no") : "-";
+              const botColor = bot ? (botSkip ? "var(--text-muted)" : bot.pass ? "var(--green)" : "var(--red)") : "var(--text-muted)";
+
+              // Agreement indicator: shown only when both sides answered (a missing human answer counts as unanswered)
+              let matchIcon = "";
+              let matchColor = "var(--text-muted)";
+              if (human != null && bot && !botSkip) {
+                if (human === bot.pass) { matchIcon = "="; matchColor = "var(--green)"; }
+                else { matchIcon = "!"; matchColor = "var(--red)"; }
+              }
+
+              return (
+                <tr style="border-bottom: 1px solid hsl(var(--border) / 0.3);">
+                  <td style="padding: 3px 8px; font-family: var(--font-mono);">{name}</td>
+                  <td style={`text-align: center; padding: 3px 8px; color: ${humanColor}; font-weight: 600;`}>{humanStr}</td>
+                  <td style={`text-align: center; padding: 3px 8px; color: ${botColor}; font-weight: 600;`}>{botStr}</td>
+                  <td style={`text-align: center; padding: 3px 8px; color: ${matchColor}; font-weight: 700;`}>{matchIcon}</td>
+                  <td style="padding: 3px 8px; color: var(--text-muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; max-width: 300px;">{bot?.detail || ""}</td>
+                </tr>
+              );
+            })}
+          </tbody>
+        </table>
+      </div>
+    );
+  })}
+
+  {entries.length === 0 && (
+    <div class="card" style="padding: 32px; text-align: center; color: var(--text-muted);">
+      No calibration entries. Add JSON files to tasks/tetris/eval/gameplay-bot/calibration/
+    </div>
+  )}
+</Base>
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json b/tasks/tetris/eval/gameplay-bot/calibration/4c7db3b9.json
@@ -0,0 +1,33 @@
+{
+  "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=es_lang=ts_lint=on_budget=low_model=haiku45_pw=avail_prompt=detailed_rndr=none_strat=none_tst=none_tedit=off_tglob=off_tgrep=on_tread=off_twrite=on_web=off_run2",
+  "short_id": "4c7db3b9",
+  "label": "Spanish looks ok",
+  "notes": "Spanish game. Looks ok, playable.",
+  "human_tested_at": "2026-04-09",
+  "human_tests": {
+    "game_loads": true,
+    "game_starts": true,
+    "auto_drop": true,
+    "move_left": true,
+    "move_right": true,
+    "move_down": true,
+    "rotate": true,
+    "hard_drop": null,
+    "all_pieces_rotate": null,
+    "piece_locks": true,
+    "new_piece_spawns": true,
+    "multiple_pieces": true,
+    "line_clear": null,
+    "score_changes": null,
+    "game_over": null,
+    "playable_30s": true,
+    "multi_line_clear": null,
+    "score_scaling": null,
+    "level_progression": null,
+    "speed_progression": null,
+    "next_piece_preview": null,
+    "game_over_display": null,
+    "counter_clockwise_rotation": null,
+    "soft_drop_distinct": null
+  }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json b/tasks/tetris/eval/gameplay-bot/calibration/7a348b81.json
@@ -0,0 +1,33 @@
+{
+  "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=en_lang=uns_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run1",
+  "short_id": "7a348b81",
+  "label": "Start button broken",
+  "notes": "Clicking start game never does anything. Game is genuinely broken -- not a bot issue.",
+  "human_tested_at": "2026-04-09",
+  "human_tests": {
+    "game_loads": true,
+    "game_starts": false,
+    "auto_drop": null,
+    "move_left": null,
+    "move_right": null,
+    "move_down": null,
+    "rotate": null,
+    "hard_drop": null,
+    "all_pieces_rotate": null,
+    "piece_locks": null,
+    "new_piece_spawns": null,
+    "multiple_pieces": null,
+    "line_clear": null,
+    "score_changes": null,
+    "game_over": null,
+    "playable_30s": null,
+    "multi_line_clear": null,
+    "score_scaling": null,
+    "level_progression": null,
+    "speed_progression": null,
+    "next_piece_preview": null,
+    "game_over_display": null,
+    "counter_clockwise_rotation": null,
+    "soft_drop_distinct": null
+  }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json b/tasks/tetris/eval/gameplay-bot/calibration/8fe72fce.json
@@ -0,0 +1,33 @@
+{
+  "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=en_lang=uns_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run2",
+  "short_id": "8fe72fce",
+  "label": "English playable",
+  "notes": "Playable English game.",
+  "human_tested_at": "2026-04-09",
+  "human_tests": {
+    "game_loads": true,
+    "game_starts": true,
+    "auto_drop": true,
+    "move_left": true,
+    "move_right": true,
+    "move_down": true,
+    "rotate": true,
+    "hard_drop": null,
+    "all_pieces_rotate": null,
+    "piece_locks": true,
+    "new_piece_spawns": true,
+    "multiple_pieces": true,
+    "line_clear": null,
+    "score_changes": null,
+    "game_over": null,
+    "playable_30s": true,
+    "multi_line_clear": null,
+    "score_scaling": null,
+    "level_progression": null,
+    "speed_progression": null,
+    "next_piece_preview": null,
+    "game_over_display": null,
+    "counter_clockwise_rotation": null,
+    "soft_drop_distinct": null
+  }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json b/tasks/tetris/eval/gameplay-bot/calibration/93e8feea.json
@@ -0,0 +1,33 @@
+{
+  "run_id": "tetris_arch=none_ctx=provided_noise=clean_dsgn=none_eff=high_echk=none_hlang=es_lang=ts_lint=off_budget=low_model=haiku45_pw=avail_prompt=detailed_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=off_tread=off_twrite=off_web=off_run3",
+  "short_id": "93e8feea",
+  "label": "Spanish overlay bug",
+  "notes": "Spanish game with full-screen overlay. Has a start button but clicking never dismisses the overlay. Game is visible behind the transparency and the next block changes on click, but game is unplayable.",
+  "human_tested_at": "2026-04-09",
+  "human_tests": {
+    "game_loads": true,
+    "game_starts": false,
+    "auto_drop": null,
+    "move_left": null,
+    "move_right": null,
+    "move_down": null,
+    "rotate": null,
+    "hard_drop": null,
+    "all_pieces_rotate": null,
+    "piece_locks": null,
+    "new_piece_spawns": null,
+    "multiple_pieces": null,
+    "line_clear": null,
+    "score_changes": null,
+    "game_over": null,
+    "playable_30s": null,
+    "multi_line_clear": null,
+    "score_scaling": null,
+    "level_progression": null,
+    "speed_progression": null,
+    "next_piece_preview": null,
+    "game_over_display": null,
+    "counter_clockwise_rotation": null,
+    "soft_drop_distinct": null
+  }
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json b/tasks/tetris/eval/gameplay-bot/calibration/e2e04e75.json
@@ -0,0 +1,33 @@
+{
+  "run_id": "tetris_arch=none_ctx=none_noise=clean_dsgn=none_eff=high_echk=none_hlang=es_lang=ts_lint=on_budget=low_model=haiku45_pw=avail_prompt=simple_rndr=none_strat=usub_tst=none_tedit=on_tglob=on_tgrep=on_tread=on_twrite=on_web=on_run1",
+  "short_id": "e2e04e75",
+  "label": "Spanish basic play",
+  "notes": "Spanish game. Basic play works fine.",
+  "human_tested_at": "2026-04-09",
+  "human_tests": {
+    "game_loads": true,
+    "game_starts": true,
+    "auto_drop": true,
+    "move_left": true,
+    "move_right": true,
+    "move_down": true,
+    "rotate": true,
+    "hard_drop": null,
+    "all_pieces_rotate": null,
+    "piece_locks": true,
+    "new_piece_spawns": true,
+    "multiple_pieces": true,
+    "line_clear": null,
+    "score_changes": null,
+    "game_over": null,
+    "playable_30s": true,
+    "multi_line_clear": null,
+    "score_scaling": null,
+    "level_progression": null,
+    "speed_progression": null,
+    "next_piece_preview": null,
+    "game_over_display": null,
+    "counter_clockwise_rotation": null,
+    "soft_drop_distinct": null
+  }
+}

Impressum · Datenschutz