commit cce938f0ee50da98113502ccbc5e5d066efc5137
parent d748de6f4a388178c427cd55d11db0149a9d0d5b
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Thu, 9 Apr 2026 07:52:24 +0200
Interactive calibration UI with human testing mode
Calibrate page now uses a React island with:
- "Human Testing" toggle button reveals clickable tri-state controls
(yes/no/unanswered) for each test per game
- Short code + game link in title for easy click-and-play
- Editable notes field
- Copyable JSON export per card for pasting results back
- Aggregate agree/disagree stats at top
- Bot results referenced from eval_results.json (not duplicated)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 286 insertions(+), 126 deletions(-)
diff --git a/dashboard/src/components/Calibrate.tsx b/dashboard/src/components/Calibrate.tsx
@@ -0,0 +1,276 @@
+import { useState } from "react";
+
+interface BotTest { // One automated gameplay-bot test result for a run.
+ name: string; // Test identifier; looked up by the names listed in ALL_TEST_NAMES.
+ pass: boolean; // Bot verdict; compared with the human answer to decide agree/disagree.
+ detail: string; // Free-text detail shown in the "Bot Detail" column; a "skipped:" prefix marks the test as skipped.
+}
+
+interface CalibrationEntry { // Human calibration record for one game run.
+ run_id: string; // Run identifier; copied unchanged into the exported JSON.
+ short_id: string; // Short code shown in the card title; also used for the /r/<short_id> link and as the React key.
+ label: string; // Display label rendered next to the short code.
+ notes: string; // Free-text notes; editable in the card.
+ human_tested_at: string; // Test date; the export writes today's date as YYYY-MM-DD - presumably the stored format too (confirm).
+ human_tests: Record<string, boolean | null>; // Test name -> true (yes) / false (no) / null (unanswered).
+}
+
+interface ComparisonData { // Everything one calibration card needs: the human entry plus the bot results for the same run.
+ entry: CalibrationEntry; // Human-entered calibration record.
+ botScore: number | null; // Rendered as Math.round(botScore * 100)% - presumably a 0..1 fraction (confirm); null renders "-".
+ botTests: BotTest[]; // Per-test bot results; the page passes [] when the run has no bot report.
+ artifactUrl: string; // Link target of the short-code and "Play game" anchors.
+}
+
+const ALL_TEST_NAMES = [ // Fixed, ordered list of test names: drives the table rows and the agree/disagree counts.
+ "game_loads", "game_starts", "auto_drop",
+ "move_left", "move_right", "move_down", "rotate", "hard_drop", "all_pieces_rotate",
+ "piece_locks", "new_piece_spawns", "multiple_pieces",
+ "line_clear", "score_changes",
+ "game_over", "playable_30s",
+ "multi_line_clear", "score_scaling", "level_progression", "speed_progression",
+ "next_piece_preview", "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct",
+];
+
+/**
+ * Three-state answer button for one human test result.
+ *
+ * Each click advances the cycle unanswered (null) -> yes (true) -> no (false) -> unanswered
+ * and reports the next value through `onChange`. The component keeps no state of its own.
+ *
+ * @param value - Current answer: true renders "yes", false renders "no", null renders "-".
+ * @param onChange - Called with the next value in the cycle when the button is clicked.
+ */
+function TriState({ value, onChange }: { value: boolean | null; onChange: (v: boolean | null) => void }) {
+  const next = value === null ? true : value === true ? false : null;
+  const label = value === true ? "yes" : value === false ? "no" : "-";
+  const color = value === true ? "var(--green)" : value === false ? "var(--red)" : "var(--text-muted)";
+  // type="button" is explicit because a bare <button> defaults to "submit" and would
+  // submit any <form> this control is ever placed in.
+  return (
+    <button
+      type="button"
+      onClick={() => onChange(next)}
+      style={{
+        background: "none",
+        border: "1px solid hsl(var(--border) / 0.5)",
+        color,
+        fontWeight: 700,
+        fontSize: "0.7rem",
+        padding: "2px 10px",
+        cursor: "pointer",
+        fontFamily: "var(--font-mono)",
+        minWidth: "40px",
+      }}
+    >
+      {label}
+    </button>
+  );
+}
+
+/**
+ * Counts how often the human answers match the bot verdicts for one game.
+ *
+ * A test is only compared when the human gave an answer (true/false) and the bot
+ * actually ran it (a result exists and its detail does not start with "skipped:").
+ *
+ * @param humanTests - Human answers keyed by test name (null or missing = unanswered).
+ * @param botTests - Bot results for the same run.
+ * @returns How many comparable tests agree and how many disagree.
+ */
+function countAgreement(humanTests: Record<string, boolean | null>, botTests: BotTest[]): { agree: number; disagree: number } {
+  const botByName = new Map(botTests.map(t => [t.name, t]));
+  let agree = 0;
+  let disagree = 0;
+  for (const name of ALL_TEST_NAMES) {
+    const human = humanTests[name];
+    const bot = botByName.get(name);
+    if (human === null || human === undefined || !bot) continue;
+    if (bot.detail?.startsWith("skipped:")) continue;
+    if (human === bot.pass) agree++;
+    else disagree++;
+  }
+  return { agree, disagree };
+}
+
+/**
+ * One calibration card: header with human/bot/agreement summary, notes, the
+ * per-test comparison table and (in edit mode) a copyable JSON export.
+ *
+ * The card owns the working copy of the human answers and notes; every change is
+ * also reported to the parent through `onUpdate`.
+ *
+ * @param data - Entry, bot score, bot tests and artifact URL for this game.
+ * @param onUpdate - Called with the full answer map and notes after each edit.
+ * @param editing - When true the card is in edit mode (tri-state buttons, notes
+ *   textarea, JSON export). Optional; defaults to false. Clicking the notes text
+ *   additionally opens edit mode for this card alone, and that stays open.
+ */
+function CalibrationCard({ data, onUpdate, editing = false }: { data: ComparisonData; onUpdate: (tests: Record<string, boolean | null>, notes: string) => void; editing?: boolean }) {
+  const { entry, botScore, botTests, artifactUrl } = data;
+  const [humanTests, setHumanTests] = useState<Record<string, boolean | null>>({ ...entry.human_tests });
+  const [notes, setNotes] = useState(entry.notes);
+  const [openedLocally, setOpenedLocally] = useState(false);
+  // Edit mode is on when the page-level toggle is on OR this card was opened by clicking its notes.
+  const showEditor = editing || openedLocally;
+
+  const botByName = new Map(botTests.map(t => [t.name, t]));
+
+  const humanPass = Object.values(humanTests).filter(v => v === true).length;
+  const humanFail = Object.values(humanTests).filter(v => v === false).length;
+  const { agree, disagree } = countAgreement(humanTests, botTests);
+
+  function handleTestChange(name: string, value: boolean | null) {
+    const updated = { ...humanTests, [name]: value };
+    setHumanTests(updated);
+    onUpdate(updated, notes);
+  }
+
+  function handleNotesChange(value: string) {
+    setNotes(value);
+    onUpdate(humanTests, value);
+  }
+
+  // Export JSON: same shape as a calibration entry, stamped with today's date (UTC, YYYY-MM-DD).
+  const exportJson = JSON.stringify(
+    {
+      run_id: entry.run_id,
+      short_id: entry.short_id,
+      label: entry.label,
+      notes,
+      human_tested_at: new Date().toISOString().slice(0, 10),
+      human_tests: humanTests,
+    },
+    null,
+    2,
+  );
+
+  /** Copies the export JSON and flashes the element green on success, red on failure. */
+  function copyExport(target: HTMLElement) {
+    const flash = (color: string) => {
+      target.style.outline = `2px solid ${color}`;
+      setTimeout(() => { target.style.outline = ""; }, 500);
+    };
+    // navigator.clipboard is only available in secure contexts; without the guard the click would throw.
+    if (!navigator.clipboard) {
+      flash("var(--red)");
+      return;
+    }
+    navigator.clipboard.writeText(exportJson).then(
+      () => flash("var(--green)"),
+      () => flash("var(--red)"),
+    );
+  }
+
+  return (
+    <div className="card" style={{ padding: "20px", marginBottom: "20px" }}>
+      <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "12px" }}>
+        <div>
+          <h3 style={{ margin: "0 0 4px 0", fontSize: "1rem" }}>
+            <a href={artifactUrl} target="_blank" rel="noopener noreferrer" style={{ color: "var(--accent)", textDecoration: "none" }}>
+              {entry.short_id}
+            </a>
+            <span style={{ color: "var(--text)", marginLeft: "8px" }}>{entry.label}</span>
+          </h3>
+          <div style={{ fontSize: "0.7rem", color: "var(--text-muted)" }}>
+            <a href={artifactUrl} target="_blank" rel="noopener noreferrer" style={{ color: "var(--accent)" }}>Play game</a>
+            {" | "}
+            <a href={`/r/${entry.short_id}`} style={{ color: "var(--accent)" }}>Run detail</a>
+          </div>
+        </div>
+        <div style={{ display: "flex", gap: "16px", fontSize: "0.75rem", fontFamily: "var(--font-mono)" }}>
+          <div style={{ textAlign: "center" }}>
+            <div style={{ fontWeight: 700, fontSize: "1.1rem" }}>{humanPass}/{humanPass + humanFail}</div>
+            <div style={{ color: "var(--text-muted)", fontSize: "0.6rem" }}>HUMAN</div>
+          </div>
+          <div style={{ textAlign: "center" }}>
+            <div style={{ fontWeight: 700, fontSize: "1.1rem" }}>{botScore != null ? `${Math.round(botScore * 100)}%` : "-"}</div>
+            <div style={{ color: "var(--text-muted)", fontSize: "0.6rem" }}>BOT</div>
+          </div>
+          <div style={{ textAlign: "center" }}>
+            <div style={{ fontWeight: 700, fontSize: "1.1rem", color: disagree > 0 ? "var(--red)" : "var(--green)" }}>{agree}/{agree + disagree}</div>
+            <div style={{ color: "var(--text-muted)", fontSize: "0.6rem" }}>AGREE</div>
+          </div>
+        </div>
+      </div>
+
+      <div style={{ fontSize: "0.75rem", color: "var(--text-muted)", marginBottom: "12px", padding: "8px", background: "hsl(var(--bg-secondary))" }}>
+        {showEditor ? (
+          <textarea
+            value={notes}
+            onChange={e => handleNotesChange(e.target.value)}
+            style={{ width: "100%", minHeight: "40px", background: "transparent", border: "1px solid var(--border)", color: "var(--text)", fontSize: "0.75rem", padding: "4px", fontFamily: "inherit", resize: "vertical" }}
+          />
+        ) : (
+          <span onClick={() => setOpenedLocally(true)} style={{ cursor: "pointer" }}>{notes || "(click to add notes)"}</span>
+        )}
+      </div>
+
+      <table style={{ width: "100%", fontSize: "0.7rem", borderCollapse: "collapse" }}>
+        <thead>
+          <tr style={{ borderBottom: "1px solid var(--border)" }}>
+            <th style={{ textAlign: "left", padding: "4px 8px", fontWeight: 600 }}>Test</th>
+            <th style={{ textAlign: "center", padding: "4px 8px", fontWeight: 600, width: "80px" }}>Human</th>
+            <th style={{ textAlign: "center", padding: "4px 8px", fontWeight: 600, width: "80px" }}>Bot</th>
+            <th style={{ textAlign: "center", padding: "4px 8px", fontWeight: 600, width: "30px" }}></th>
+            <th style={{ textAlign: "left", padding: "4px 8px", fontWeight: 600 }}>Bot Detail</th>
+          </tr>
+        </thead>
+        <tbody>
+          {ALL_TEST_NAMES.map(name => {
+            const human = humanTests[name];
+            const bot = botByName.get(name);
+            const botSkip = bot?.detail?.startsWith("skipped:");
+            const botStr = bot ? (botSkip ? "skip" : bot.pass ? "yes" : "no") : "-";
+            const botColor = bot ? (botSkip ? "var(--text-muted)" : bot.pass ? "var(--green)" : "var(--red)") : "var(--text-muted)";
+
+            // "=" when human and bot agree, "!" when they differ, blank when not comparable.
+            let matchIcon = "";
+            let matchColor = "var(--text-muted)";
+            if (human !== null && human !== undefined && bot && !botSkip) {
+              if (human === bot.pass) { matchIcon = "="; matchColor = "var(--green)"; }
+              else { matchIcon = "!"; matchColor = "var(--red)"; }
+            }
+
+            return (
+              <tr key={name} style={{ borderBottom: "1px solid hsl(var(--border) / 0.3)" }}>
+                <td style={{ padding: "3px 8px", fontFamily: "var(--font-mono)" }}>{name}</td>
+                <td style={{ textAlign: "center", padding: "3px 8px" }}>
+                  {showEditor ? (
+                    <TriState value={human ?? null} onChange={v => handleTestChange(name, v)} />
+                  ) : (
+                    <span style={{ color: human === true ? "var(--green)" : human === false ? "var(--red)" : "var(--text-muted)", fontWeight: 600 }}>
+                      {human === true ? "yes" : human === false ? "no" : "-"}
+                    </span>
+                  )}
+                </td>
+                <td style={{ textAlign: "center", padding: "3px 8px", color: botColor, fontWeight: 600 }}>{botStr}</td>
+                <td style={{ textAlign: "center", padding: "3px 8px", color: matchColor, fontWeight: 700 }}>{matchIcon}</td>
+                <td style={{ padding: "3px 8px", color: "var(--text-muted)", overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap", maxWidth: "300px" }}>{bot?.detail || ""}</td>
+              </tr>
+            );
+          })}
+        </tbody>
+      </table>
+
+      {showEditor && (
+        <div style={{ marginTop: "12px", padding: "8px", background: "hsl(var(--bg-secondary))", fontSize: "0.65rem" }}>
+          <div style={{ fontWeight: 600, marginBottom: "4px", color: "var(--text-muted)" }}>Copy this JSON to update the calibration file:</div>
+          <pre
+            style={{ margin: 0, padding: "8px", background: "hsl(var(--bg) / 0.5)", border: "1px solid var(--border)", overflow: "auto", maxHeight: "200px", cursor: "pointer", fontSize: "0.6rem" }}
+            onClick={e => copyExport(e.currentTarget)}
+            title="Click to copy"
+          >
+            {exportJson}
+          </pre>
+        </div>
+      )}
+    </div>
+  );
+}
+
+/**
+ * Calibration page body: aggregate agree/disagree totals, the "Human Testing"
+ * toggle, and one CalibrationCard per comparison.
+ *
+ * The toggle switches every card into edit mode. In-session edits reported by the
+ * cards are kept in `updates` (keyed by short_id) and feed the aggregate totals,
+ * so the header stays in step with the per-card AGREE counts.
+ *
+ * @param comparisons - One item per calibrated game, as built by the page.
+ */
+export default function Calibrate({ comparisons }: { comparisons: ComparisonData[] }) {
+  const [showEditor, setShowEditor] = useState(false);
+  const [updates, setUpdates] = useState<Map<string, { tests: Record<string, boolean | null>; notes: string }>>(new Map());
+
+  function handleUpdate(shortId: string, tests: Record<string, boolean | null>, notes: string) {
+    // Functional update: always copy the latest map, never a stale render's snapshot.
+    setUpdates(prev => new Map(prev).set(shortId, { tests, notes }));
+  }
+
+  // Aggregate stats, using in-session edits where present.
+  const totalEntries = comparisons.length;
+  let totalAgree = 0;
+  let totalDisagree = 0;
+  for (const { entry, botTests } of comparisons) {
+    const humanTests = updates.get(entry.short_id)?.tests ?? entry.human_tests;
+    const { agree, disagree } = countAgreement(humanTests, botTests);
+    totalAgree += agree;
+    totalDisagree += disagree;
+  }
+
+  return (
+    <div>
+      <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "24px" }}>
+        <div style={{ display: "flex", gap: "24px", fontFamily: "var(--font-mono)", fontSize: "0.8rem" }}>
+          <span>{totalEntries} games</span>
+          <span style={{ color: "var(--green)" }}>{totalAgree} agree</span>
+          <span style={{ color: totalDisagree > 0 ? "var(--red)" : "var(--text-muted)" }}>{totalDisagree} disagree</span>
+        </div>
+        <button
+          type="button"
+          onClick={() => setShowEditor(prev => !prev)}
+          style={{
+            padding: "6px 16px",
+            fontSize: "0.75rem",
+            background: showEditor ? "var(--accent)" : "transparent",
+            color: showEditor ? "#fff" : "var(--text-muted)",
+            border: `1px solid ${showEditor ? "var(--accent)" : "var(--border)"}`,
+            cursor: "pointer",
+            fontFamily: "var(--font-mono)",
+          }}
+        >
+          {showEditor ? "Done Testing" : "Human Testing"}
+        </button>
+      </div>
+
+      {showEditor && (
+        <div className="card" style={{ padding: "16px", marginBottom: "20px", fontSize: "0.75rem" }}>
+          <div style={{ fontWeight: 600, marginBottom: "8px" }}>Paste results here</div>
+          <p style={{ color: "var(--text-muted)", fontSize: "0.7rem", margin: "0 0 8px 0" }}>
+            After clicking test states below, copy the JSON from each card and paste it to me in the chat with the short code. Format: <code style={{ background: "hsl(var(--bg-secondary))", padding: "1px 4px" }}>SHORT_ID: paste json</code>
+          </p>
+        </div>
+      )}
+
+      {comparisons.map(data => {
+        const saved = updates.get(data.entry.short_id);
+        const entry = { ...data.entry, notes: saved?.notes ?? data.entry.notes, human_tests: saved?.tests ?? data.entry.human_tests };
+        return (
+          <CalibrationCard
+            key={data.entry.short_id}
+            data={{ ...data, entry }}
+            editing={showEditor}
+            onUpdate={(tests, notes) => handleUpdate(data.entry.short_id, tests, notes)}
+          />
+        );
+      })}
+    </div>
+  );
+}
diff --git a/dashboard/src/pages/calibrate.astro b/dashboard/src/pages/calibrate.astro
@@ -1,5 +1,6 @@
---
import Base from "../layouts/Base.astro";
+import Calibrate from "../components/Calibrate";
import fs from "node:fs";
import path from "node:path";
import { loadAllRuns } from "../lib/data";
@@ -17,7 +18,7 @@ interface CalibrationEntry {
const entries: CalibrationEntry[] = [];
if (fs.existsSync(calibrationDir)) {
- for (const file of fs.readdirSync(calibrationDir)) {
+ for (const file of fs.readdirSync(calibrationDir).sort()) {
if (!file.endsWith(".json")) continue;
try {
const data = JSON.parse(fs.readFileSync(path.join(calibrationDir, file), "utf-8"));
@@ -30,138 +31,21 @@ if (fs.existsSync(calibrationDir)) {
const allRuns = loadAllRuns();
const runsByRunId = new Map(allRuns.map(r => [r.meta.run_id, r]));
-// Build comparison data
+// Build comparison data for the React component
const comparisons = entries.map(entry => {
const run = runsByRunId.get(entry.run_id);
- const botTests = run?.eval_results?.gameplay_bot?.report?.tests as Array<{name: string; pass: boolean; detail: string}> | undefined;
- const botByName = new Map(botTests?.map(t => [t.name, t]) || []);
- return { entry, run, botByName };
+ const botScore = (run?.eval_results as any)?.gameplay_bot?.score ?? null; // null when the run or its bot score is missing. NOTE(review): `as any` turns off type checking for this access - confirm whether eval_results can be typed instead.
+ const botTests = ((run?.eval_results as any)?.gameplay_bot?.report?.tests ?? []) as Array<{name: string; pass: boolean; detail: string}>; // Falls back to [] when there is no bot report; the element shape is asserted, not validated.
+ const artifactUrl = `/artifacts/${entry.run_id}/index.html`; // Derived from run_id alone.
+ return { entry, botScore, botTests, artifactUrl }; // Props handed to the <Calibrate> component.
});
-
-const allTestNames = [
- "game_loads", "game_starts", "auto_drop",
- "move_left", "move_right", "move_down", "rotate", "hard_drop", "all_pieces_rotate",
- "piece_locks", "new_piece_spawns", "multiple_pieces",
- "line_clear", "score_changes",
- "game_over", "playable_30s",
- "multi_line_clear", "score_scaling", "level_progression", "speed_progression",
- "next_piece_preview", "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct",
-];
---
<Base title="Bot Calibration">
<h1 style="margin-bottom: 8px;">Bot Calibration</h1>
- <p style="color: var(--text-muted); margin-bottom: 32px; font-size: 0.875rem;">
- Hand-picked games with human test results compared to bot results. Used to identify false positives and false negatives.
+ <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;">
+ Hand-picked games with human test results compared to bot results.
</p>
- {comparisons.map(({ entry, run, botByName }) => {
- const botScore = run?.eval_results?.gameplay_bot?.score;
- const humanPass = Object.values(entry.human_tests).filter(v => v === true).length;
- const humanFail = Object.values(entry.human_tests).filter(v => v === false).length;
- const humanUnanswered = Object.values(entry.human_tests).filter(v => v === null).length;
- const artifactUrl = `/artifacts/${entry.run_id}/index.html`;
-
- // Count agreements/disagreements
- let agree = 0, disagree = 0, botOnly = 0, humanOnly = 0;
- for (const name of allTestNames) {
- const human = entry.human_tests[name];
- const bot = botByName.get(name);
- const botPass = bot?.pass;
- const botSkip = bot?.detail?.startsWith("skipped:");
- if (human === null && (!bot || botSkip)) continue; // both unanswered
- if (human !== null && bot && !botSkip) {
- if (human === botPass) agree++;
- else disagree++;
- } else if (human !== null && (!bot || botSkip)) {
- humanOnly++;
- } else if (human === null && bot && !botSkip) {
- botOnly++;
- }
- }
-
- return (
- <div class="card" style="padding: 20px; margin-bottom: 20px;">
- <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 12px;">
- <div>
- <h3 style="margin: 0 0 4px 0; font-size: 1rem;">{entry.label}</h3>
- <div style="font-size: 0.7rem; color: var(--text-muted);">
- <a href={artifactUrl} target="_blank" style="color: var(--accent);">Play game</a>
- {" | "}
- <a href={`/r/${entry.short_id}`} style="color: var(--accent);">Run detail</a>
- {" | "}
- Tested {entry.human_tested_at}
- </div>
- </div>
- <div style="display: flex; gap: 16px; font-size: 0.75rem; font-family: var(--font-mono);">
- <div style="text-align: center;">
- <div style="font-weight: 700; font-size: 1.1rem;">{humanPass}/{humanPass + humanFail}</div>
- <div style="color: var(--text-muted); font-size: 0.6rem;">HUMAN</div>
- </div>
- <div style="text-align: center;">
- <div style="font-weight: 700; font-size: 1.1rem;">{botScore != null ? `${Math.round(botScore * 100)}%` : "-"}</div>
- <div style="color: var(--text-muted); font-size: 0.6rem;">BOT</div>
- </div>
- <div style="text-align: center;">
- <div style="font-weight: 700; font-size: 1.1rem; color: disagree > 0 ? 'var(--red)' : 'var(--green)'">{agree}/{agree + disagree}</div>
- <div style="color: var(--text-muted); font-size: 0.6rem;">AGREE</div>
- </div>
- </div>
- </div>
-
- {entry.notes && (
- <div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 12px; padding: 8px; background: hsl(var(--bg-secondary)); border-radius: 4px;">
- {entry.notes}
- </div>
- )}
-
- <table style="width: 100%; font-size: 0.7rem; border-collapse: collapse;">
- <thead>
- <tr style="border-bottom: 1px solid var(--border);">
- <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Test</th>
- <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Human</th>
- <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Bot</th>
- <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 40px;"></th>
- <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Bot Detail</th>
- </tr>
- </thead>
- <tbody>
- {allTestNames.map(name => {
- const human = entry.human_tests[name];
- const bot = botByName.get(name);
- const botSkip = bot?.detail?.startsWith("skipped:");
- const humanStr = human === true ? "yes" : human === false ? "no" : "-";
- const humanColor = human === true ? "var(--green)" : human === false ? "var(--red)" : "var(--text-muted)";
- const botStr = bot ? (botSkip ? "skip" : bot.pass ? "yes" : "no") : "-";
- const botColor = bot ? (botSkip ? "var(--text-muted)" : bot.pass ? "var(--green)" : "var(--red)") : "var(--text-muted)";
-
- // Agreement indicator
- let matchIcon = "";
- let matchColor = "var(--text-muted)";
- if (human !== null && bot && !botSkip) {
- if (human === bot.pass) { matchIcon = "="; matchColor = "var(--green)"; }
- else { matchIcon = "!"; matchColor = "var(--red)"; }
- }
-
- return (
- <tr style="border-bottom: 1px solid hsl(var(--border) / 0.3);">
- <td style="padding: 3px 8px; font-family: var(--font-mono);">{name}</td>
- <td style={`text-align: center; padding: 3px 8px; color: ${humanColor}; font-weight: 600;`}>{humanStr}</td>
- <td style={`text-align: center; padding: 3px 8px; color: ${botColor}; font-weight: 600;`}>{botStr}</td>
- <td style={`text-align: center; padding: 3px 8px; color: ${matchColor}; font-weight: 700;`}>{matchIcon}</td>
- <td style="padding: 3px 8px; color: var(--text-muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; max-width: 300px;">{bot?.detail || ""}</td>
- </tr>
- );
- })}
- </tbody>
- </table>
- </div>
- );
- })}
-
- {entries.length === 0 && (
- <div class="card" style="padding: 32px; text-align: center; color: var(--text-muted);">
- No calibration entries. Add JSON files to tasks/tetris/eval/gameplay-bot/calibration/
- </div>
- )}
+ <Calibrate client:load comparisons={comparisons} />
</Base>