loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit cce938f0ee50da98113502ccbc5e5d066efc5137
parent d748de6f4a388178c427cd55d11db0149a9d0d5b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu,  9 Apr 2026 07:52:24 +0200

Interactive calibration UI with human testing mode

Calibrate page now uses a React island with:
- "Human Testing" toggle button reveals clickable tri-state controls
  (yes/no/unanswered) for each test per game
- Short code + game link in title for easy click-and-play
- Editable notes field
- Copyable JSON export per card for pasting results back
- Aggregate agree/disagree stats at top
- Bot results referenced from eval_results.json (not duplicated)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A dashboard/src/components/Calibrate.tsx | 276 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/calibrate.astro | 136 ++++++-------------------------------------------------------------------------
2 files changed, 286 insertions(+), 126 deletions(-)

diff --git a/dashboard/src/components/Calibrate.tsx b/dashboard/src/components/Calibrate.tsx @@ -0,0 +1,276 @@ +import { useState } from "react"; + +interface BotTest { + name: string; + pass: boolean; + detail: string; +} + +interface CalibrationEntry { + run_id: string; + short_id: string; + label: string; + notes: string; + human_tested_at: string; + human_tests: Record<string, boolean | null>; +} + +interface ComparisonData { + entry: CalibrationEntry; + botScore: number | null; + botTests: BotTest[]; + artifactUrl: string; +} + +const ALL_TEST_NAMES = [ + "game_loads", "game_starts", "auto_drop", + "move_left", "move_right", "move_down", "rotate", "hard_drop", "all_pieces_rotate", + "piece_locks", "new_piece_spawns", "multiple_pieces", + "line_clear", "score_changes", + "game_over", "playable_30s", + "multi_line_clear", "score_scaling", "level_progression", "speed_progression", + "next_piece_preview", "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct", +]; + +function TriState({ value, onChange }: { value: boolean | null; onChange: (v: boolean | null) => void }) { + const next = value === null ? true : value === true ? false : null; + const label = value === true ? "yes" : value === false ? "no" : "-"; + const color = value === true ? "var(--green)" : value === false ? 
"var(--red)" : "var(--text-muted)"; + return ( + <button + onClick={() => onChange(next)} + style={{ + background: "none", + border: "1px solid hsl(var(--border) / 0.5)", + color, + fontWeight: 700, + fontSize: "0.7rem", + padding: "2px 10px", + cursor: "pointer", + fontFamily: "var(--font-mono)", + minWidth: "40px", + }} + > + {label} + </button> + ); +} + +function CalibrationCard({ data, onUpdate }: { data: ComparisonData; onUpdate: (tests: Record<string, boolean | null>, notes: string) => void }) { + const { entry, botScore, botTests, artifactUrl } = data; + const [humanTests, setHumanTests] = useState<Record<string, boolean | null>>({ ...entry.human_tests }); + const [notes, setNotes] = useState(entry.notes); + const [showEditor, setShowEditor] = useState(false); + + const botByName = new Map(botTests.map(t => [t.name, t])); + + const humanPass = Object.values(humanTests).filter(v => v === true).length; + const humanFail = Object.values(humanTests).filter(v => v === false).length; + + let agree = 0, disagree = 0; + for (const name of ALL_TEST_NAMES) { + const human = humanTests[name]; + const bot = botByName.get(name); + const botSkip = bot?.detail?.startsWith("skipped:"); + if (human !== null && human !== undefined && bot && !botSkip) { + if (human === bot.pass) agree++; + else disagree++; + } + } + + function handleTestChange(name: string, value: boolean | null) { + const updated = { ...humanTests, [name]: value }; + setHumanTests(updated); + onUpdate(updated, notes); + } + + function handleNotesChange(value: string) { + setNotes(value); + onUpdate(humanTests, value); + } + + // Build export JSON + const exportData = { + run_id: entry.run_id, + short_id: entry.short_id, + label: entry.label, + notes, + human_tested_at: new Date().toISOString().slice(0, 10), + human_tests: humanTests, + }; + + return ( + <div className="card" style={{ padding: "20px", marginBottom: "20px" }}> + <div style={{ display: "flex", justifyContent: "space-between", alignItems: 
"flex-start", marginBottom: "12px" }}> + <div> + <h3 style={{ margin: "0 0 4px 0", fontSize: "1rem" }}> + <a href={artifactUrl} target="_blank" rel="noopener noreferrer" style={{ color: "var(--accent)", textDecoration: "none" }}> + {entry.short_id} + </a> + <span style={{ color: "var(--text)", marginLeft: "8px" }}>{entry.label}</span> + </h3> + <div style={{ fontSize: "0.7rem", color: "var(--text-muted)" }}> + <a href={artifactUrl} target="_blank" style={{ color: "var(--accent)" }}>Play game</a> + {" | "} + <a href={`/r/${entry.short_id}`} style={{ color: "var(--accent)" }}>Run detail</a> + </div> + </div> + <div style={{ display: "flex", gap: "16px", fontSize: "0.75rem", fontFamily: "var(--font-mono)" }}> + <div style={{ textAlign: "center" }}> + <div style={{ fontWeight: 700, fontSize: "1.1rem" }}>{humanPass}/{humanPass + humanFail || 0}</div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem" }}>HUMAN</div> + </div> + <div style={{ textAlign: "center" }}> + <div style={{ fontWeight: 700, fontSize: "1.1rem" }}>{botScore != null ? `${Math.round(botScore * 100)}%` : "-"}</div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem" }}>BOT</div> + </div> + <div style={{ textAlign: "center" }}> + <div style={{ fontWeight: 700, fontSize: "1.1rem", color: disagree > 0 ? "var(--red)" : "var(--green)" }}>{agree}/{agree + disagree || 0}</div> + <div style={{ color: "var(--text-muted)", fontSize: "0.6rem" }}>AGREE</div> + </div> + </div> + </div> + + <div style={{ fontSize: "0.75rem", color: "var(--text-muted)", marginBottom: "12px", padding: "8px", background: "hsl(var(--bg-secondary))" }}> + {showEditor ? 
( + <textarea + value={notes} + onChange={e => handleNotesChange(e.target.value)} + style={{ width: "100%", minHeight: "40px", background: "transparent", border: "1px solid var(--border)", color: "var(--text)", fontSize: "0.75rem", padding: "4px", fontFamily: "inherit", resize: "vertical" }} + /> + ) : ( + <span onClick={() => setShowEditor(true)} style={{ cursor: "pointer" }}>{notes || "(click to add notes)"}</span> + )} + </div> + + <table style={{ width: "100%", fontSize: "0.7rem", borderCollapse: "collapse" }}> + <thead> + <tr style={{ borderBottom: "1px solid var(--border)" }}> + <th style={{ textAlign: "left", padding: "4px 8px", fontWeight: 600 }}>Test</th> + <th style={{ textAlign: "center", padding: "4px 8px", fontWeight: 600, width: "80px" }}>Human</th> + <th style={{ textAlign: "center", padding: "4px 8px", fontWeight: 600, width: "80px" }}>Bot</th> + <th style={{ textAlign: "center", padding: "4px 8px", fontWeight: 600, width: "30px" }}></th> + <th style={{ textAlign: "left", padding: "4px 8px", fontWeight: 600 }}>Bot Detail</th> + </tr> + </thead> + <tbody> + {ALL_TEST_NAMES.map(name => { + const human = humanTests[name]; + const bot = botByName.get(name); + const botSkip = bot?.detail?.startsWith("skipped:"); + const botStr = bot ? (botSkip ? "skip" : bot.pass ? "yes" : "no") : "-"; + const botColor = bot ? (botSkip ? "var(--text-muted)" : bot.pass ? "var(--green)" : "var(--red)") : "var(--text-muted)"; + + let matchIcon = ""; + let matchColor = "var(--text-muted)"; + if (human !== null && human !== undefined && bot && !botSkip) { + if (human === bot.pass) { matchIcon = "="; matchColor = "var(--green)"; } + else { matchIcon = "!"; matchColor = "var(--red)"; } + } + + return ( + <tr key={name} style={{ borderBottom: "1px solid hsl(var(--border) / 0.3)" }}> + <td style={{ padding: "3px 8px", fontFamily: "var(--font-mono)" }}>{name}</td> + <td style={{ textAlign: "center", padding: "3px 8px" }}> + {showEditor ? ( + <TriState value={human ?? 
null} onChange={v => handleTestChange(name, v)} /> + ) : ( + <span style={{ color: human === true ? "var(--green)" : human === false ? "var(--red)" : "var(--text-muted)", fontWeight: 600 }}> + {human === true ? "yes" : human === false ? "no" : "-"} + </span> + )} + </td> + <td style={{ textAlign: "center", padding: "3px 8px", color: botColor, fontWeight: 600 }}>{botStr}</td> + <td style={{ textAlign: "center", padding: "3px 8px", color: matchColor, fontWeight: 700 }}>{matchIcon}</td> + <td style={{ padding: "3px 8px", color: "var(--text-muted)", overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap", maxWidth: "300px" }}>{bot?.detail || ""}</td> + </tr> + ); + })} + </tbody> + </table> + + {showEditor && ( + <div style={{ marginTop: "12px", padding: "8px", background: "hsl(var(--bg-secondary))", fontSize: "0.65rem" }}> + <div style={{ fontWeight: 600, marginBottom: "4px", color: "var(--text-muted)" }}>Copy this JSON to update the calibration file:</div> + <pre + style={{ margin: 0, padding: "8px", background: "hsl(var(--bg) / 0.5)", border: "1px solid var(--border)", overflow: "auto", maxHeight: "200px", cursor: "pointer", fontSize: "0.6rem" }} + onClick={e => { navigator.clipboard.writeText(JSON.stringify(exportData, null, 2)); (e.target as HTMLElement).style.outline = "2px solid var(--green)"; setTimeout(() => { (e.target as HTMLElement).style.outline = ""; }, 500); }} + title="Click to copy" + > + {JSON.stringify(exportData, null, 2)} + </pre> + </div> + )} + </div> + ); +} + +export default function Calibrate({ comparisons }: { comparisons: ComparisonData[] }) { + const [showEditor, setShowEditor] = useState(false); + const [updates, setUpdates] = useState<Map<string, { tests: Record<string, boolean | null>; notes: string }>>(new Map()); + + function handleUpdate(shortId: string, tests: Record<string, boolean | null>, notes: string) { + const next = new Map(updates); + next.set(shortId, { tests, notes }); + setUpdates(next); + } + + // Aggregate 
stats + const totalEntries = comparisons.length; + let totalAgree = 0, totalDisagree = 0; + for (const { entry, botTests } of comparisons) { + const botByName = new Map(botTests.map(t => [t.name, t])); + for (const name of ALL_TEST_NAMES) { + const human = entry.human_tests[name]; + const bot = botByName.get(name); + const botSkip = bot?.detail?.startsWith("skipped:"); + if (human !== null && human !== undefined && bot && !botSkip) { + if (human === bot.pass) totalAgree++; + else totalDisagree++; + } + } + } + + return ( + <div> + <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "24px" }}> + <div style={{ display: "flex", gap: "24px", fontFamily: "var(--font-mono)", fontSize: "0.8rem" }}> + <span>{totalEntries} games</span> + <span style={{ color: "var(--green)" }}>{totalAgree} agree</span> + <span style={{ color: totalDisagree > 0 ? "var(--red)" : "var(--text-muted)" }}>{totalDisagree} disagree</span> + </div> + <button + onClick={() => setShowEditor(!showEditor)} + style={{ + padding: "6px 16px", + fontSize: "0.75rem", + background: showEditor ? "var(--accent)" : "transparent", + color: showEditor ? "#fff" : "var(--text-muted)", + border: `1px solid ${showEditor ? "var(--accent)" : "var(--border)"}`, + cursor: "pointer", + fontFamily: "var(--font-mono)", + }} + > + {showEditor ? "Done Testing" : "Human Testing"} + </button> + </div> + + {showEditor && ( + <div className="card" style={{ padding: "16px", marginBottom: "20px", fontSize: "0.75rem" }}> + <div style={{ fontWeight: 600, marginBottom: "8px" }}>Paste results here</div> + <p style={{ color: "var(--text-muted)", fontSize: "0.7rem", margin: "0 0 8px 0" }}> + After clicking test states below, copy the JSON from each card and paste it to me in the chat with the short code. 
Format: <code style={{ background: "hsl(var(--bg-secondary))", padding: "1px 4px" }}>SHORT_ID: paste json</code> + </p> + </div> + )} + + {comparisons.map(data => ( + <CalibrationCard + key={data.entry.short_id} + data={{ ...data, entry: { ...data.entry, notes: updates.get(data.entry.short_id)?.notes ?? data.entry.notes, human_tests: updates.get(data.entry.short_id)?.tests ?? data.entry.human_tests } }} + onUpdate={(tests, notes) => handleUpdate(data.entry.short_id, tests, notes)} + /> + ))} + </div> + ); +} diff --git a/dashboard/src/pages/calibrate.astro b/dashboard/src/pages/calibrate.astro @@ -1,5 +1,6 @@ --- import Base from "../layouts/Base.astro"; +import Calibrate from "../components/Calibrate"; import fs from "node:fs"; import path from "node:path"; import { loadAllRuns } from "../lib/data"; @@ -17,7 +18,7 @@ interface CalibrationEntry { const entries: CalibrationEntry[] = []; if (fs.existsSync(calibrationDir)) { - for (const file of fs.readdirSync(calibrationDir)) { + for (const file of fs.readdirSync(calibrationDir).sort()) { if (!file.endsWith(".json")) continue; try { const data = JSON.parse(fs.readFileSync(path.join(calibrationDir, file), "utf-8")); @@ -30,138 +31,21 @@ if (fs.existsSync(calibrationDir)) { const allRuns = loadAllRuns(); const runsByRunId = new Map(allRuns.map(r => [r.meta.run_id, r])); -// Build comparison data +// Build comparison data for the React component const comparisons = entries.map(entry => { const run = runsByRunId.get(entry.run_id); - const botTests = run?.eval_results?.gameplay_bot?.report?.tests as Array<{name: string; pass: boolean; detail: string}> | undefined; - const botByName = new Map(botTests?.map(t => [t.name, t]) || []); - return { entry, run, botByName }; + const botScore = (run?.eval_results as any)?.gameplay_bot?.score ?? null; + const botTests = ((run?.eval_results as any)?.gameplay_bot?.report?.tests ?? 
[]) as Array<{name: string; pass: boolean; detail: string}>; + const artifactUrl = `/artifacts/${entry.run_id}/index.html`; + return { entry, botScore, botTests, artifactUrl }; }); - -const allTestNames = [ - "game_loads", "game_starts", "auto_drop", - "move_left", "move_right", "move_down", "rotate", "hard_drop", "all_pieces_rotate", - "piece_locks", "new_piece_spawns", "multiple_pieces", - "line_clear", "score_changes", - "game_over", "playable_30s", - "multi_line_clear", "score_scaling", "level_progression", "speed_progression", - "next_piece_preview", "game_over_display", "counter_clockwise_rotation", "soft_drop_distinct", -]; --- <Base title="Bot Calibration"> <h1 style="margin-bottom: 8px;">Bot Calibration</h1> - <p style="color: var(--text-muted); margin-bottom: 32px; font-size: 0.875rem;"> - Hand-picked games with human test results compared to bot results. Used to identify false positives and false negatives. + <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;"> + Hand-picked games with human test results compared to bot results. 
</p> - {comparisons.map(({ entry, run, botByName }) => { - const botScore = run?.eval_results?.gameplay_bot?.score; - const humanPass = Object.values(entry.human_tests).filter(v => v === true).length; - const humanFail = Object.values(entry.human_tests).filter(v => v === false).length; - const humanUnanswered = Object.values(entry.human_tests).filter(v => v === null).length; - const artifactUrl = `/artifacts/${entry.run_id}/index.html`; - - // Count agreements/disagreements - let agree = 0, disagree = 0, botOnly = 0, humanOnly = 0; - for (const name of allTestNames) { - const human = entry.human_tests[name]; - const bot = botByName.get(name); - const botPass = bot?.pass; - const botSkip = bot?.detail?.startsWith("skipped:"); - if (human === null && (!bot || botSkip)) continue; // both unanswered - if (human !== null && bot && !botSkip) { - if (human === botPass) agree++; - else disagree++; - } else if (human !== null && (!bot || botSkip)) { - humanOnly++; - } else if (human === null && bot && !botSkip) { - botOnly++; - } - } - - return ( - <div class="card" style="padding: 20px; margin-bottom: 20px;"> - <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 12px;"> - <div> - <h3 style="margin: 0 0 4px 0; font-size: 1rem;">{entry.label}</h3> - <div style="font-size: 0.7rem; color: var(--text-muted);"> - <a href={artifactUrl} target="_blank" style="color: var(--accent);">Play game</a> - {" | "} - <a href={`/r/${entry.short_id}`} style="color: var(--accent);">Run detail</a> - {" | "} - Tested {entry.human_tested_at} - </div> - </div> - <div style="display: flex; gap: 16px; font-size: 0.75rem; font-family: var(--font-mono);"> - <div style="text-align: center;"> - <div style="font-weight: 700; font-size: 1.1rem;">{humanPass}/{humanPass + humanFail}</div> - <div style="color: var(--text-muted); font-size: 0.6rem;">HUMAN</div> - </div> - <div style="text-align: center;"> - <div style="font-weight: 700; font-size: 
1.1rem;">{botScore != null ? `${Math.round(botScore * 100)}%` : "-"}</div> - <div style="color: var(--text-muted); font-size: 0.6rem;">BOT</div> - </div> - <div style="text-align: center;"> - <div style="font-weight: 700; font-size: 1.1rem; color: disagree > 0 ? 'var(--red)' : 'var(--green)'">{agree}/{agree + disagree}</div> - <div style="color: var(--text-muted); font-size: 0.6rem;">AGREE</div> - </div> - </div> - </div> - - {entry.notes && ( - <div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 12px; padding: 8px; background: hsl(var(--bg-secondary)); border-radius: 4px;"> - {entry.notes} - </div> - )} - - <table style="width: 100%; font-size: 0.7rem; border-collapse: collapse;"> - <thead> - <tr style="border-bottom: 1px solid var(--border);"> - <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Test</th> - <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Human</th> - <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 80px;">Bot</th> - <th style="text-align: center; padding: 4px 8px; font-weight: 600; width: 40px;"></th> - <th style="text-align: left; padding: 4px 8px; font-weight: 600;">Bot Detail</th> - </tr> - </thead> - <tbody> - {allTestNames.map(name => { - const human = entry.human_tests[name]; - const bot = botByName.get(name); - const botSkip = bot?.detail?.startsWith("skipped:"); - const humanStr = human === true ? "yes" : human === false ? "no" : "-"; - const humanColor = human === true ? "var(--green)" : human === false ? "var(--red)" : "var(--text-muted)"; - const botStr = bot ? (botSkip ? "skip" : bot.pass ? "yes" : "no") : "-"; - const botColor = bot ? (botSkip ? "var(--text-muted)" : bot.pass ? 
"var(--green)" : "var(--red)") : "var(--text-muted)"; - - // Agreement indicator - let matchIcon = ""; - let matchColor = "var(--text-muted)"; - if (human !== null && bot && !botSkip) { - if (human === bot.pass) { matchIcon = "="; matchColor = "var(--green)"; } - else { matchIcon = "!"; matchColor = "var(--red)"; } - } - - return ( - <tr style="border-bottom: 1px solid hsl(var(--border) / 0.3);"> - <td style="padding: 3px 8px; font-family: var(--font-mono);">{name}</td> - <td style={`text-align: center; padding: 3px 8px; color: ${humanColor}; font-weight: 600;`}>{humanStr}</td> - <td style={`text-align: center; padding: 3px 8px; color: ${botColor}; font-weight: 600;`}>{botStr}</td> - <td style={`text-align: center; padding: 3px 8px; color: ${matchColor}; font-weight: 700;`}>{matchIcon}</td> - <td style="padding: 3px 8px; color: var(--text-muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; max-width: 300px;">{bot?.detail || ""}</td> - </tr> - ); - })} - </tbody> - </table> - </div> - ); - })} - - {entries.length === 0 && ( - <div class="card" style="padding: 32px; text-align: center; color: var(--text-muted);"> - No calibration entries. Add JSON files to tasks/tetris/eval/gameplay-bot/calibration/ - </div> - )} + <Calibrate client:load comparisons={comparisons} /> </Base>

Impressum · Datenschutz