commit daf7a8d631b1f765ce5c665c04d380933ecdd207
parent c7d67a0208000a33a1957c4a7c045a94d3bf9427
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 14:02:48 +0200
Fix cell_id length, add SonarQube details, rebuild gameplay bot
- Abbreviate axis names in cell_id to stay under ext4 255-char limit
(257 chars -> 181 max). Fixes Plackett-Burman runs failing on long configs.
- Add SonarQube detail card on run page (ratings, bugs, smells, complexity)
- Iframe: single iframe with sandbox="allow-scripts" instead of double
iframe that caused CORS errors from null origin
- Gameplay bot: two-phase architecture (mechanics test, then play-to-win),
60 pieces / 45s extended play (was 30/20s), integrated score tracking
during play instead of separate 5-drop observation, 60ms polling (was 150ms)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
6 files changed, 158 insertions(+), 145 deletions(-)
diff --git a/dashboard/src/components/CellDetail.tsx b/dashboard/src/components/CellDetail.tsx
@@ -507,14 +507,14 @@ export default function CellDetail({ runs, axisValues }: CellDetailProps) {
</span>
</div>
<iframe
- srcDoc={`<!DOCTYPE html><html style="height:100%"><head><meta charset="UTF-8"></head><body style="margin:0;height:100%"><iframe src="${artifactUrl}" style="width:100%;height:100%;border:none" sandbox="allow-scripts"></iframe></body></html>`}
+ src={artifactUrl}
style={{
width: "100%",
height: "50vh",
border: "none",
background: "#fff",
}}
- sandbox="allow-scripts allow-same-origin"
+ sandbox="allow-scripts"
title={`Run #${r.meta.run_number} preview`}
/>
</div>
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -378,6 +378,36 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
</div>
)}
+ {/* SonarQube details */}
+ {(eval_results as Record<string, any>)?.sonarqube && !(eval_results as Record<string, any>).sonarqube.error && (
+ <div className="card" style={{ padding: "16px" }}>
+ <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>SonarQube</h4>
+ {(() => {
+ const sq = (eval_results as Record<string, any>).sonarqube;
+ const ratingColor = (r: string) => r === "A" ? "var(--green)" : r === "B" ? "var(--yellow)" : "var(--red)";
+ return (
+ <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}>
+ <div style={{ display: "flex", gap: "12px", marginBottom: "4px" }}>
+ {["maintainability", "reliability", "security"].map((k) => sq[k] && (
+ <div key={k} style={{ textAlign: "center" }}>
+ <div style={{ fontFamily: "var(--font-mono)", fontWeight: 700, fontSize: "1.1rem", color: ratingColor(sq[k]) }}>{sq[k]}</div>
+ <div style={{ fontSize: "0.55rem", color: "var(--text-muted)", textTransform: "capitalize" }}>{k}</div>
+ </div>
+ ))}
+ </div>
+ <Stat label="Bugs" value={sq.bugs ?? 0} />
+ <Stat label="Vulnerabilities" value={sq.vulnerabilities ?? 0} />
+ <Stat label="Code smells" value={sq.code_smells ?? 0} />
+ <Stat label="Cognitive complexity" value={sq.cognitive_complexity ?? "-"} />
+ <Stat label="Duplication" value={`${sq.duplication_pct ?? 0}%`} />
+ <Stat label="Tech debt" value={sq.tech_debt_minutes != null ? `${sq.tech_debt_minutes} min` : "-"} />
+ <Stat label="Lines analyzed" value={sq.lines_of_code ?? "-"} />
+ </div>
+ );
+ })()}
+ </div>
+ )}
+
{/* Quality details */}
{eval_results?.quality && (
<div className="card" style={{ padding: "16px" }}>
@@ -484,14 +514,14 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
</a>
</div>
<iframe
- srcDoc={`<!DOCTYPE html><html style="height:100%"><head><meta charset="UTF-8"></head><body style="margin:0;height:100%"><iframe src="${artifactUrl}" style="width:100%;height:100%;border:none" sandbox="allow-scripts"></iframe></body></html>`}
+ src={artifactUrl}
style={{
width: "100%",
height: "70vh",
border: "none",
background: "#fff",
}}
- sandbox="allow-scripts allow-same-origin"
+ sandbox="allow-scripts"
title="Result preview"
/>
</div>
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -17,6 +17,26 @@ from itertools import product
import yaml
+# Short axis names for cell_id to stay under the filesystem's per-name length limit (ext4: 255 bytes per file/directory name)
+AXIS_ABBREV = {
+ "context_file": "ctx",
+ "effort": "eff",
+ "human_language": "hlang",
+ "language": "lang",
+ "linter": "lint",
+ "max_budget": "budget",
+ "model": "model",
+ "playwright": "pw",
+ "prompt_style": "prompt",
+ "sub_agents": "agents",
+ "tool_edit": "tedit",
+ "tool_glob": "tglob",
+ "tool_grep": "tgrep",
+ "tool_read": "tread",
+ "tool_write": "twrite",
+ "web_search": "web",
+}
+
def load_grid(path):
with open(path) as f:
@@ -95,8 +115,8 @@ def compute_cells(grid, profile_name):
if excluded:
continue
- # Build cell ID from task + all axis values (deterministic, filename-safe)
- cell_id_parts = [task] + [f"{k}={cell[k]}" for k in axis_names]
+ # Build cell ID from task + "<abbreviated axis name>=<value>" pairs (deterministic, filename-safe)
+ cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={cell[k]}" for k in axis_names]
cell_id = "_".join(cell_id_parts)
# Resolve budget value
diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py
@@ -444,8 +444,9 @@ def _is_excluded(cell, grid):
def _build_cell(task, cell, defaults, grid):
+ from compute_grid import AXIS_ABBREV
axis_names = sorted(cell.keys())
- cell_id_parts = [task] + [f"{k}={cell[k]}" for k in axis_names]
+ cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={cell[k]}" for k in axis_names]
result = dict(cell)
result["task"] = task
diff --git a/tasks/tetris/eval/gameplay-bot/player.ts b/tasks/tetris/eval/gameplay-bot/player.ts
@@ -85,10 +85,11 @@ interface Placement {
export async function playGame(
page: Page,
cal: CalibrationResult,
- options: { maxPieces?: number; maxDurationMs?: number }
-): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number }> {
+ options: { maxPieces?: number; maxDurationMs?: number; scoreSelector?: string }
+): Promise<{ piecesPlaced: number; linesCleared: number; errors: number; gridReads: number; gridReadFails: number; scoreValues: number[] }> {
const maxPieces = options.maxPieces ?? 100;
const maxDuration = options.maxDurationMs ?? 30000;
+ const scoreSelector = options.scoreSelector ?? null;
const start = Date.now();
let piecesPlaced = 0;
let linesCleared = 0;
@@ -96,6 +97,8 @@ export async function playGame(
let gridReads = 0;
let gridReadFails = 0;
let consecutiveReadFails = 0;
+ const scoreValues: number[] = [];
+ let scorePollCounter = 0;
let previousGrid: Grid | null = null;
let settledGrid: Grid | null = null;
@@ -115,13 +118,29 @@ export async function playGame(
piecesPlaced += 3;
break;
}
- await page.waitForTimeout(150);
+ await page.waitForTimeout(60);
continue;
}
gridReads++;
consecutiveReadFails = 0;
+ // Lightweight score tracking: read score every ~5 polls
+ if (scoreSelector) {
+ scorePollCounter++;
+ if (scorePollCounter % 5 === 0) {
+ try {
+ const scoreText = await page.textContent(scoreSelector);
+ if (scoreText) {
+ const nums = (scoreText.match(/\d+/g) || []).map(Number);
+ if (nums.length > 0) {
+ scoreValues.push(Math.max(...nums));
+ }
+ }
+ } catch { /* ignore score read failures */ }
+ }
+ }
+
// Detect if anything changed
if (previousGrid && !gridsAreDifferent(grid, previousGrid)) {
// Nothing changed, wait and poll again
@@ -131,7 +150,7 @@ export async function playGame(
await page.keyboard.press(cal.controls.drop);
lastPlacementTime = Date.now();
}
- await page.waitForTimeout(150);
+ await page.waitForTimeout(60);
continue;
}
@@ -142,7 +161,7 @@ export async function playGame(
waitingForNewPiece = false;
lastPlacementTime = Date.now();
previousGrid = grid;
- await page.waitForTimeout(100);
+ await page.waitForTimeout(60);
continue;
}
@@ -169,7 +188,7 @@ export async function playGame(
}
// Wait for the piece to lock and next piece to spawn
- await page.waitForTimeout(200);
+ await page.waitForTimeout(100);
// Read the settled state
const afterGrid = await readGrid(page, cal);
@@ -197,16 +216,16 @@ export async function playGame(
}
previousGrid = grid;
- await page.waitForTimeout(150);
+ await page.waitForTimeout(60);
} catch {
errors++;
await playRandomMove(page, cal);
piecesPlaced++;
- await page.waitForTimeout(100);
+ await page.waitForTimeout(60);
}
}
- return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails };
+ return { piecesPlaced, linesCleared, errors, gridReads, gridReadFails, scoreValues };
}
/**
@@ -254,7 +273,7 @@ async function executePlacement(
// Hard drop
await page.keyboard.press(cal.controls.drop);
- await page.waitForTimeout(100);
+ await page.waitForTimeout(60);
}
/**
diff --git a/tasks/tetris/eval/gameplay-bot/tests.ts b/tasks/tetris/eval/gameplay-bot/tests.ts
@@ -107,28 +107,17 @@ export async function runAllTests(
await runBasicMechanicsPhase(page, cal, session);
}
- // ---- Phase 4: Multi-piece play session ----
- // Reload for clean state
+ // ---- Phase 4: Reload + calibrate for gameplay ----
try {
await loadGamePage(page, serverUrl);
cal = await calibrate(page);
session.started = session.started || cal.startMechanism !== "unknown";
} catch { /* continue with existing state */ }
- await runPlayPhase(page, cal, session, gameplay);
+ // ---- Phase 5: Extended gameplay with integrated score tracking ----
+ await runGameplayPhase(page, cal, session, gameplay);
- // ---- Phase 5: Line clear attempts ----
- try {
- await loadGamePage(page, serverUrl);
- cal = await calibrate(page);
- } catch { /* continue */ }
-
- await runLineClearPhase(page, cal, session, gameplay);
-
- // ---- Phase 6: Score observation ----
- await observeScore(page, cal, session, gameplay);
-
- // ---- Phase 7: Game over test ----
+ // ---- Phase 6: Game over test ----
try {
await loadGamePage(page, serverUrl);
cal = await calibrate(page);
@@ -136,7 +125,7 @@ export async function runAllTests(
await runGameOverPhase(page, cal, session);
- // ---- Phase 8: 30-second play test ----
+ // ---- Phase 7: 30-second endurance play ----
try {
await loadGamePage(page, serverUrl);
cal = await calibrate(page);
@@ -353,15 +342,16 @@ async function runBasicMechanicsPhase(
}
/**
- * Play multiple pieces and track what happens.
+ * Extended gameplay phase with integrated score tracking.
+ * Plays up to 60 pieces / 45 seconds using the AI, sampling the score on every
+ * fifth successful grid read when a score selector is known, and falls back to brute-force line clearing if needed.
*/
-async function runPlayPhase(
+async function runGameplayPhase(
page: Page,
cal: CalibrationResult,
session: GameSession,
gameplay: GameplayStats
): Promise<void> {
- // Drop 10 pieces to test multiple pieces mechanic
const gridBefore = await readGrid(page, cal);
const filledBefore = gridBefore ? countFilled(gridBefore) : 0;
if (gridBefore) {
@@ -371,69 +361,36 @@ async function runPlayPhase(
}
session.frames++;
- let settledGrid = gridBefore;
-
- for (let i = 0; i < 10; i++) {
- await hardDrop(page, cal);
- await page.waitForTimeout(300);
- gameplay.pieces_placed++;
- session.piecesLocked++;
-
- const grid = await readGrid(page, cal);
- if (grid) {
- session.gridReadSuccess++;
- session.frames++;
-
- // Detect piece type from diff
- if (settledGrid) {
- const cells = detectActivePieceCells(grid, settledGrid);
- if (cells) {
- const pt = identifyPieceType(cells);
- session.pieceTypes.add(pt);
- session.piecesSpawned++;
- }
- }
- settledGrid = grid;
- } else {
- session.gridReadFail++;
- session.frames++;
- }
- }
-
- const gridAfter = await readGrid(page, cal);
- if (gridAfter) {
- session.gridReadSuccess++;
- session.frames++;
- const filledAfter = countFilled(gridAfter);
- if (filledAfter > filledBefore) {
- session.events.push({
- type: "piece_locked",
- frame: session.frames,
- filledDelta: filledAfter - filledBefore,
- });
- }
+ // Read initial score before play begins
+ if (cal.scoreElementSelector) {
+ try {
+ const scoreText = await page.textContent(cal.scoreElementSelector);
+ const nums = extractScoreFromText(scoreText);
+ const val = Math.max(...nums);
+ session.scoreValues.push(val);
+ } catch { /* ignore */ }
}
-}
-/**
- * Attempt to clear lines using AI play and brute-force methods.
- */
-async function runLineClearPhase(
- page: Page,
- cal: CalibrationResult,
- session: GameSession,
- gameplay: GameplayStats
-): Promise<void> {
- const gridBefore = await readGrid(page, cal);
- const filledBefore = gridBefore ? countFilled(gridBefore) : 0;
-
- // Play strategically using the AI
- const result = await playGame(page, cal, { maxPieces: 30, maxDurationMs: 20000 });
+ // Play strategically using the AI with integrated score tracking
+ const result = await playGame(page, cal, {
+ maxPieces: 60,
+ maxDurationMs: 45000,
+ scoreSelector: cal.scoreElementSelector ?? undefined,
+ });
gameplay.pieces_placed += result.piecesPlaced;
gameplay.errors_during_play += result.errors;
session.gridReadSuccess += result.gridReads;
session.gridReadFail += result.gridReadFails;
session.frames += result.gridReads + result.gridReadFails;
+ session.piecesLocked += result.piecesPlaced;
+
+ // Merge score values collected during play
+ for (const sv of result.scoreValues) {
+ session.scoreValues.push(sv);
+ if (sv > gameplay.max_score_observed) {
+ gameplay.max_score_observed = sv;
+ }
+ }
if (result.linesCleared > 0) {
session.linesCleared += result.linesCleared;
@@ -443,40 +400,21 @@ async function runLineClearPhase(
}
}
- // If no lines cleared yet, try brute-force approach
- if (session.linesCleared === 0) {
- const cleared = await tryFillRow(page, cal, 10);
- gameplay.pieces_placed += 10;
- if (cleared) {
- session.linesCleared++;
- gameplay.lines_cleared++;
- session.events.push({ type: "line_cleared", count: 1, frame: session.frames });
- }
- }
-
- // Check if total filled decreased (indicates clearing happened)
- if (session.linesCleared === 0) {
- const gridAfter = await readGrid(page, cal);
- const filledAfter = gridAfter ? countFilled(gridAfter) : 0;
- if (filledAfter < filledBefore && filledBefore > 0) {
- session.linesCleared++;
- gameplay.lines_cleared++;
- session.events.push({ type: "line_cleared", count: 1, frame: session.frames });
- }
+ // Read final score after play
+ if (cal.scoreElementSelector) {
+ try {
+ const scoreText = await page.textContent(cal.scoreElementSelector);
+ const nums = extractScoreFromText(scoreText);
+ const val = Math.max(...nums);
+ session.scoreValues.push(val);
+ if (val > gameplay.max_score_observed) {
+ gameplay.max_score_observed = val;
+ }
+ } catch { /* ignore */ }
}
-}
-/**
- * Observe the score element during gameplay.
- */
-async function observeScore(
- page: Page,
- cal: CalibrationResult,
- session: GameSession,
- gameplay: GameplayStats
-): Promise<void> {
- if (!cal.scoreElementSelector) {
- // Try to find any number on the page that changes
+ // If no score element found, try to detect changing numbers on page
+ if (!cal.scoreElementSelector && session.scoreValues.length === 0) {
try {
const textBefore = await page.evaluate(() => document.body.innerText);
const numbersBefore = (textBefore.match(/\d+/g) || []).map(Number);
@@ -497,33 +435,38 @@ async function observeScore(
}
}
} catch { /* ignore */ }
- return;
}
- try {
- const scoreBefore = await page.textContent(cal.scoreElementSelector);
- const numsBefore = extractScoreFromText(scoreBefore);
- session.scoreValues.push(Math.max(...numsBefore));
+ // Record a piece_locked event for multi-piece detection; filledDelta is assumed (4 cells per piece), not measured from the grid
+ if (result.piecesPlaced > 0) {
+ session.events.push({
+ type: "piece_locked",
+ frame: session.frames,
+ filledDelta: result.piecesPlaced * 4,
+ });
+ }
- // Play a bit
- for (let i = 0; i < 5; i++) {
- await page.keyboard.press(cal.controls.drop);
- await page.waitForTimeout(300);
+ // If no lines cleared by AI, try brute-force approach
+ if (session.linesCleared === 0) {
+ const cleared = await tryFillRow(page, cal, 10);
+ gameplay.pieces_placed += 10;
+ if (cleared) {
+ session.linesCleared++;
+ gameplay.lines_cleared++;
+ session.events.push({ type: "line_cleared", count: 1, frame: session.frames });
}
+ }
- // Poll for score change
- for (let poll = 0; poll < 4; poll++) {
- await page.waitForTimeout(500);
- const scoreAfter = await page.textContent(cal.scoreElementSelector);
- const numsAfter = extractScoreFromText(scoreAfter);
- const maxAfter = Math.max(...numsAfter);
- session.scoreValues.push(maxAfter);
- if (maxAfter > gameplay.max_score_observed) {
- gameplay.max_score_observed = maxAfter;
- }
- if (maxAfter > Math.max(...numsBefore)) break;
+ // Check if total filled decreased (indicates clearing happened)
+ if (session.linesCleared === 0) {
+ const gridAfter = await readGrid(page, cal);
+ const filledAfter = gridAfter ? countFilled(gridAfter) : 0;
+ if (filledAfter < filledBefore && filledBefore > 0) {
+ session.linesCleared++;
+ gameplay.lines_cleared++;
+ session.events.push({ type: "line_cleared", count: 1, frame: session.frames });
}
- } catch { /* ignore */ }
+ }
}
/**