loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 7f450138909c4de86b94981aca1ebbb1a9defa82
parent 08a16207bbb7cb2cc8da116886da99149ea093da
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:17:46 +0200

Add HTML validation, duplication detection, accessibility, page load time

code-analysis.sh:
- HTML validation via html-validate (error count, pass/fail)
- Code duplication percentage via jscpd
- Both factor into the code analysis score

gameplay bot:
- Page load time measurement (ms)
- Lightweight accessibility checks: page title, headings, canvas
  aria-label, image alt text, color contrast
- Results included in bot JSON report
- Timeout bumped to 3 minutes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/code-analysis.sh | 47 +++++++++++++++++++++++++++++++++++++++++++++++
M tasks/tetris/eval/gameplay-bot/index.ts | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M tasks/tetris/eval/gameplay-bot/types.ts | 8 ++++++++
3 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/tasks/tetris/eval/code-analysis.sh b/tasks/tetris/eval/code-analysis.sh @@ -74,6 +74,40 @@ results=$(echo "$results" | jq --arg c "$complexity" '. + {complexity: $c}') console_logs=$(grep -r "console\.log" --include="*.ts" --include="*.js" . 2>/dev/null | grep -v node_modules | wc -l) results=$(echo "$results" | jq --argjson cl "$console_logs" '. + {console_logs: $cl}') +# --- HTML validation --- +html_valid="unknown" +html_errors=0 +if [ -f "index.html" ] || find . -name "*.html" -not -path "*/node_modules/*" | head -1 | grep -q .; then + npm install --save-dev html-validate > /dev/null 2>&1 + html_file=$(find . -name "index.html" -not -path "*/node_modules/*" | head -1) + if [ -n "$html_file" ]; then + html_output=$(npx html-validate --formatter json "$html_file" 2>/dev/null) || true + if echo "$html_output" | jq . > /dev/null 2>&1; then + html_errors=$(echo "$html_output" | jq '[.[].errorCount] | add // 0') + html_errors=${html_errors:-0} + if [ "$html_errors" -eq 0 ]; then + html_valid="true" + else + html_valid="false" + fi + fi + fi +fi +results=$(echo "$results" | jq --arg v "$html_valid" --argjson e "$html_errors" \ + '. + {html_validation: {valid: ($v == "true"), errors: $e}}') + +# --- Code duplication --- +duplication_pct=0 +npm install --save-dev jscpd > /dev/null 2>&1 +dupl_output=$(npx jscpd --min-lines 5 --min-tokens 50 --reporters json \ + --ignore "node_modules,package-lock.json" . 2>/dev/null) || true +if echo "$dupl_output" | jq . > /dev/null 2>&1; then + duplication_pct=$(echo "$dupl_output" | jq '.statistics.total.percentage // 0') + duplication_pct=${duplication_pct:-0} +fi +results=$(echo "$results" | jq --argjson d "$duplication_pct" \ + '. 
+ {duplication_percentage: $d}') + # --- Compute score --- # Scoring: fewer unnecessary files, fewer deps, moderate LOC, no debug noise score=100 @@ -100,6 +134,19 @@ if [ "$complexity" = "over-engineered" ]; then score=$((score - 10)) fi +# Penalty for invalid HTML (5 points) +if [ "$html_valid" = "false" ]; then + score=$((score - 5)) +fi + +# Penalty for high duplication (over 10%) +dup_int=$(awk "BEGIN {printf \"%d\", $duplication_pct}") +if [ "$dup_int" -gt 10 ]; then + score=$((score - 10)) +elif [ "$dup_int" -gt 5 ]; then + score=$((score - 5)) +fi + # Normalize to 0-1 score_normalized=$(awk "BEGIN {s = $score / 100; if (s < 0) s = 0; printf \"%.2f\", s}") diff --git a/tasks/tetris/eval/gameplay-bot/index.ts b/tasks/tetris/eval/gameplay-bot/index.ts @@ -92,10 +92,67 @@ test.describe("Tetris Gameplay Bot", () => { }); test("run gameplay bot", async ({ page }) => { - test.setTimeout(120_000); // 2-minute total timeout + test.setTimeout(180_000); // 3-minute total timeout + + // Measure page load time + let loadTimeMs = -1; + try { + const loadStart = Date.now(); + await page.goto(serverUrl, { waitUntil: "load", timeout: 10000 }); + loadTimeMs = Date.now() - loadStart; + // Navigate away so runAllTests starts fresh + await page.goto("about:blank"); + } catch { + // Load time measurement failed, not critical + } const { testResults, calibration, gameplay } = await runAllTests(page, serverUrl); + // Accessibility check via page evaluation (lightweight, no axe-core dependency) + let a11yIssues: string[] = []; + try { + await page.goto(serverUrl, { timeout: 10000 }); + await page.waitForTimeout(1000); + a11yIssues = await page.evaluate(() => { + const issues: string[] = []; + // Check page title + if (!document.title || document.title.trim() === "") { + issues.push("missing page title"); + } + // Check for headings + if (document.querySelectorAll("h1, h2, h3, [role='heading']").length === 0) { + issues.push("no headings found"); + } + // Check images have alt text 
+ document.querySelectorAll("img").forEach((img) => { + if (!img.alt && !img.getAttribute("aria-label")) { + issues.push("image without alt text"); + } + }); + // Check canvas has accessible label + document.querySelectorAll("canvas").forEach((canvas) => { + if (!canvas.getAttribute("aria-label") && !canvas.getAttribute("role")) { + issues.push("canvas without aria-label or role"); + } + }); + // Check for focus indicators + const focusable = document.querySelectorAll("button, a, input, [tabindex]"); + if (focusable.length === 0 && document.querySelectorAll("canvas").length === 0) { + issues.push("no focusable elements"); + } + // Check color contrast on text elements (basic check) + const body = window.getComputedStyle(document.body); + const bgColor = body.backgroundColor; + const textColor = body.color; + if (bgColor === textColor) { + issues.push("text color matches background color"); + } + return issues; + }); + } catch { + // a11y check failed, not critical + } + const passed = testResults.filter((t) => t.pass).length; const failed = testResults.filter((t) => !t.pass).length; const total = testResults.length; @@ -117,6 +174,14 @@ test.describe("Tetris Gameplay Bot", () => { score: total > 0 ? Math.round((passed / total) * 100) / 100 : 0, }, gameplay, + performance: { + load_time_ms: loadTimeMs, + }, + accessibility: { + issues: a11yIssues, + issue_count: a11yIssues.length, + pass: a11yIssues.length === 0, + }, }; // Write report to file diff --git a/tasks/tetris/eval/gameplay-bot/types.ts b/tasks/tetris/eval/gameplay-bot/types.ts @@ -81,6 +81,14 @@ export interface BotReport { score: number; }; gameplay: GameplayStats; + performance?: { + load_time_ms: number; + }; + accessibility?: { + issues: string[]; + issue_count: number; + pass: boolean; + }; } /** Context passed through calibration, play, and reporting phases. */

Impressum · Datenschutz