commit d7f2fbbca814d869ee1d4f46ef35b449c22ec226
parent 64c6b20e80b1e88b3b7649aa8941428f92699ab1
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 10:30:51 +0200
Add methodology page explaining scoring and experiment design
Static content page covering:
1. Framework: inputs (variables), outputs (code quality), outcomes (does it work)
2. Scoring: 50% gameplay bot + 50% quality
3. Output metrics: structural, code analysis, SonarQube, transcript
4. DOE: main effects, Plackett-Burman, interaction hunt
5. Gameplay bot: calibration, continuous observation, AI play, test derivation
6. Known limitations
Nav updated with vertical divider separating data pages from methodology.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 477 insertions(+), 0 deletions(-)
diff --git a/dashboard/src/pages/methodology.astro b/dashboard/src/pages/methodology.astro
@@ -0,0 +1,477 @@
+---
+import Base from "../layouts/Base.astro";
+---
+
+<Base title="Methodology">
+ <h1 style="margin-bottom: 8px;">Methodology</h1>
+ <p style="color: var(--text-muted); margin-bottom: 32px; font-size: 0.875rem;">
+ How the benchmark works, what it measures, and why.
+ </p>
+
+ <div class="methodology">
+
+ <!-- Framework -->
+ <section class="method-section">
+ <h2>Framework</h2>
+ <p>
+ The benchmark separates three concepts: what goes in, what comes out, and whether it works.
+ </p>
+
+ <div class="three-col">
+ <div class="card">
+ <h3>Inputs</h3>
+ <p>
+ The experiment variables -- model, effort level, tools, prompt style, programming language,
+ human language, budget, and more. These are the grid axes. Each unique combination of values
+ is a "cell" in the experiment matrix.
+ </p>
+ <p class="muted">16 axes. See the <a href="/compare">Compare</a> page for the full list.</p>
+ </div>
+ <div class="card">
+ <h3>Outputs</h3>
+ <p>
+ Measures of <em>how</em> the code was built. Code quality, structural integrity, agent
+ efficiency, complexity metrics. These tell you about the process and the codebase, not
+ whether the end product works.
+ </p>
+ <p class="muted">Tracked and displayed. Only the code quality checks described under scoring count toward the headline score.</p>
+ </div>
+ <div class="card">
+ <h3>Outcomes</h3>
+ <p>
+ Measures of <em>what</em> was delivered. Does the game load? Do the controls work?
+ Does it clear lines? Can you play for 30 seconds without crashing? These tell you
+ whether the agent succeeded at the task.
+ </p>
+ <p class="muted">This is the gameplay half of the headline score.</p>
+ </div>
+ </div>
+
+ <div class="callout">
+ The headline score is <strong>50% gameplay + 50% code quality</strong>.
+ All other output metrics are tracked separately so you can see the full picture without them
+ distorting the primary question: did it work?
+ </div>
+ </section>
+
+ <!-- Scoring -->
+ <section class="method-section">
+ <h2>How scoring works</h2>
+ <p>
+ All evaluation is deterministic code. No LLM grading. The agent never sees the test suite.
+ </p>
+
+ <div class="two-col">
+ <div class="card">
+ <h3>Gameplay Bot <span class="weight">50%</span></h3>
+ <p>
+ 16 automated Playwright tests. The bot calibrates itself to each game -- it finds the
+ grid, discovers controls, locates the start mechanism. Then it plays using a continuous
+ 150ms polling loop that reads grid state directly from the canvas or DOM.
+ </p>
+ <p>Tests cover:</p>
+ <ul>
+ <li>Game loads and starts</li>
+ <li>Auto-drop (gravity works)</li>
+ <li>Movement (left, right, down)</li>
+ <li>Rotation</li>
+ <li>Hard drop</li>
+ <li>Piece locking and new piece spawning</li>
+ <li>Line clearing</li>
+ <li>Score changes</li>
+ <li>Game over detection</li>
+ <li>30-second endurance (no crashes or freezes)</li>
+ </ul>
+ <p class="muted">
+ Every test is pure deterministic observation. The bot reads pixels or DOM state,
+ presses keys, and checks if the game responded correctly.
+ </p>
+ </div>
+ <div class="card">
+ <h3>Code Quality <span class="weight">50%</span></h3>
+ <p>
+ Automated quality checks run against the agent's output:
+ </p>
+ <ul>
+ <li><strong>ESLint</strong> -- errors and warnings counted</li>
+ <li><strong>TypeScript</strong> -- compilation success/failure</li>
+ <li><strong>Bundle size</strong> -- measured after build</li>
+ </ul>
+ <p class="muted">
+ Quality scoring rewards clean, buildable code. A game that works perfectly but has
+ 200 lint errors will score lower than one with clean code.
+ </p>
+ </div>
+ </div>
+ </section>
+
+ <!-- Output metrics -->
+ <section class="method-section">
+ <h2>Output metrics</h2>
+ <p>
+ These are tracked and displayed on each run's detail page, but they do not affect
+ the headline score. They provide context for understanding how the agent worked.
+ </p>
+
+ <div class="two-col">
+ <div class="card">
+ <h3>Structural</h3>
+ <ul>
+ <li>Entry point exists (index.html or equivalent)</li>
+ <li>Build succeeds</li>
+ <li>TypeScript compiles without errors</li>
+ </ul>
+ </div>
+
+ <div class="card">
+ <h3>Code Analysis</h3>
+ <ul>
+ <li>Function length (max lines per function)</li>
+ <li>Nesting depth (max indent level)</li>
+ <li>Naming consistency (camelCase vs mixed)</li>
+ <li>Separation of concerns (single-file vs modular)</li>
+ <li>Code duplication</li>
+ <li>HTML validation</li>
+ <li>Magic numbers</li>
+ </ul>
+ </div>
+
+ <div class="card">
+ <h3>SonarQube</h3>
+ <ul>
+ <li>Cognitive complexity</li>
+ <li>Bugs and vulnerabilities</li>
+ <li>Code smells</li>
+ <li>Maintainability rating</li>
+ <li>Reliability rating</li>
+ <li>Security rating</li>
+ </ul>
+ <p class="muted">Only runs when the SonarQube server is available locally.</p>
+ </div>
+
+ <div class="card">
+ <h3>Transcript Analysis</h3>
+ <ul>
+ <li>Tool call breakdown (which tools the agent used)</li>
+ <li>Wasted turns (reading docs, generating ASCII art, starting dev servers)</li>
+ <li>Productivity ratio (useful actions / total actions)</li>
+ <li>Self-testing (did the agent test its own code?)</li>
+ </ul>
+ <p class="muted">Measures agent efficiency, not code quality.</p>
+ </div>
+ </div>
+ </section>
+
+ <!-- DOE -->
+ <section class="method-section">
+ <h2>Experiment design</h2>
+ <p>
+ The full grid has 16 axes. A naive full factorial would produce over 200,000 cells.
+ Instead, we use statistical designs from Design of Experiments (DOE) to sample
+ the space efficiently.
+ </p>
+
+ <div class="designs">
+ <div class="card">
+ <h3>Main effects sweep</h3>
+ <p>
+ Vary one axis at a time from a baseline configuration, holding everything else constant.
+ This identifies which variables matter most for a given metric (score, cost, time).
+ </p>
+ <p class="muted">
+ ~18 cells. Efficient but cannot detect interactions between variables.
+ </p>
+ </div>
+
+ <div class="card">
+ <h3>Plackett-Burman</h3>
+ <p>
+ A screening design for binary factors. Tests many on/off variables simultaneously
+ using a mathematically constructed matrix that minimizes the number of runs needed
+ to estimate main effects.
+ </p>
+ <p class="muted">
+ Run count grows roughly linearly with the number of factors: N runs (N a multiple of 4)
+ can screen up to N-1 factors. Good for tool toggles.
+ </p>
+ </div>
+
+ <div class="card">
+ <h3>Interaction hunt</h3>
+ <p>
+ Full factorial on a small subset of axes (typically 2-4). Used after main effects
+ screening identifies the top variables, to find interactions between them.
+ </p>
+ <p class="muted">
+ Example: does the effect of prompt_style depend on model? Only a factorial can tell you.
+ </p>
+ </div>
+ </div>
+
+ <div class="callout">
+ Each cell runs <strong>3 times</strong> by default. Repeat trials let us measure variance
+ and distinguish real effects from noise. A variable that shifts the mean by 5 points is only
+ meaningful if the run-to-run spread (standard deviation) within a cell is clearly smaller than 5 points.
+ </div>
+ </section>
+
+ <!-- Gameplay bot -->
+ <section class="method-section">
+ <h2>How the gameplay bot works</h2>
+ <p>
+ The bot is a Playwright script that loads any Tetris implementation and figures out
+ how to interact with it. It handles different renderers, control schemes, and start
+ mechanisms without prior knowledge of the implementation.
+ </p>
+
+ <div class="phases">
+ <div class="phase">
+ <div class="phase-header">
+ <span class="phase-number">1</span>
+ <h3>Calibration</h3>
+ </div>
+ <p>
+ Detect the game grid (canvas, DOM, or SVG). Find controls by trying arrow keys,
+ WASD, and checking if pieces move. Locate the start mechanism (auto-start, button,
+ keypress). Find the score display element.
+ </p>
+ </div>
+
+ <div class="phase">
+ <div class="phase-header">
+ <span class="phase-number">2</span>
+ <h3>Observation</h3>
+ </div>
+ <p>
+ Continuous 150ms polling loop reads the grid state as a 10x20 boolean matrix.
+ For canvas games, this means sampling one pixel per cell and checking against a
+ calibrated background threshold. All state changes are recorded as events.
+ </p>
+ </div>
+
+ <div class="phase">
+ <div class="phase-header">
+ <span class="phase-number">3</span>
+ <h3>AI play</h3>
+ </div>
+ <p>
+ A 4-heuristic algorithm evaluates all possible placements for each piece:
+ </p>
+ <pre><code>score = -0.51 * height + 0.76 * lines - 0.36 * holes - 0.18 * bumpiness</code></pre>
+ <p class="muted">
+ Algorithm from <a href="https://github.com/LeeYiyuan/tetrisai" target="_blank" rel="noopener">LeeYiyuan/tetrisai</a> (MIT license).
+ The goal is not to play well -- it is to exercise all game mechanics.
+ </p>
+ </div>
+
+ <div class="phase">
+ <div class="phase-header">
+ <span class="phase-number">4</span>
+ <h3>Test derivation</h3>
+ </div>
+ <p>
+ 16 pass/fail results are derived from the recorded events. Each test is independent --
+ a failure in one does not affect others. The bot never crashes on a single test failure.
+ </p>
+ </div>
+ </div>
+
+ <div class="callout">
+ No false positives: the grid reader must confirm state changes through pixel/DOM inspection.
+ If grid detection fails entirely, the bot falls back to screenshot comparison and reports
+ grid-dependent tests as <strong>INCONCLUSIVE</strong> rather than passed.
+ </div>
+ </section>
+
+ <!-- Limitations -->
+ <section class="method-section">
+ <h2>Known limitations</h2>
+ <ul class="limitations">
+ <li>
+ <strong>Non-standard rendering.</strong> Games that draw to canvas without using standard
+ 2D context methods (e.g., WebGL-only) may not have their grid detected.
+ </li>
+ <li>
+ <strong>Score detection.</strong> The bot looks for elements containing "score" text or
+ standalone numbers that change during play. Non-standard layouts can cause misdetection.
+ </li>
+ <li>
+ <strong>Bot skill.</strong> The heuristic player tests mechanics, not mastery. It will
+ miss edge cases that only appear at high speeds or with unusual piece sequences.
+ </li>
+ <li>
+ <strong>SonarQube availability.</strong> SonarQube metrics only populate when the server
+ is running locally. Runs evaluated without it will have empty SonarQube sections.
+ </li>
+ <li>
+ <strong>Single task.</strong> Currently only the Tetris task is evaluated. Results may
+ not generalize to other task types (APIs, data pipelines, etc.).
+ </li>
+ <li>
+ <strong>Sample size.</strong> Statistical power depends on having enough runs. Small
+ sweeps can produce noisy main effect estimates.
+ </li>
+ </ul>
+ </section>
+
+ </div>
+</Base>
+
+<style>
+ .methodology {
+ max-width: 960px;
+ }
+
+ .method-section {
+ margin-bottom: 48px;
+ }
+
+ .method-section h2 {
+ margin-bottom: 12px;
+ color: hsl(var(--primary));
+ }
+
+ .method-section > p {
+ margin-bottom: 16px;
+ line-height: 1.7;
+ }
+
+ .three-col {
+ display: grid;
+ grid-template-columns: repeat(3, 1fr);
+ gap: 16px;
+ margin-bottom: 16px;
+ }
+
+ .two-col {
+ display: grid;
+ grid-template-columns: repeat(2, 1fr);
+ gap: 16px;
+ margin-bottom: 16px;
+ }
+
+ .designs {
+ display: grid;
+ grid-template-columns: repeat(3, 1fr);
+ gap: 16px;
+ margin-bottom: 16px;
+ }
+
+ .card h3 {
+ margin-bottom: 8px;
+ color: hsl(var(--foreground));
+ }
+
+ .card p {
+ margin-bottom: 8px;
+ line-height: 1.6;
+ }
+
+ .card ul {
+ margin: 8px 0;
+ padding-left: 20px;
+ }
+
+ .card li {
+ margin-bottom: 4px;
+ line-height: 1.5;
+ }
+
+ .weight {
+ color: hsl(var(--muted-foreground));
+ font-size: var(--text-label);
+ font-weight: 400;
+ }
+
+ .muted {
+ color: hsl(var(--muted-foreground));
+ font-size: 0.85rem;
+ }
+
+ .callout {
+ border-left: 3px solid hsl(var(--primary));
+ padding: 12px 16px;
+ background: hsl(var(--primary) / 0.04);
+ margin-bottom: 16px;
+ line-height: 1.6;
+ }
+
+ .phases {
+ display: flex;
+ flex-direction: column;
+ gap: 16px;
+ margin-bottom: 16px;
+ }
+
+ .phase {
+ background: hsl(var(--card));
+ border: 1px solid hsl(var(--border));
+ padding: 16px 20px;
+ }
+
+ .phase-header {
+ display: flex;
+ align-items: center;
+ gap: 12px;
+ margin-bottom: 8px;
+ }
+
+ .phase-number {
+ display: inline-flex;
+ align-items: center;
+ justify-content: center;
+ width: 28px;
+ height: 28px;
+ border: 1px solid hsl(var(--primary));
+ color: hsl(var(--primary));
+ font-weight: 600;
+ font-size: var(--text-ui);
+ flex-shrink: 0;
+ }
+
+ .phase-header h3 {
+ margin: 0;
+ color: hsl(var(--foreground));
+ }
+
+ .phase p {
+ margin-bottom: 8px;
+ line-height: 1.6;
+ }
+
+ .phase pre {
+ background: hsl(var(--smui-surface-2));
+ border: 1px solid hsl(var(--border));
+ padding: 10px 14px;
+ overflow-x: auto;
+ margin-bottom: 8px;
+ }
+
+ .phase code {
+ font-family: var(--font-mono);
+ font-size: var(--text-ui);
+ color: hsl(var(--smui-green));
+ }
+
+ .limitations {
+ list-style: none;
+ padding: 0;
+ }
+
+ .limitations li {
+ padding: 10px 0;
+ border-bottom: 1px solid hsl(var(--border));
+ line-height: 1.6;
+ }
+
+ .limitations li:last-child {
+ border-bottom: none;
+ }
+
+ @media (max-width: 768px) {
+ .three-col,
+ .two-col,
+ .designs {
+ grid-template-columns: 1fr;
+ }
+ }
+</style>