commit a1dcd8c3630ee94ad18d319107152852e745631c
parent 1afd77baba3aa07602ce3136076e22fa72f3c63a
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 07:46:41 +0200
Add code analysis and transcript analysis to eval pipeline
New eval dimensions:
- code-analysis.sh: file count, LOC, unnecessary files, dependencies,
complexity, console.log count
- transcript-analysis.py: tool call breakdown, wasted turns (docs,
ASCII art, server starts), error count, productivity ratio, self-testing
Both produce scored JSON and are factored into the overall score.
Scoring weights updated: functional 35%, quality 20%, code_analysis 15%,
structural 10%, transcript_analysis 10%.
Also includes gameplay bot spec for future Playwright-based testing
(Opus agent building it in background).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
6 files changed, 588 insertions(+), 15 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -254,6 +254,8 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
"structural": None,
"functional": None,
"quality": None,
+ "code_analysis": None,
+ "transcript_analysis": None,
"score": None,
}
@@ -266,24 +268,39 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
# Functional
tests_dir = task_dir / "eval" / "tests"
if tests_dir.is_dir():
- # Check for different test types
if (tests_dir / "functional.sh").exists():
output = run_eval_script(tests_dir / "functional.sh", workspace, language)
results["functional"] = safe_parse_json(output)
elif (tests_dir / "functional.spec.ts").exists():
- # Playwright tests - would need server setup, skip for now
results["functional"] = {"pass": False, "error": "playwright eval not yet wired", "score": 0}
elif (tests_dir / "functional.test.ts").exists():
- # vitest tests - would need server setup, skip for now
results["functional"] = {"pass": False, "error": "vitest eval not yet wired", "score": 0}
- # Quality
+ # Quality (lint, typecheck, bundle size)
quality_sh = task_dir / "eval" / "quality.sh"
if quality_sh.exists():
output = run_eval_script(quality_sh, workspace, language)
results["quality"] = safe_parse_json(output)
- # Compute weighted score
+ # Code analysis (file count, LOC, unnecessary files, dependencies)
+ code_analysis_sh = task_dir / "eval" / "code-analysis.sh"
+ if code_analysis_sh.exists():
+ output = run_eval_script(code_analysis_sh, workspace, language)
+ results["code_analysis"] = safe_parse_json(output)
+
+ # Transcript analysis (agent efficiency, wasted turns, self-testing)
+ transcript_py = task_dir / "eval" / "transcript-analysis.py"
+ if transcript_py.exists():
+ try:
+ result = subprocess.run(
+ ["python3", str(transcript_py), str(run_dir)],
+ capture_output=True, text=True, timeout=30,
+ )
+ results["transcript_analysis"] = safe_parse_json(result.stdout.strip())
+ except Exception as e:
+ results["transcript_analysis"] = {"error": str(e), "score": 0}
+
+ # Compute weighted score from scoring.yaml
try:
scoring_file = task_dir / "scoring.yaml"
if scoring_file.exists():
@@ -292,10 +309,20 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
weights = scoring.get("weights", {})
score = 0.0
+ total_weight = 0.0
for category, weight in weights.items():
cat_data = results.get(category)
if cat_data and isinstance(cat_data.get("score"), (int, float)):
score += cat_data["score"] * weight
+ total_weight += weight
+
+ # code_analysis and transcript_analysis fall back to a default 10% weight,
+ # but only when scoring.yaml does not weight them itself. If they appear in
+ # `weights`, the loop above has already counted them; adding them again
+ # here would double-count the category and push the overall score past the
+ # scale the weights were designed for.
+ for extra in ("code_analysis", "transcript_analysis"):
+     if extra in weights:
+         continue
+     cat_data = results.get(extra)
+     if cat_data and isinstance(cat_data.get("score"), (int, float)):
+         extra_weight = 0.1  # 10% each
+         score += cat_data["score"] * extra_weight
+         total_weight += extra_weight
results["score"] = round(score, 4)
except Exception:
diff --git a/tasks/tetris/eval/GAMEPLAY_BOT_SPEC.md b/tasks/tetris/eval/GAMEPLAY_BOT_SPEC.md
@@ -0,0 +1,218 @@
+# Tetris Gameplay Bot Spec
+
+## Purpose
+
+A Playwright-based bot that can load any Tetris implementation, figure out how to interact with it, play the game, and report which game mechanics work and which don't. It must handle wildly different implementations -- different DOM structures, canvas vs DOM rendering, different control schemes, start buttons vs auto-start, etc.
+
+## Architecture
+
+Three phases: **Calibration**, **Play**, **Report**.
+
+### Phase 1: Calibration
+
+The bot loads the page and figures out how to interact with this specific implementation.
+
+**1a. Start the game**
+
+Try multiple start mechanisms in order, checking after each if the game state changed:
+1. Wait 3 seconds (some games auto-start)
+2. Click the canvas or game container
+3. Press Enter
+4. Press Space
+5. Look for a button with text matching /start|play|begin|new game/i and click it
+6. Press any key
+
+After each attempt, take a screenshot and compare to the previous one. If pixels changed, the game has started.
+
+**1b. Locate the game grid**
+
+The grid could be:
+- A `<canvas>` element
+- A grid of `<div>` or `<td>` elements
+- An SVG
+
+Detection strategy:
+1. Check for a `<canvas>` element. If found, use `getImageData()` to read pixels.
+2. If no canvas, look for a grid-like DOM structure (many sibling elements in a container with grid/flex layout, or a table).
+3. Take a screenshot and look for a rectangular region with a grid pattern.
+
+Once found, determine:
+- Grid pixel bounds (x, y, width, height)
+- Cell size (width / 10, height / 20 for standard Tetris)
+- Sample one pixel per cell to build a 10x20 boolean matrix
+
+**1c. Detect controls**
+
+Default to standard controls: ArrowLeft, ArrowRight, ArrowDown, ArrowUp (rotate), Space (hard drop).
+
+Verify by:
+1. Read the page text/HTML for control instructions (look for "arrow", "wasd", "z", "x", "space", "rotate" etc.)
+2. Press ArrowLeft, take screenshot, check if a piece moved. If not, try "a".
+3. Press ArrowUp, take screenshot, check if a piece rotated. If not, try "z" or "x".
+
+Store the working key mappings.
+
+**1d. Locate score display**
+
+Scan the page for elements containing the text "score" (case insensitive) or elements that contain only a number that changes during gameplay.
+
+### Phase 2: Play
+
+A deterministic play session that exercises all game mechanics. Not trying to play well -- trying to test everything.
+
+**2a. Test Suite (sequential, do not stop on failure)**
+
+Each test captures before/after state and reports pass/fail independently.
+
+| # | Test | Method | Pass condition |
+|---|------|--------|----------------|
+| 1 | Game loads | Page loads without console errors | No uncaught exceptions in first 3s |
+| 2 | Game starts | Run calibration start sequence | Screenshot changes after start |
+| 3 | Auto-drop | Wait 5s with no input after start | Grid state changes (piece fell) |
+| 4 | Move left | Press left key | Grid state differs from before |
+| 5 | Move right | Press right key | Grid state differs from before |
+| 6 | Move down | Press down key | Grid state differs from before |
+| 7 | Rotate | Press rotate key | Grid state differs, piece shape changed |
+| 8 | Hard drop | Press hard drop key | Piece immediately at bottom, new piece appears |
+| 9 | Piece locks | Wait for a piece to reach bottom via auto-drop (no input for ~15s) | Grid has filled cells at bottom that persist |
+| 10 | New piece spawns | After piece locks, check top of grid | New piece appears at top |
+| 11 | Multiple pieces | Play 10 pieces (hard drop each) | Grid accumulates filled cells |
+| 12 | Line clear | Fill a complete row by strategic placement | At least one row disappears, cells above shift down |
+| 13 | Score changes | Check score element before and after line clear | Score value increased |
+| 14 | Game over | Stack pieces to the top rapidly | Game stops, some game-over indication |
+| 15 | Playable for 30s | Play normally for 30 seconds | No crashes, console errors, or freezes |
+
+**2b. Playing Strategy**
+
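+For tests that require actual gameplay (11, 12, 15), score every candidate placement with a four-feature board heuristic and choose the placement with the highest score. The features are aggregate column height, number of completed lines, number of holes (empty cells with a filled cell somewhere above them), and bumpiness (sum of height differences between adjacent columns):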
+
+```
+score = -0.51 * aggregateHeight + 0.76 * completeLines - 0.36 * holes - 0.18 * bumpiness
+```
+
+For each piece:
+1. Read current grid state (10x20 boolean matrix)
+2. Read current piece (detect from grid -- the moving cells)
+3. Try all (rotation, column) placements
+4. Score each resulting board
+5. Execute: rotate N times, move left/right, hard drop
+
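+If the bot can't read the grid reliably, fall back to a scripted input sequence: cycle through left, right, rotate, down in a fixed pattern. Keep this sequence fixed rather than random so that runs stay deterministic and reproducible.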
+
+**2c. Grid Reading**
+
+For canvas-based games:
+```js
+/**
+ * Read the playfield of a canvas-rendered game as a 20-row x 10-column
+ * boolean matrix (true = filled cell).
+ *
+ * One pixel is sampled at the centre of every cell and the cell is treated
+ * as filled when the sum of its R, G and B channels exceeds `threshold`.
+ *
+ * @param page      Playwright page showing the game.
+ * @param bounds    Object whose `x` / `y` give the grid's top-left corner.
+ *                  NOTE(review): getImageData works in the canvas's own pixel
+ *                  space, so these presumably need to be canvas coordinates
+ *                  rather than CSS/page coordinates - confirm in calibration.
+ * @param cellW     Width of one cell, same units as `bounds`.
+ * @param cellH     Height of one cell, same units as `bounds`.
+ * @param threshold Brightness cut-off between background and filled cells.
+ *                  Defaults to 100; pass the value calibrated from the empty
+ *                  grid in Phase 1 when one is available.
+ * @returns The boolean matrix, or null when the page has no canvas with a
+ *          readable 2D context, so the caller can skip grid-dependent tests
+ *          instead of crashing.
+ */
+async function readGrid(page, bounds, cellW, cellH, threshold = 100) {
+  return await page.evaluate(({ x, y, cellW, cellH, threshold }) => {
+    const canvas = document.querySelector('canvas');
+    // getContext('2d') returns null when the canvas already has a context of
+    // another type (e.g. WebGL), so both lookups need a guard.
+    const ctx = canvas ? canvas.getContext('2d') : null;
+    if (!ctx) {
+      return null;
+    }
+    const grid = [];
+    for (let row = 0; row < 20; row++) {
+      const rowData = [];
+      for (let col = 0; col < 10; col++) {
+        // Sample the centre of the cell to stay clear of grid lines/borders.
+        const px = Math.floor(x + col * cellW + cellW / 2);
+        const py = Math.floor(y + row * cellH + cellH / 2);
+        const pixel = ctx.getImageData(px, py, 1, 1).data;
+        // Consider a cell filled if it is brighter than the background
+        const brightness = pixel[0] + pixel[1] + pixel[2];
+        rowData.push(brightness > threshold);
+      }
+      grid.push(rowData);
+    }
+    return grid;
+  }, { x: bounds.x, y: bounds.y, cellW, cellH, threshold });
+}
+```
+
+For DOM-based games:
+```js
+// Find cells by their grid position, check background color or class
+```
+
+The background color threshold should be calibrated during Phase 1 by reading the empty grid.
+
+### Phase 3: Report
+
+Output a JSON report:
+
+```json
+{
+ "implementation": {
+ "renderer": "canvas|dom|svg",
+ "grid_detected": true,
+ "grid_bounds": { "x": 0, "y": 0, "width": 300, "height": 600 },
+ "controls": { "left": "ArrowLeft", "right": "ArrowRight", "rotate": "ArrowUp", "drop": "Space" },
+ "start_mechanism": "button|auto|keypress",
+ "score_element_found": true
+ },
+ "tests": [
+ { "name": "game_loads", "pass": true, "detail": "no console errors" },
+ { "name": "game_starts", "pass": true, "detail": "started via button click" },
+ { "name": "auto_drop", "pass": false, "detail": "piece did not move in 5 seconds" },
+ ...
+ ],
+ "summary": {
+ "total": 15,
+ "passed": 12,
+ "failed": 3,
+ "score": 0.80
+ },
+ "gameplay": {
+ "pieces_placed": 47,
+ "lines_cleared": 3,
+ "max_score_observed": 400,
+ "play_duration_seconds": 30,
+ "errors_during_play": 0
+ }
+}
+```
+
+## Error Handling
+
+- NEVER crash on a single test failure. Each test is independent.
+- If grid detection fails, skip grid-dependent tests but still test basic page load, console errors, and input response via screenshots.
+- If a test times out (e.g., waiting for auto-drop), mark it as failed and move on.
+- Capture all console errors throughout the session and include them in the report.
+- If the game page itself fails to load, report all tests as failed with the error.
+
+## File Structure
+
+```
+tasks/tetris/eval/
+ gameplay-bot/
+ index.ts # Main entry point, orchestrates calibration + play + report
+ calibrate.ts # Phase 1: detect grid, controls, start mechanism
+ grid-reader.ts # Read grid state from canvas or DOM
+ player.ts # Phase 2: heuristic AI + move execution
+ tests.ts # Individual test implementations
+ types.ts # Shared types
+ playwright.config.ts
+```
+
+## Dependencies
+
+- `@playwright/test` (already in the project)
+- No other dependencies. Pure Playwright + vanilla JS evaluation.
+
+## Integration
+
+The harness calls:
+```bash
+npx playwright test --config=tasks/tetris/eval/playwright.config.ts
+```
+
+The Playwright test:
+1. Starts an HTTP server for the workspace (serve static files)
+2. Runs the bot against the served game
+3. Writes the JSON report to a specified output path
+4. Exit code 0 regardless of test results (the report contains pass/fail)
+
+## Constraints
+
+- Must work with canvas-based AND DOM-based Tetris implementations
+- Must handle games that auto-start and games with start buttons
+- Must handle different control schemes
+- Must not depend on any specific DOM structure, class names, or IDs
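+- Each test has a timeout (default 10 seconds per test). Tests that deliberately wait longer than the default -- piece lock (~15 seconds of no input) and the 30-second play test -- need their own timeout set above the wait, so they are not cut off before they can pass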
+- Total bot runtime should be under 2 minutes per game
diff --git a/tasks/tetris/eval/code-analysis.sh b/tasks/tetris/eval/code-analysis.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Code analysis for generated Tetris implementations.
+# Measures code quality attributes that don't require running the game.
+#
+# Usage: code-analysis.sh <workspace_path> <language>
+# Output: JSON to stdout
+# Requires: jq, find, grep, awk
+
+WORKSPACE="$1"
+LANGUAGE="$2"  # accepted as the second argument; not used by this script
+
+# Emit a parseable error object rather than empty output when the workspace
+# cannot be entered. The non-zero exit status is kept.
+if ! cd "$WORKSPACE" 2>/dev/null; then
+  echo '{"error": "workspace not found", "score": 0}'
+  exit 1
+fi
+
+# find(1) over the workspace's regular files, pruning node_modules and .git
+# at ANY depth (not just the top level). Callers append their own predicates
+# and must finish with an action such as -print or -exec.
+ws_find() {
+  find . \( -name node_modules -o -name .git \) -prune -o -type f "$@"
+}
+
+# Count stdin lines, stripping the padding some wc implementations add.
+count_lines() {
+  wc -l | tr -d '[:space:]'
+}
+
+# --- File inventory ---
+total_files=$(ws_find -print | count_lines)
+code_files=$(ws_find \( -name "*.ts" -o -name "*.js" -o -name "*.html" -o -name "*.css" \) -print | count_lines)
+doc_files=$(ws_find \( -name "*.md" -o -name "*.txt" \) -print | count_lines)
+
+# Count specific unnecessary files (checked in the workspace root only)
+unnecessary=0
+unnecessary_list=""
+for pattern in README.md IMPLEMENTATION.md FEATURES.md QUICK_START.txt CHANGELOG.md TODO.md server.js; do
+  if [ -f "$pattern" ]; then
+    unnecessary=$((unnecessary + 1))
+    unnecessary_list="$unnecessary_list $pattern"
+  fi
+done
+
+# --- Lines of code ---
+# Newline count over all code files; -type f (via ws_find) keeps directories
+# that happen to match the name patterns out of cat.
+total_loc=$(ws_find \( -name "*.ts" -o -name "*.js" -o -name "*.html" -o -name "*.css" \) -exec cat {} + 2>/dev/null | count_lines)
+
+# --- Dependency count ---
+dep_count=0
+dev_dep_count=0
+if [ -f "package.json" ]; then
+  dep_count=$(jq '.dependencies // {} | length' package.json 2>/dev/null || echo 0)
+  dev_dep_count=$(jq '.devDependencies // {} | length' package.json 2>/dev/null || echo 0)
+fi
+
+# --- Single file vs multi file ---
+# For Tetris, a single HTML file with inline JS is perfectly valid
+# Over-engineering signal: more than 5 code files for a Tetris game
+if [ "$code_files" -le 2 ]; then
+  complexity="minimal"
+elif [ "$code_files" -le 5 ]; then
+  complexity="moderate"
+else
+  complexity="over-engineered"
+fi
+
+# --- Console.log count (debug noise) ---
+# Number of matching lines in .ts/.js files. Searching through ws_find keeps
+# the exclusion path-based; filtering grep output with `grep -v node_modules`
+# would also drop matches whose *text* mentions node_modules.
+console_logs=$(ws_find \( -name "*.ts" -o -name "*.js" \) -exec grep -h "console\.log" {} + 2>/dev/null | count_lines)
+
+# --- Compute score ---
+# Scoring: fewer unnecessary files, fewer deps, no debug noise, not over-engineered
+score=100
+
+# Penalty for unnecessary files (10 points each, max 30)
+penalty=$((unnecessary * 10))
+[ "$penalty" -gt 30 ] && penalty=30
+score=$((score - penalty))
+
+# Penalty for too many dependencies (5 points each over 2, max 20)
+if [ "$dep_count" -gt 2 ]; then
+  dep_penalty=$(( (dep_count - 2) * 5 ))
+  [ "$dep_penalty" -gt 20 ] && dep_penalty=20
+  score=$((score - dep_penalty))
+fi
+
+# Penalty for excessive console.logs (2 points each, max 10)
+log_penalty=$((console_logs * 2))
+[ "$log_penalty" -gt 10 ] && log_penalty=10
+score=$((score - log_penalty))
+
+# Penalty for over-engineering
+if [ "$complexity" = "over-engineered" ]; then
+  score=$((score - 10))
+fi
+
+# Normalize to 0-1
+score_normalized=$(awk -v s="$score" 'BEGIN {s = s / 100; if (s < 0) s = 0; printf "%.2f", s}')
+
+# --- Emit results ---
+jq -n \
+  --argjson total "$total_files" \
+  --argjson code "$code_files" \
+  --argjson docs "$doc_files" \
+  --argjson unnecessary "$unnecessary" \
+  --arg unnecessary_list "${unnecessary_list# }" \
+  --argjson loc "$total_loc" \
+  --argjson deps "$dep_count" \
+  --argjson devDeps "$dev_dep_count" \
+  --arg complexity "$complexity" \
+  --argjson cl "$console_logs" \
+  --argjson s "$score_normalized" \
+  '{
+    files: {total: $total, code: $code, docs: $docs, unnecessary: $unnecessary, unnecessary_list: $unnecessary_list},
+    lines_of_code: $loc,
+    dependencies: {production: $deps, dev: $devDeps, total: ($deps + $devDeps)},
+    complexity: $complexity,
+    console_logs: $cl,
+    score: $s
+  }'
diff --git a/tasks/tetris/eval/transcript-analysis.py b/tasks/tetris/eval/transcript-analysis.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Transcript analysis - measures agent efficiency from the conversation log."""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def _assistant_blocks(event):
+    """Return the dict-shaped content blocks of an assistant event.
+
+    Other event types, a ``content`` that is not a list (e.g. a plain
+    string) and non-dict blocks all yield nothing, so callers can iterate
+    the result without further type checks.
+    """
+    if event.get("type") != "assistant":
+        return []
+    message = event.get("message")
+    content = message.get("content") if isinstance(message, dict) else None
+    if not isinstance(content, list):
+        return []
+    return [block for block in content if isinstance(block, dict)]
+
+
+def main():
+    """Analyse ``<run_dir>/transcript.jsonl`` and print metrics as JSON.
+
+    The run directory is taken from ``sys.argv[1]`` (default: the current
+    directory). When the transcript is missing, an error object with a
+    score of 0 is printed instead. Lines that are not valid JSON, and JSON
+    values that are not objects, are skipped.
+
+    The score starts at 100, loses 5 points per wasted turn (capped at 25),
+    gains 10 (capped at 100) when any Bash command looks like a test/lint
+    run, and is reported on a 0-1 scale.
+    """
+    run_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
+    transcript = run_dir / "transcript.jsonl"
+
+    if not transcript.exists():
+        print(json.dumps({"error": "no transcript found", "score": 0}))
+        return
+
+    # Decode explicitly as UTF-8: the patterns below look for non-ASCII
+    # characters, and the locale's default encoding may not handle them.
+    raw = transcript.read_text(encoding="utf-8", errors="replace")
+
+    events = []
+    for line in raw.strip().split("\n"):
+        if not line.strip():
+            continue
+        try:
+            parsed = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        # Only JSON objects can be events; anything else would break .get().
+        if isinstance(parsed, dict):
+            events.append(parsed)
+
+    blocks = [block for ev in events for block in _assistant_blocks(ev)]
+
+    # Count tool calls by type
+    tool_calls = [b for b in blocks if b.get("type") == "tool_use"]
+    tool_names = [t.get("name", "") for t in tool_calls]
+
+    bash_commands = []
+    for call in tool_calls:
+        if call.get("name") != "Bash":
+            continue
+        tool_input = call.get("input")
+        command = tool_input.get("command", "") if isinstance(tool_input, dict) else ""
+        # The regexes below need a string; treat anything else as empty.
+        bash_commands.append(command if isinstance(command, str) else "")
+
+    # Count wasted turns
+    doc_patterns = re.compile(r"cat >.*?(README|IMPLEMENTATION|FEATURES|QUICK_START|CHANGELOG|TODO|\.txt)", re.I)
+    ascii_patterns = re.compile(r"(cat <<|echo).*[═╔╗╚╝║▓░█✓✅🎮]")
+    server_patterns = re.compile(r"node server|npm start|npx serve|http-server|python.*http")
+    test_patterns = re.compile(r"npm test|npx.*test|node.*test|tsc --noEmit|eslint")
+
+    wasted_docs = sum(1 for c in bash_commands if doc_patterns.search(c))
+    wasted_ascii = sum(1 for c in bash_commands if ascii_patterns.search(c))
+    wasted_server = sum(1 for c in bash_commands if server_patterns.search(c))
+    self_tested = sum(1 for c in bash_commands if test_patterns.search(c))
+
+    total_wasted = wasted_docs + wasted_ascii + wasted_server
+
+    # Count tool results (user events) that carry a non-empty stderr
+    errors = sum(
+        1
+        for ev in events
+        if ev.get("type") == "user"
+        and isinstance(ev.get("tool_use_result"), dict)
+        and ev["tool_use_result"].get("stderr")
+    )
+
+    # Count thinking and text blocks
+    thinking_blocks = sum(1 for b in blocks if b.get("type") == "thinking")
+    text_blocks = sum(1 for b in blocks if b.get("type") == "text")
+
+    # Productivity ratio. One command can match several waste patterns, so
+    # total_wasted may exceed the number of tool calls; clamp at zero to
+    # keep the ratio inside [0, 1].
+    total_tools = len(tool_calls)
+    productive = max(total_tools - total_wasted, 0)
+    productivity_ratio = round(productive / total_tools, 2) if total_tools > 0 else 0
+
+    # Score
+    score = 100
+    waste_penalty = min(total_wasted * 5, 25)
+    score -= waste_penalty
+    if self_tested > 0:
+        score = min(score + 10, 100)
+    score_normalized = round(score / 100, 2)
+
+    result = {
+        "total_events": len(events),
+        "tool_calls": {
+            "total": total_tools,
+            "bash": tool_names.count("Bash"),
+            "write": tool_names.count("Write"),
+            "edit": tool_names.count("Edit"),
+            "read": tool_names.count("Read"),
+        },
+        "wasted_turns": {
+            "total": total_wasted,
+            "docs": wasted_docs,
+            "ascii_art": wasted_ascii,
+            "server_starts": wasted_server,
+        },
+        "errors_encountered": errors,
+        "thinking_blocks": thinking_blocks,
+        "text_blocks": text_blocks,
+        "productivity_ratio": productivity_ratio,
+        "self_tested": self_tested > 0,
+        "score": score_normalized,
+    }
+
+    print(json.dumps(result, indent=2))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tasks/tetris/eval/transcript-analysis.sh b/tasks/tetris/eval/transcript-analysis.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Transcript analysis - measures agent efficiency from the conversation log.
+# Extracts behavioral metrics from how the agent worked, not what it produced.
+#
+# Usage: transcript-analysis.sh <run_dir>
+# Output: JSON to stdout
+# Requires: jq, grep, awk
+
+RUN_DIR="$1"
+TRANSCRIPT="$RUN_DIR/transcript.jsonl"
+
+if [ ! -f "$TRANSCRIPT" ]; then
+  echo '{"error": "no transcript found", "score": 0}'
+  exit 0
+fi
+
+# Count stdin lines matching an extended regex ($1).
+# `grep -c` already prints 0 when nothing matches but exits with status 1, so
+# the usual `grep -c ... || echo 0` produces the two-line value "0\n0", which
+# breaks both $(( )) arithmetic and jq --argjson. Capture the output and only
+# substitute a default when grep printed nothing at all.
+count_matching() {
+  local n
+  n=$(grep -cE -- "$1")
+  echo "${n:-0}"
+}
+
+# Count stdin lines, stripping the padding some wc implementations add.
+count_lines() {
+  wc -l | tr -d '[:space:]'
+}
+
+# jq filter selecting every tool_use block of every assistant event.
+# `objects` drops content entries that are not JSON objects.
+TOOL_USE='select(.type == "assistant") | .message.content[]? | objects | select(.type == "tool_use")'
+
+# --- Count events ---
+total_events=$(count_lines < "$TRANSCRIPT")
+
+# --- Count tool usage ---
+tool_names=$(jq -r "$TOOL_USE | .name" "$TRANSCRIPT" 2>/dev/null)
+total_tools=$(printf '%s\n' "$tool_names" | count_matching '.')
+bash_calls=$(printf '%s\n' "$tool_names" | count_matching '^Bash$')
+write_calls=$(printf '%s\n' "$tool_names" | count_matching '^Write$')
+edit_calls=$(printf '%s\n' "$tool_names" | count_matching '^Edit$')
+read_calls=$(printf '%s\n' "$tool_names" | count_matching '^Read$')
+
+# Raw text of every Bash command, extracted once and reused below.
+# NOTE: the pattern counts below are per matching LINE of this text, so a
+# multi-line command can contribute more than one match.
+bash_commands=$(jq -r "$TOOL_USE"' | select(.name == "Bash") | .input.command // ""' "$TRANSCRIPT" 2>/dev/null)
+
+# --- Detect wasted turns ---
+# File writes that are documentation, not code
+doc_patterns='README|IMPLEMENTATION|FEATURES|QUICK_START|CHANGELOG|TODO|\.txt'
+wasted_writes=$(printf '%s\n' "$bash_commands" | count_matching "cat >.*($doc_patterns)")
+
+# Turns spent printing ASCII art or decorative output
+ascii_art=$(printf '%s\n' "$bash_commands" | count_matching '(cat <<|echo).*[═╔╗╚╝║▓░█✓✅🎮]')
+
+# Turns spent starting a server (unnecessary for static games)
+server_starts=$(printf '%s\n' "$bash_commands" | count_matching 'node server|npm start|npx serve|http-server|python.*http')
+
+total_wasted=$((wasted_writes + ascii_art + server_starts))
+
+# --- Detect error-fix cycles ---
+# One count per tool result with a non-empty stderr. Emitting a constant per
+# result (instead of the stderr text) keeps multi-line stderr from being
+# counted once per line.
+errors=$(jq -c 'select(.type == "user") | .tool_use_result | objects | select((.stderr // "") != "") | 1' "$TRANSCRIPT" 2>/dev/null | count_lines)
+
+# --- Thinking / text output blocks ---
+# -c prints each block on a single line so that the line count equals the
+# block count; default pretty-printing spreads one block over many lines.
+thinking_blocks=$(jq -c 'select(.type == "assistant") | .message.content[]? | objects | select(.type == "thinking")' "$TRANSCRIPT" 2>/dev/null | count_lines)
+text_blocks=$(jq -c 'select(.type == "assistant") | .message.content[]? | objects | select(.type == "text")' "$TRANSCRIPT" 2>/dev/null | count_lines)
+
+# --- Productivity ratio ---
+# Pattern matches can outnumber tool calls, so clamp at zero to keep the
+# ratio inside [0, 1].
+productive_tools=$((total_tools - total_wasted))
+[ "$productive_tools" -lt 0 ] && productive_tools=0
+if [ "$total_tools" -gt 0 ]; then
+  productivity_ratio=$(awk -v p="$productive_tools" -v t="$total_tools" 'BEGIN {printf "%.2f", p / t}')
+else
+  productivity_ratio="0"
+fi
+
+# --- Self-testing ---
+# Did the agent try to test its own code?
+self_test=$(printf '%s\n' "$bash_commands" | count_matching 'npm test|npx.*test|node.*test|tsc --noEmit|eslint')
+
+# --- Score ---
+# Low wasted turns = good, self-testing = bonus
+score=100
+
+# Penalty for wasted turns (5 points each, max 25)
+waste_penalty=$((total_wasted * 5))
+[ "$waste_penalty" -gt 25 ] && waste_penalty=25
+score=$((score - waste_penalty))
+
+# Bonus for self-testing
+if [ "$self_test" -gt 0 ]; then
+  score=$((score + 10))
+fi
+[ "$score" -gt 100 ] && score=100
+
+score_normalized=$(awk -v s="$score" 'BEGIN {printf "%.2f", s / 100}')
+
+# --- Build and emit results ---
+jq -n \
+  --argjson total_events "$total_events" \
+  --argjson total_tools "$total_tools" \
+  --argjson bash "$bash_calls" \
+  --argjson write "$write_calls" \
+  --argjson edit "$edit_calls" \
+  --argjson read "$read_calls" \
+  --argjson wasted "$total_wasted" \
+  --argjson wasted_docs "$wasted_writes" \
+  --argjson wasted_ascii "$ascii_art" \
+  --argjson wasted_server "$server_starts" \
+  --argjson errors "$errors" \
+  --argjson thinking "$thinking_blocks" \
+  --argjson text "$text_blocks" \
+  --argjson productivity "$productivity_ratio" \
+  --argjson self_test "$self_test" \
+  --argjson score "$score_normalized" \
+  '{
+    total_events: $total_events,
+    tool_calls: {total: $total_tools, bash: $bash, write: $write, edit: $edit, read: $read},
+    wasted_turns: {total: $wasted, docs: $wasted_docs, ascii_art: $wasted_ascii, server_starts: $wasted_server},
+    errors_encountered: $errors,
+    thinking_blocks: $thinking,
+    text_blocks: $text,
+    productivity_ratio: $productivity,
+    self_tested: ($self_test > 0),
+    score: $score
+  }'
diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml
@@ -1,11 +1,7 @@
weights:
- functional: 0.50
- structural: 0.15
- quality: 0.35
-
-quality_weights:
- lint: 0.25
- typecheck: 0.20
- accessibility: 0.25
- performance: 0.15
- no_console_errors: 0.15
+ functional: 0.35
+ structural: 0.10
+ quality: 0.20
+ code_analysis: 0.15
+ transcript_analysis: 0.10
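+ # NOTE: the weights above currently sum to 0.90. The remaining 0.10 (taken
+ # out of functional's former share) is reserved for gameplay_bot, which
+ # will be added here once it is wired.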