run.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

run.py (46826B)
      1 #!/usr/bin/env python3
      2 """Loop Benchmarking Harness - Main orchestrator.
      3 
      4 Computes the experiment grid, creates isolated workspaces, invokes claude,
      5 runs evaluation, and stores results.
      6 
      7 Usage:
      8     python3 run.py [grid_file] [profile_or_design] [-j N]
      9 
     10     profile_or_design can be:
     11       - A profile name from grid.yaml (e.g., smoke, core, full)
     12       - A DOE design: main_effects, plackett_burman
     13       - interaction_hunt:axis1,axis2,axis3
     14 
     15     -j N: run N experiments in parallel (default 1)
     16     --model MODEL: set baseline model for main_effects sweep
     17     --provider PROVIDER: required -- anthropic or zai
     18     --reeval: re-evaluate all existing runs with latest eval scripts
     19     --analyze: run analysis and save results to results/analysis/
     20     --full-pipeline: reeval + analyze after sweep completes
     21 """
     22 
     23 import hashlib
     24 import json
     25 import os
     26 import shlex
     27 import signal
     28 import shutil
     29 import subprocess
     30 import sys
     31 import tarfile
     32 import tempfile
     33 import threading
     34 import time
     35 from concurrent.futures import ThreadPoolExecutor, as_completed
     36 from datetime import datetime, timezone
     37 from pathlib import Path
     38 
     39 SCRIPT_DIR = Path(__file__).resolve().parent
     40 PROJECT_DIR = SCRIPT_DIR.parent
     41 sys.path.insert(0, str(SCRIPT_DIR / "lib"))
     42 
     43 from compute_grid import load_grid, compute_cells
     44 from experiment_design import (
     45     main_effects_plan,
     46     plackett_burman_plan,
     47     interaction_hunt_plan,
     48     analyze_main_effects,
     49 )
     50 
# Axes whose values may contribute a prompt snippet. build_prompt() walks this
# list in order and appends the matching snippet (if any) for each axis, so the
# order here is the order in which snippets appear in the final prompt.
PROMPT_SNIPPET_ORDER = [
    "language", "renderer", "architecture", "design_guidance",
    "tests_provided", "strategy", "error_checking", "playwright",
]
     56 
# English prompt snippets, keyed by (axis, value). build_prompt() looks up each
# axis of a cell here; an (axis, value) pair with no entry adds nothing to the
# prompt.
PROMPT_SNIPPETS = {
    # language (existing, moved from build_prompt body)
    ("language", "typescript"): "Use TypeScript.",
    ("language", "javascript"): "Use JavaScript (no TypeScript).",

    # renderer
    ("renderer", "canvas"): "Render the game using HTML5 Canvas.",
    ("renderer", "svg"): "Render the game using SVG elements.",
    ("renderer", "dom"): "Render the game using DOM elements (divs/tables) with CSS for styling. Do not use Canvas or SVG.",
    ("renderer", "webgl"): "Render the game using WebGL.",

    # architecture
    ("architecture", "separation"): "Use separate files for every concern: game logic, rendering, input handling, scoring, and UI. No file should be longer than 150 lines.",
    ("architecture", "best_practices"): "Follow software engineering best practices throughout: SOLID principles, clean code, proper error handling, no magic numbers, meaningful variable names, and JSDoc comments on public functions.",

    # design_guidance
    ("design_guidance", "vague"): "Make it look visually polished and professional. The game should feel like a finished product, not a prototype.",
    ("design_guidance", "specific"): "Follow this design specification:\n- Dark background (#1a1a2e) with a subtle grid pattern\n- Neon-colored pieces with subtle glow effects (I=cyan, O=yellow, T=purple, S=green, Z=red, J=blue, L=orange)\n- Smooth drop and line-clear animations\n- Side panel with score, level, lines cleared, and next-piece preview\n- Use a monospace font for all numeric displays\n- Center the game board on the page with comfortable margins\n- Visible cell borders within the grid",

    # tests_provided
    ("tests_provided", "a_few"): "I have included a few Playwright tests in the tests/ directory. Your implementation should pass them. Do not modify the test files.",
    ("tests_provided", "many"): "I have included a comprehensive Playwright test suite in the tests/ directory. Your implementation should pass all of them. Do not modify the test files.",

    # strategy
    ("strategy", "plan_first"): "Before writing any code, explore the requirements thoroughly and create a detailed implementation plan. Think through the architecture, data structures, and edge cases. Only start coding once you have a clear plan.",
    ("strategy", "iterate"): "Build incrementally. Start with the simplest possible working version, then add features one at a time. After each change, test it to make sure everything still works. Do not stop iterating until the game is polished and complete.",
    ("strategy", "creative_validate"): "Think outside the box for ways to validate your implementation works flawlessly. Do not just assume it works. Find creative, unusual ways to verify correctness, test edge cases, and stress-test the game.",
    ("strategy", "use_subagents"): "Make use of sub-agents to help you work more effectively.",
    ("strategy", "delegate"): "Use sub-agents strategically: spawn a research agent to explore requirements and plan the architecture, a separate implementation agent for the core game logic, and a testing agent to verify the result. Coordinate their work.",
    ("strategy", "review"): "After building the game, spawn a separate sub-agent to thoroughly review your code for bugs, edge cases, and quality issues. Then fix every issue the reviewer finds.",
    ("strategy", "split_work"): "Divide the work among specialized sub-agents: one agent builds the UI and rendering, another builds the game logic and state management. Before they start, a third agent defines the interfaces between their components. After both are done, the third agent integrates and tests the final product.",

    # error_checking
    ("error_checking", "self_verify"): "After building the game, run it yourself. Open the page, check the browser console for errors, verify that pieces move and rotate correctly, test that lines clear and the score updates, and fix any issues you find. Do not declare the task complete until you have verified the game works end-to-end.",

    # playwright (only "instructed" adds a snippet)
    ("playwright", "instructed"): "Use Playwright to test your work as you build it. Write and run tests frequently to catch issues early.",
}
     95 
# Spanish prompt snippets, keyed by the same (axis, value) pairs as
# PROMPT_SNIPPETS. build_prompt() uses this table when the cell's
# human_language is "es". The strings below are written without accented
# characters.
PROMPT_SNIPPETS_ES = {
    # language
    ("language", "typescript"): "Usa TypeScript.",
    ("language", "javascript"): "Usa JavaScript (sin TypeScript).",
    # renderer
    ("renderer", "canvas"): "Renderiza el juego usando HTML5 Canvas.",
    ("renderer", "svg"): "Renderiza el juego usando elementos SVG.",
    ("renderer", "dom"): "Renderiza el juego usando elementos DOM (divs/tablas) con CSS. No uses Canvas ni SVG.",
    ("renderer", "webgl"): "Renderiza el juego usando WebGL.",
    # architecture
    ("architecture", "separation"): "Usa archivos separados para cada responsabilidad: logica del juego, renderizado, manejo de entrada, puntuacion e interfaz. Ningun archivo debe superar las 150 lineas.",
    ("architecture", "best_practices"): "Sigue las mejores practicas de ingenieria de software: principios SOLID, codigo limpio, manejo adecuado de errores, sin numeros magicos, nombres significativos y comentarios JSDoc en funciones publicas.",
    # design_guidance
    ("design_guidance", "vague"): "Hazlo visualmente pulido y profesional. El juego debe sentirse como un producto terminado, no un prototipo.",
    ("design_guidance", "specific"): "Sigue esta especificacion de diseno:\n- Fondo oscuro (#1a1a2e) con un patron de cuadricula sutil\n- Piezas de colores neon con efectos de brillo sutil (I=cyan, O=amarillo, T=morado, S=verde, Z=rojo, J=azul, L=naranja)\n- Animaciones suaves de caida y limpieza de lineas\n- Panel lateral con puntuacion, nivel, lineas eliminadas y vista previa de la siguiente pieza\n- Usa una fuente monoespaciada para todas las visualizaciones numericas\n- Centra el tablero en la pagina con margenes comodos\n- Bordes de celda visibles dentro de la cuadricula",
    # tests_provided
    ("tests_provided", "a_few"): "He incluido algunas pruebas de Playwright en el directorio tests/. Tu implementacion debe pasarlas. No modifiques los archivos de prueba.",
    ("tests_provided", "many"): "He incluido una suite completa de pruebas Playwright en el directorio tests/. Tu implementacion debe pasar todas. No modifiques los archivos de prueba.",
    # strategy
    ("strategy", "plan_first"): "Antes de escribir codigo, explora los requisitos a fondo y crea un plan de implementacion detallado. Piensa en la arquitectura, estructuras de datos y casos extremos. Solo comienza a codificar cuando tengas un plan claro.",
    ("strategy", "iterate"): "Construye incrementalmente. Comienza con la version funcional mas simple posible, luego agrega funciones una por una. Despues de cada cambio, prueba para asegurarte de que todo sigue funcionando. No dejes de iterar hasta que el juego este pulido y completo.",
    ("strategy", "creative_validate"): "Piensa de forma creativa en maneras de validar que tu implementacion funciona perfectamente. No asumas que funciona. Encuentra formas creativas e inusuales de verificar la correctitud, probar casos extremos y someter el juego a pruebas de estres.",
    ("strategy", "use_subagents"): "Usa sub-agentes para ayudarte a trabajar de manera mas efectiva.",
    ("strategy", "delegate"): "Usa sub-agentes estrategicamente: lanza un agente de investigacion para explorar requisitos y planificar la arquitectura, un agente de implementacion separado para la logica del juego, y un agente de pruebas para verificar el resultado. Coordina su trabajo.",
    ("strategy", "review"): "Despues de construir el juego, lanza un sub-agente separado para revisar tu codigo en busca de errores, casos extremos y problemas de calidad. Luego corrige cada problema que encuentre el revisor.",
    ("strategy", "split_work"): "Divide el trabajo entre sub-agentes especializados: uno construye la interfaz y el renderizado, otro construye la logica del juego y la gestion de estado. Antes de que comiencen, un tercer agente define las interfaces entre sus componentes. Despues de que ambos terminen, el tercer agente integra y prueba el producto final.",
    # error_checking
    ("error_checking", "self_verify"): "Despues de construir el juego, ejecutalo tu mismo. Abre la pagina, revisa la consola del navegador en busca de errores, verifica que las piezas se muevan y roten correctamente, prueba que las lineas se limpien y la puntuacion se actualice, y corrige cualquier problema que encuentres. No declares la tarea completa hasta que hayas verificado que el juego funciona de principio a fin.",
    # playwright
    ("playwright", "instructed"): "Usa Playwright para probar tu trabajo mientras lo construyes. Escribe y ejecuta pruebas con frecuencia para detectar problemas temprano.",
}
    119 
# Strategy values that enable the Agent tool.
# NOTE(review): "compete" is listed here but has no ("strategy", "compete")
# entry in the snippet tables, and nothing in the code visible in this part of
# the file reads this set -- confirm where it is consumed.
STRATEGIES_WITH_AGENTS = {"use_subagents", "delegate", "review", "compete", "split_work"}
    122 
    123 
    124 def create_workspace(project_dir: Path, task: str, cell: dict) -> Path:
    125     """Create an isolated temp directory with appropriate setup."""
    126     workspace = Path(tempfile.mkdtemp(prefix="loop-bench-"))
    127 
    128     language = cell.get("language", "typescript")
    129     linter = cell.get("linter", "off")
    130     playwright = cell.get("playwright", "off")
    131 
    132     # npm init
    133     subprocess.run(["npm", "init", "-y"], cwd=workspace, capture_output=True)
    134 
    135     # TypeScript (only pre-install if explicitly requested, not for "unspecified")
    136     if language == "typescript":
    137         subprocess.run(
    138             ["npm", "install", "--save-dev", "typescript", "@types/node"],
    139             cwd=workspace, capture_output=True,
    140         )
    141 
    142     # Linter
    143     if linter == "on":
    144         subprocess.run(
    145             ["npm", "install", "--save-dev", "eslint", "@eslint/js"],
    146             cwd=workspace, capture_output=True,
    147         )
    148 
    149     # Playwright
    150     if playwright in ("on", "available", "instructed"):
    151         subprocess.run(
    152             ["npm", "install", "--save-dev", "@playwright/test"],
    153             cwd=workspace, capture_output=True,
    154         )
    155         subprocess.run(
    156             ["npx", "playwright", "install", "chromium", "--with-deps"],
    157             cwd=workspace, capture_output=True,
    158         )
    159 
    160     # Copy fixtures
    161     fixtures_dir = project_dir / "tasks" / task / "fixtures"
    162     if fixtures_dir.is_dir():
    163         for item in fixtures_dir.iterdir():
    164             dest = workspace / item.name
    165             if item.is_dir():
    166                 shutil.copytree(item, dest)
    167             else:
    168                 shutil.copy2(item, dest)
    169 
    170     # Copy test fixtures if tests_provided
    171     tests_provided = cell.get("tests_provided", "none")
    172     if tests_provided != "none":
    173         variant = "tests-few" if tests_provided == "a_few" else "tests-full"
    174         tests_src = project_dir / "tasks" / task / "fixtures" / variant
    175         if tests_src.is_dir():
    176             shutil.copytree(tests_src, workspace / "tests")
    177 
    178     return workspace
    179 
    180 
    181 def build_prompt(project_dir: Path, cell: dict) -> str:
    182     """Read the base prompt file and append axis-specific snippets."""
    183     task = cell["task"]
    184     style = cell["prompt_style"]
    185     lang_code = cell["human_language"]
    186 
    187     prompt_file = project_dir / "tasks" / task / "prompts" / f"{style}.{lang_code}.md"
    188     prompt = prompt_file.read_text()
    189 
    190     # Select language-appropriate snippets
    191     snippets = PROMPT_SNIPPETS_ES if lang_code == "es" else PROMPT_SNIPPETS
    192 
    193     # Append snippets from each prompt-modifying axis
    194     for axis in PROMPT_SNIPPET_ORDER:
    195         value = cell.get(axis)
    196         if value is None:
    197             continue
    198         snippet = snippets.get((axis, value))
    199         if snippet:
    200             prompt += f"\n\n{snippet}"
    201 
    202     # Context noise prepended (not appended)
    203     noise_type = cell.get("context_noise", "clean")
    204     if noise_type != "clean":
    205         noise_file = project_dir / "tasks" / task / "noise" / f"{noise_type}.txt"
    206         if noise_file.exists():
    207             noise_text = noise_file.read_text()
    208             prompt = noise_text + "\n\n---\n\nNow for your actual task:\n\n" + prompt
    209 
    210     return prompt
    211 
    212 
    213 def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path, provider_config: dict = None) -> int:
    214     """Invoke claude CLI and capture output."""
    215     prompt = build_prompt(project_dir, cell)
    216     model = cell["model"]
    217     # Map display model name to CLI arg (e.g., "sonnet-4.6" -> "sonnet")
    218     cli_model_map = (provider_config or {}).get("cli_model_map", {})
    219     cli_model = cli_model_map.get(model, model)
    220     effort = cell.get("effort", "high")
    221     budget = cell.get("max_budget_usd", 0.50)
    222     timeout = cell.get("timeout_seconds", 600)
    223     # Build tool list from individual tool axes
    224     # Bash is always available - it's the agent's escape hatch
    225     tools_list = ["Bash"]
    226     if cell.get("tool_read", "on") == "on":
    227         tools_list.append("Read")
    228     if cell.get("tool_write", "on") == "on":
    229         tools_list.append("Write")
    230     if cell.get("tool_edit", "on") == "on":
    231         tools_list.append("Edit")
    232     if cell.get("tool_glob", "on") == "on":
    233         tools_list.append("Glob")
    234     if cell.get("tool_grep", "on") == "on":
    235         tools_list.append("Grep")
    236     if cell.get("sub_agents") == "on":
    237         tools_list.append("Agent")
    238     if cell.get("web_search") == "on":
    239         tools_list.extend(["WebSearch", "WebFetch"])
    240     tools = ",".join(tools_list)
    241 
    242     # For large prompts (noise cells), write to temp file and read via shell
    243     prompt_file = None
    244     if len(prompt) > 100000:
    245         prompt_file = Path(tempfile.mktemp(suffix=".txt", prefix="prompt-"))
    246         prompt_file.write_text(prompt)
    247 
    248     # Auth helper only for anthropic (reads OAuth from ~/.claude/.credentials.json).
    249     # Non-anthropic providers authenticate via ANTHROPIC_AUTH_TOKEN env var set below;
    250     # apiKeyHelper would override that env var and send the wrong credential.
    251     settings = {}
    252     if not provider_config or not provider_config.get("base_url"):
    253         settings["apiKeyHelper"] = str(SCRIPT_DIR / "lib" / "get-oauth-token.sh")
    254 
    255     # Build base command (prompt added separately for large prompts)
    256     cmd_base = [
    257         "claude",
    258         "--bare",
    259         "--model", cli_model,
    260         "--output-format", "stream-json",
    261         "--verbose",
    262         "--permission-mode", "dontAsk",
    263         "--max-budget-usd", str(budget),
    264         "--allowedTools", tools,
    265         "--settings", json.dumps(settings),
    266     ]
    267 
    268     if effort:
    269         cmd_base.extend(["--effort", effort])
    270 
    271     # Context file
    272     if cell.get("context_file") == "provided":
    273         ctx_file = project_dir / "tasks" / cell["task"] / "context.md"
    274         if ctx_file.exists():
    275             cmd_base.extend(["--append-system-prompt", ctx_file.read_text()])
    276 
    277     # Build final command: for large prompts, write a shell wrapper script
    278     if prompt_file:
    279         wrapper = Path(tempfile.mktemp(suffix=".sh", prefix="run-claude-"))
    280         wrapper_lines = ["#!/bin/bash"]
    281         wrapper_lines.append(f'PROMPT=$(cat {shlex.quote(str(prompt_file))})')
    282         wrapper_lines.append(" ".join(shlex.quote(c) for c in cmd_base) + ' -p "$PROMPT"')
    283         wrapper.write_text("\n".join(wrapper_lines))
    284         wrapper.chmod(0o755)
    285         cmd = ["bash", str(wrapper)]
    286     else:
    287         cmd = [*cmd_base, "-p", prompt]
    288 
    289     # Run claude
    290     transcript_path = run_dir / "transcript.jsonl"
    291     stderr_path = run_dir / "claude_stderr.log"
    292 
    293     # Inject harness metadata, prompt, and context as the first transcript entries
    294     with open(transcript_path, "w") as transcript_f:
    295         # Run configuration (everything the harness set up)
    296         config_event = {
    297             "type": "harness",
    298             "subtype": "config",
    299             "model": model,
    300             "effort": effort,
    301             "tools": tools.split(","),
    302             "max_budget_usd": budget,
    303             "timeout_seconds": timeout,
    304             "task": cell["task"],
    305             "language": cell.get("language"),
    306             "prompt_style": cell.get("prompt_style"),
    307         }
    308         transcript_f.write(json.dumps(config_event) + "\n")
    309 
    310         # The user's prompt
    311         prompt_event = {
    312             "type": "user",
    313             "subtype": "prompt",
    314             "message": {"role": "user", "content": prompt},
    315         }
    316         transcript_f.write(json.dumps(prompt_event) + "\n")
    317 
    318         # The context file if provided
    319         if cell.get("context_file") == "provided":
    320             ctx_file = project_dir / "tasks" / cell["task"] / "context.md"
    321             if ctx_file.exists():
    322                 ctx_event = {
    323                     "type": "user",
    324                     "subtype": "context",
    325                     "message": {"role": "system", "content": ctx_file.read_text()},
    326                 }
    327                 transcript_f.write(json.dumps(ctx_event) + "\n")
    328 
    329     # Provider-specific env overrides
    330     run_env = os.environ.copy()
    331     if provider_config:
    332         if provider_config.get("base_url"):
    333             run_env["ANTHROPIC_BASE_URL"] = provider_config["base_url"]
    334         else:
    335             run_env.pop("ANTHROPIC_BASE_URL", None)
    336         if provider_config.get("auth_token"):
    337             run_env["ANTHROPIC_AUTH_TOKEN"] = provider_config["auth_token"]
    338         elif provider_config.get("api_key_env"):
    339             key = os.environ.get(provider_config["api_key_env"])
    340             if key:
    341                 run_env["ANTHROPIC_AUTH_TOKEN"] = key
    342     else:
    343         run_env.pop("ANTHROPIC_BASE_URL", None)
    344 
    345     with open(transcript_path, "a") as transcript_f, open(stderr_path, "w") as stderr_f:
    346         try:
    347             result = subprocess.run(
    348                 cmd,
    349                 cwd=workspace,
    350                 stdout=transcript_f,
    351                 stderr=stderr_f,
    352                 timeout=timeout,
    353                 env=run_env,
    354             )
    355             exit_code = result.returncode
    356         except subprocess.TimeoutExpired:
    357             exit_code = 124  # Same as timeout(1) convention
    358 
    359     # Extract final result line
    360     output_path = run_dir / "claude_output.json"
    361     try:
    362         lines = transcript_path.read_text().strip().split("\n")
    363         if lines:
    364             output_path.write_text(lines[-1])
    365     except Exception:
    366         output_path.write_text("{}")
    367 
    368     return exit_code
    369 
    370 
    371 def run_eval_script(script: Path, workspace: Path, language: str) -> str:
    372     """Run a bash eval script and return its stdout."""
    373     try:
    374         result = subprocess.run(
    375             ["bash", str(script), str(workspace), language],
    376             capture_output=True, text=True, timeout=120,
    377         )
    378         return result.stdout.strip()
    379     except Exception as e:
    380         return json.dumps({"pass": False, "error": str(e)})
    381 
    382 
    383 def safe_parse_json(text: str, fallback_key: str = "error") -> dict:
    384     """Parse JSON, returning an error dict if parsing fails."""
    385     if not text:
    386         return {"pass": False, "error": "no output"}
    387     try:
    388         return json.loads(text)
    389     except json.JSONDecodeError:
    390         return {"pass": False, "error": text[:500]}
    391 
    392 
    393 def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
    394     """Run all evaluation scripts and write eval_results.json."""
    395     language = cell.get("language", "typescript")
    396 
    397     results = {
    398         "structural": None,
    399         "quality": None,
    400         "code_analysis": None,
    401         "transcript_analysis": None,
    402         "gameplay_bot": None,
    403         "outcome_score": None,
    404         "score": None,
    405     }
    406 
    407     # Structural
    408     structural_sh = task_dir / "eval" / "structural.sh"
    409     if structural_sh.exists():
    410         output = run_eval_script(structural_sh, workspace, language)
    411         results["structural"] = safe_parse_json(output)
    412 
    413     # Quality (lint, typecheck, bundle size)
    414     quality_sh = task_dir / "eval" / "quality.sh"
    415     if quality_sh.exists():
    416         output = run_eval_script(quality_sh, workspace, language)
    417         results["quality"] = safe_parse_json(output)
    418 
    419     # Code analysis (file count, LOC, unnecessary files, dependencies, quality metrics)
    420     code_analysis_py = task_dir / "eval" / "code-analysis.py"
    421     if code_analysis_py.exists():
    422         try:
    423             result = subprocess.run(
    424                 ["python3", str(code_analysis_py), str(workspace), language],
    425                 capture_output=True, text=True, timeout=120,
    426             )
    427             results["code_analysis"] = safe_parse_json(result.stdout.strip())
    428         except Exception as e:
    429             results["code_analysis"] = {"error": str(e), "score": 0}
    430     else:
    431         code_analysis_sh = task_dir / "eval" / "code-analysis.sh"
    432         if code_analysis_sh.exists():
    433             output = run_eval_script(code_analysis_sh, workspace, language)
    434             results["code_analysis"] = safe_parse_json(output)
    435 
    436     # Transcript analysis (agent efficiency, wasted turns, self-testing)
    437     transcript_py = task_dir / "eval" / "transcript-analysis.py"
    438     if transcript_py.exists():
    439         try:
    440             result = subprocess.run(
    441                 ["python3", str(transcript_py), str(run_dir)],
    442                 capture_output=True, text=True, timeout=30,
    443             )
    444             results["transcript_analysis"] = safe_parse_json(result.stdout.strip())
    445         except Exception as e:
    446             results["transcript_analysis"] = {"error": str(e), "score": 0}
    447 
    448     # Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
    449     # Use V2 bot (two-tier architecture) if available, fall back to V1
    450     gameplay_bot_v2 = task_dir / "eval" / "gameplay-bot-v2" / "index.ts"
    451     gameplay_bot_v1 = task_dir / "eval" / "gameplay-bot" / "index.ts"
    452     gameplay_bot_entry = gameplay_bot_v2 if gameplay_bot_v2.exists() else gameplay_bot_v1
    453     if gameplay_bot_entry.exists():
    454         # Pre-check: is there an HTML file to test?
    455         html_files = list(workspace.rglob("*.html"))
    456         html_files = [f for f in html_files if "node_modules" not in str(f)]
    457         if not html_files:
    458             results["gameplay_bot"] = {
    459                 "pass": False,
    460                 "score": 0,
    461                 "error": "no HTML files in workspace - game was not built",
    462             }
    463         else:
    464             report_path = run_dir / "gameplay-bot-report.json"
    465             bot_dir = gameplay_bot_entry.parent
    466             playwright_config = bot_dir / "playwright.config.ts"
    467             try:
    468                 bot_env = os.environ.copy()
    469                 bot_env["WORKSPACE_PATH"] = str(workspace)
    470                 bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
    471                 bot_proc = subprocess.Popen(
    472                     ["npx", "playwright", "test", "--config", str(playwright_config)],
    473                     cwd=str(PROJECT_DIR),
    474                     stdout=subprocess.PIPE,
    475                     stderr=subprocess.PIPE,
    476                     text=True,
    477                     env=bot_env,
    478                     start_new_session=True,
    479                 )
    480                 try:
    481                     stdout, stderr = bot_proc.communicate(timeout=900)
    482                 except subprocess.TimeoutExpired:
    483                     # Kill entire process group (playwright + child serve processes)
    484                     try:
    485                         os.killpg(os.getpgid(bot_proc.pid), signal.SIGTERM)
    486                     except Exception:
    487                         pass
    488                     bot_proc.kill()
    489                     bot_proc.wait()
    490                     if report_path.exists():
    491                         report_data = json.loads(report_path.read_text())
    492                         summary = report_data.get("summary", {})
    493                         results["gameplay_bot"] = {
    494                             "pass": summary.get("failed", 1) == 0,
    495                             "score": summary.get("score", 0),
    496                             "total": summary.get("total", 0),
    497                             "passed": summary.get("passed", 0),
    498                             "failed": summary.get("failed", 0),
    499                             "report": report_data,
    500                             "timed_out": True,
    501                         }
    502                     else:
    503                         results["gameplay_bot"] = {
    504                             "pass": False,
    505                             "score": 0,
    506                             "error": "Gameplay bot timed out after 900 seconds",
    507                         }
    508                 else:
    509                     if report_path.exists():
    510                         report_data = json.loads(report_path.read_text())
    511                         summary = report_data.get("summary", {})
    512                         results["gameplay_bot"] = {
    513                             "pass": summary.get("failed", 1) == 0,
    514                             "score": summary.get("score", 0),
    515                             "total": summary.get("total", 0),
    516                             "passed": summary.get("passed", 0),
    517                             "failed": summary.get("failed", 0),
    518                             "report": report_data,
    519                         }
    520                     else:
    521                         results["gameplay_bot"] = {
    522                             "pass": False,
    523                             "score": 0,
    524                             "error": f"Report file not created. Exit code: {bot_proc.returncode}. "
    525                                      f"stderr: {stderr[:1000] if stderr else ''}",
    526                         }
    527                 finally:
    528                     # Always clean up the process group to prevent orphaned serve processes
    529                     try:
    530                         os.killpg(os.getpgid(bot_proc.pid), signal.SIGTERM)
    531                     except Exception:
    532                         pass
    533             except FileNotFoundError:
    534                 results["gameplay_bot"] = {
    535                     "pass": False,
    536                     "score": 0,
    537                     "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
    538                 }
    539             except Exception as e:
    540                 results["gameplay_bot"] = {
    541                     "pass": False,
    542                     "score": 0,
    543                     "error": str(e),
    544                 }
    545 
    546     # SonarQube analysis (if SonarQube is running)
    547     sonar_script = task_dir / "eval" / "sonarqube-scan.py"
    548     if sonar_script.exists():
    549         # Use cell_id + run_number as unique project key
    550         project_key = f"tetris-{run_dir.name}"[:250].replace("=", "-").replace("/", "-")
    551         try:
    552             result = subprocess.run(
    553                 ["python3", str(sonar_script), str(workspace), project_key],
    554                 capture_output=True, text=True, timeout=90,
    555             )
    556             results["sonarqube"] = safe_parse_json(result.stdout.strip())
    557         except subprocess.TimeoutExpired:
    558             results["sonarqube"] = {"error": "SonarQube scan timed out", "score": 0}
    559         except Exception as e:
    560             results["sonarqube"] = {"error": str(e), "score": 0}
    561 
    562     # Compute outcome score from scoring.yaml (gameplay_bot + quality only)
    563     try:
    564         scoring_file = task_dir / "scoring.yaml"
    565         if scoring_file.exists():
    566             import yaml
    567             scoring = yaml.safe_load(scoring_file.read_text())
    568             outcome_weights = scoring.get("outcome_weights", {})
    569 
    570             score = 0.0
    571             total_weight = 0.0
    572             for category, weight in outcome_weights.items():
    573                 cat_data = results.get(category)
    574                 if cat_data and isinstance(cat_data.get("score"), (int, float)):
    575                     score += cat_data["score"] * weight
    576                     total_weight += weight
    577 
    578             if total_weight > 0:
    579                 results["outcome_score"] = round(score / total_weight, 4)
    580             else:
    581                 results["outcome_score"] = 0
    582 
    583             # Alias so existing code that reads "score" keeps working
    584             results["score"] = results["outcome_score"]
    585     except Exception:
    586         pass
    587 
    588     (run_dir / "eval_results.json").write_text(json.dumps(results, indent=2))
    589 
    590 
    591 def archive_workspace(workspace: Path, run_dir: Path):
    592     """Archive workspace, extract to dashboard artifacts, then delete."""
    593     archive_path = run_dir / "workspace.tar.gz"
    594     try:
    595         with tarfile.open(archive_path, "w:gz") as tar:
    596             tar.add(workspace, arcname=workspace.name,
    597                      filter=lambda t: None if "node_modules" in t.name else t)
    598     except Exception:
    599         pass
    600 
    601     # Extract to dashboard/public/artifacts/ for iframe preview
    602     run_id = run_dir.name
    603     artifacts_dir = PROJECT_DIR / "artifacts" / run_id
    604     try:
    605         if artifacts_dir.exists():
    606             shutil.rmtree(artifacts_dir)
    607         # Copy workspace contents (excluding node_modules)
    608         shutil.copytree(
    609             workspace, artifacts_dir,
    610             ignore=shutil.ignore_patterns("node_modules", ".git", "__pycache__", "report"),
    611         )
    612         # Ensure index.html exists at root
    613         if not (artifacts_dir / "index.html").exists():
    614             for html in artifacts_dir.rglob("*.html"):
    615                 shutil.copy2(html, artifacts_dir / "index.html")
    616                 break
    617     except Exception:
    618         pass
    619 
    620     try:
    621         shutil.rmtree(workspace)
    622     except Exception:
    623         pass
    624 
    625 
    626 _print_lock = threading.Lock()
    627 _index_lock = threading.Lock()
    628 _counter_lock = threading.Lock()
    629 
    630 
    631 def log(msg: str):
    632     with _print_lock:
    633         print(msg, flush=True)
    634 
    635 
    636 def is_valid_run(run_dir: Path) -> bool:
    637     """Check whether a completed run directory contains valid results.
    638 
    639     Returns False (invalid) only for unambiguous failures:
    640     - claude_output.json missing entirely
    641     - num_turns is 0 or null (no work done at all)
    642     - "Invalid API key" in the result field
    643     - transcript.jsonl missing or empty
    644 
    645     Does NOT reject: turns=1 (GLM models complete in 1 turn),
    646     timeouts (may have produced valid work), cost=0 with turns>0.
    647     """
    648     # Check transcript exists
    649     transcript_path = run_dir / "transcript.jsonl"
    650     if not transcript_path.exists():
    651         return False
    652     try:
    653         lines = transcript_path.read_text().strip().split("\n")
    654         if len(lines) < 3:
    655             return False
    656     except OSError:
    657         return False
    658 
    659     # Check claude_output.json
    660     output_path = run_dir / "claude_output.json"
    661     if not output_path.exists():
    662         return False
    663 
    664     try:
    665         output = json.loads(output_path.read_text())
    666     except (json.JSONDecodeError, OSError):
    667         return False
    668 
    669     # num_turns: 0 or null = no work done
    670     num_turns = output.get("num_turns")
    671     if num_turns is None or num_turns == 0:
    672         return False
    673 
    674     # "Invalid API key" in result field
    675     result_text = output.get("result", "")
    676     if isinstance(result_text, str) and "Invalid API key" in result_text:
    677         return False
    678 
    679     return True
    680 
    681 
    682 def run_single(
    683     cell: dict,
    684     run_num: int,
    685     results_dir: Path,
    686     project_dir: Path,
    687     claude_version: str,
    688     providers_config: dict = None,
    689 ) -> str:
    690     """Execute a single experiment run. Returns 'completed', 'skipped', or 'failed'."""
    691     cell_id = cell["cell_id"]
    692     task = cell["task"]
    693     model = cell["model"]
    694     prompt_style = cell["prompt_style"]
    695     run_id = f"{cell_id}_run{run_num}"
    696     run_dir = results_dir / "runs" / run_id
    697 
    698     # Resume support: skip only if the run completed AND is valid
    699     if (run_dir / "eval_results.json").exists():
    700         if is_valid_run(run_dir):
    701             log(f"SKIP: {run_id}")
    702             return "skipped"
    703         else:
    704             log(f"INVALID: {run_id} - deleting and re-running")
    705             shutil.rmtree(run_dir)
    706 
    707     # Resolve provider config
    708     provider_name = cell.get("provider", "anthropic")
    709     provider_config = (providers_config or {}).get(provider_name) or {}
    710 
    711     log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
    712 
    713     run_dir.mkdir(parents=True, exist_ok=True)
    714 
    715     # Save meta
    716     meta = {
    717         **cell,
    718         "run_id": run_id,
    719         "short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8],
    720         "short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8],
    721         "run_number": run_num,
    722         "actual_model": model,
    723         "claude_version": claude_version,
    724         "started_at": datetime.now(timezone.utc).isoformat(),
    725     }
    726     (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))
    727 
    728     # Create workspace
    729     try:
    730         workspace = create_workspace(project_dir, task, cell)
    731     except Exception as e:
    732         log(f"  ERROR creating workspace for {run_id}: {e}")
    733         return "failed"
    734 
    735     # Invoke claude
    736     start_time = time.time()
    737     exit_code = invoke_claude(cell, workspace, run_dir, project_dir, provider_config)
    738     wall_time = int(time.time() - start_time)
    739 
    740     status = "ok" if exit_code == 0 else f"exit {exit_code}"
    741 
    742     # Update meta with timing
    743     meta["wall_time_seconds"] = wall_time
    744     meta["exit_code"] = exit_code
    745     meta["completed_at"] = datetime.now(timezone.utc).isoformat()
    746     (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))
    747 
    748     # Guard: if claude produced nothing (0 turns), discard the run
    749     output_path = run_dir / "claude_output.json"
    750     if output_path.exists():
    751         try:
    752             output = json.loads(output_path.read_text())
    753             if (output.get("num_turns") or 0) == 0:
    754                 log(f"  DISCARD: {run_id} - 0 turns (no work done)")
    755                 shutil.rmtree(run_dir, ignore_errors=True)
    756                 shutil.rmtree(workspace, ignore_errors=True)
    757                 return "failed"
    758         except Exception:
    759             pass
    760 
    761     # Evaluate
    762     task_dir = project_dir / "tasks" / task
    763     evaluate(task_dir, workspace, cell, run_dir)
    764 
    765     # Append to index (thread-safe)
    766     index_entry = {
    767         "run_id": run_id,
    768         "task": task,
    769         "model": model,
    770         "cell_id": cell_id,
    771         "short_id": meta["short_id"],
    772         "short_cell_id": meta["short_cell_id"],
    773         "completed_at": meta["completed_at"],
    774     }
    775     with _index_lock:
    776         with open(results_dir / "index.jsonl", "a") as f:
    777             f.write(json.dumps(index_entry) + "\n")
    778 
    779     # Archive and cleanup
    780     archive_workspace(workspace, run_dir)
    781 
    782     result = "completed" if (run_dir / "eval_results.json").exists() else "failed"
    783     log(f"  DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
    784     return result
    785 
    786 
    787 def main():
    788     # Parse args
    789     args = sys.argv[1:]
    790     parallel = 1
    791     baseline_model = None
    792     provider_filter = None
    793     max_runs = None
    794     commit_every = None
    795     override_runs_per_cell = None
    796     grid_file = str(PROJECT_DIR / "grid.yaml")
    797     profile = "smoke"
    798 
    799     do_reeval = False
    800     do_analyze = False
    801 
    802     i = 0
    803     positional = []
    804     while i < len(args):
    805         if args[i] == "-j" and i + 1 < len(args):
    806             parallel = int(args[i + 1])
    807             i += 2
    808         elif args[i] == "--model" and i + 1 < len(args):
    809             baseline_model = args[i + 1]
    810             i += 2
    811         elif args[i] == "--provider" and i + 1 < len(args):
    812             provider_filter = args[i + 1]
    813             i += 2
    814         elif args[i] in ("-n", "--max-runs") and i + 1 < len(args):
    815             max_runs = int(args[i + 1])
    816             i += 2
    817         elif args[i] == "--commit-every" and i + 1 < len(args):
    818             commit_every = int(args[i + 1])
    819             i += 2
    820         elif args[i] == "--runs-per-cell" and i + 1 < len(args):
    821             override_runs_per_cell = int(args[i + 1])
    822             i += 2
    823         elif args[i] == "--reeval":
    824             do_reeval = True
    825             i += 1
    826         elif args[i] == "--analyze":
    827             do_analyze = True
    828             i += 1
    829         elif args[i] == "--full-pipeline":
    830             do_reeval = True
    831             do_analyze = True
    832             i += 1
    833         else:
    834             positional.append(args[i])
    835             i += 1
    836 
    837     if len(positional) >= 1:
    838         grid_file = positional[0]
    839     if len(positional) >= 2:
    840         profile = positional[1]
    841 
    842     results_dir = PROJECT_DIR / "results"
    843     results_dir.mkdir(exist_ok=True)
    844     (results_dir / "runs").mkdir(exist_ok=True)
    845 
    846     # Preflight
    847     if shutil.which("claude") is None:
    848         print("ERROR: claude CLI not found in PATH.")
    849         sys.exit(1)
    850 
    851     # Capture claude version for metadata
    852     claude_version = "unknown"
    853     try:
    854         result = subprocess.run(
    855             ["claude", "--version"], capture_output=True, text=True, timeout=5
    856         )
    857         claude_version = result.stdout.strip() or result.stderr.strip()
    858     except Exception:
    859         pass
    860 
    861     print("=" * 40)
    862     print("Loop Benchmarking Harness")
    863     print("=" * 40)
    864     print(f"Grid file:  {grid_file}")
    865     print(f"Profile:    {profile}")
    866     print(f"Provider:   {provider_filter}")
    867     print(f"Parallel:   {parallel}")
    868     print(f"Results:    {results_dir}")
    869     print("=" * 40)
    870 
    871     # --provider is required
    872     if not provider_filter:
    873         print("ERROR: --provider is required. Use --provider anthropic or --provider zai")
    874         sys.exit(1)
    875 
    876     grid = load_grid(grid_file)
    877     providers_config = grid.get("providers", {})
    878 
    879     if provider_filter not in providers_config and provider_filter not in [v for spec in grid["axes"].values() for v in spec.get("values", [])]:
    880         valid = grid["axes"].get("provider", {}).get("values", [])
    881         print(f"ERROR: unknown provider '{provider_filter}'. Valid: {valid}")
    882         sys.exit(1)
    883 
    884     # Build baseline override from --model flag
    885     # For non-anthropic providers, accept actual model names (e.g., glm-4.5-air)
    886     # and reverse-map to the Claude arg (e.g., haiku)
    887     baseline = None
    888     if baseline_model:
    889         axes = {name: spec["values"] for name, spec in grid["axes"].items()}
    890         baseline = {name: values[0] for name, values in axes.items()}
    891         baseline["model"] = baseline_model
    892         if provider_filter:
    893             baseline["provider"] = provider_filter
    894 
    895     # Determine cell generation strategy
    896     if profile == "main_effects":
    897         cells = main_effects_plan(grid, baseline=baseline)
    898         print(f"Design:     main effects sweep" + (f" (baseline model: {baseline_model})" if baseline_model else ""))
    899     elif profile == "plackett_burman":
    900         cells = plackett_burman_plan(grid)
    901         print(f"Design:     Plackett-Burman screening")
    902     elif profile.startswith("interaction_hunt:"):
    903         top_axes = profile.split(":", 1)[1].split(",")
    904         cells = interaction_hunt_plan(grid, top_axes)
    905         print(f"Design:     interaction hunt on {top_axes}")
    906     else:
    907         cells = compute_cells(grid, profile)
    908         print(f"Profile:    {profile}")
    909 
    910     # Filter cells to requested provider
    911     cells = [c for c in cells if c.get("provider", "anthropic") == provider_filter]
    912 
    913     # Build the full list of (cell, run_num) jobs
    914     jobs = []
    915     for cell in cells:
    916         runs_per_cell = override_runs_per_cell or cell.get("runs_per_cell", 3)
    917         for run_num in range(1, runs_per_cell + 1):
    918             jobs.append((cell, run_num))
    919 
    920     if max_runs and len(jobs) > max_runs:
    921         jobs = jobs[:max_runs]
    922         print(f"Total jobs:  {len(jobs)} (limited by -n {max_runs})")
    923     else:
    924         print(f"Total jobs:  {len(jobs)}")
    925     print()
    926 
    927     # Periodic commit helper
    928     _last_commit_count = [0]  # mutable for closure
    929 
    930     def periodic_commit(completed_so_far):
    931         if not commit_every or completed_so_far - _last_commit_count[0] < commit_every:
    932             return
    933         _last_commit_count[0] = completed_so_far
    934         log(f"  --- Checkpoint: analyzing and pushing {completed_so_far} completed runs ---")
    935         # Run analysis
    936         analysis_dir = results_dir / "analysis"
    937         analysis_dir.mkdir(exist_ok=True)
    938         for metric in ["score", "cost", "turns", "wall_time", "gameplay", "sonarqube", "code_quality", "structural", "transcript", "build_quality"]:
    939             try:
    940                 effects = analyze_main_effects(str(results_dir), metric)
    941                 (analysis_dir / f"main_effects_{metric}.json").write_text(json.dumps(effects, indent=2))
    942             except Exception:
    943                 pass
    944         # Commit and push
    945         try:
    946             subprocess.run(["git", "add", "-A", "results/", "artifacts/"], cwd=str(PROJECT_DIR), capture_output=True, timeout=30)
    947             total_runs = len(list((results_dir / "runs").iterdir()))
    948             msg = f"Checkpoint: {completed_so_far} runs ({total_runs} total)"
    949             subprocess.run(["git", "commit", "-m", msg], cwd=str(PROJECT_DIR), capture_output=True, timeout=30)
    950             subprocess.run(["git", "push"], cwd=str(PROJECT_DIR), capture_output=True, timeout=60)
    951             log(f"  --- Pushed checkpoint ---")
    952         except Exception as e:
    953             log(f"  --- Checkpoint push failed: {e} ---")
    954 
    955     # Start auth keepalive in background (refreshes OAuth token every 5 min)
    956     auth_keepalive = subprocess.Popen(
    957         ["bash", str(SCRIPT_DIR / "lib" / "keep-auth-alive.sh"), "300"],
    958         stdout=subprocess.DEVNULL,
    959         stderr=subprocess.DEVNULL,
    960     )
    961     print(f"Auth keepalive started (PID {auth_keepalive.pid})")
    962 
    963     completed = 0
    964     skipped = 0
    965     failed = 0
    966 
    967     if parallel <= 1:
    968         # Sequential
    969         for cell, run_num in jobs:
    970             result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config)
    971             if result == "completed":
    972                 completed += 1
    973             elif result == "skipped":
    974                 skipped += 1
    975             else:
    976                 failed += 1
    977             periodic_commit(completed)
    978     else:
    979         # Parallel with rolling concurrency
    980         with ThreadPoolExecutor(max_workers=parallel) as executor:
    981             futures = {
    982                 executor.submit(
    983                     run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config
    984                 ): (cell, run_num)
    985                 for cell, run_num in jobs
    986             }
    987 
    988             for future in as_completed(futures):
    989                 try:
    990                     result = future.result()
    991                 except Exception as e:
    992                     log(f"  ERROR: {e}")
    993                     result = "failed"
    994 
    995                 with _counter_lock:
    996                     if result == "completed":
    997                         completed += 1
    998                     elif result == "skipped":
    999                         skipped += 1
   1000                     else:
   1001                         failed += 1
   1002 
   1003                     total_done = completed + skipped + failed
   1004                     log(f"  Progress: {total_done}/{len(jobs)} ({completed} completed, {skipped} skipped, {failed} failed)")
   1005                     periodic_commit(completed)
   1006 
   1007     # Stop auth keepalive
   1008     auth_keepalive.terminate()
   1009     auth_keepalive.wait()
   1010 
   1011     print()
   1012     print("=" * 40)
   1013     print("All runs complete.")
   1014     print(f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}")
   1015     print("=" * 40)
   1016 
   1017     # Re-evaluate all runs with latest eval scripts (only when explicitly requested)
   1018     if do_reeval:
   1019         print()
   1020         print("Re-evaluating ALL runs with latest eval scripts...")
   1021         reeval_result = subprocess.run(
   1022             ["python3", str(SCRIPT_DIR / "reeval.py"), str(results_dir), "-j", str(max(parallel, 4))],
   1023             cwd=str(PROJECT_DIR),
   1024         )
   1025         if reeval_result.returncode == 0:
   1026             print("Re-evaluation complete.")
   1027         else:
   1028             print("Re-evaluation had errors (continuing).")
   1029 
   1030     # Run analysis and save results
   1031     if do_analyze or completed > 0:
   1032         print()
   1033         print("Running analysis...")
   1034         analysis_dir = results_dir / "analysis"
   1035         analysis_dir.mkdir(exist_ok=True)
   1036 
   1037         metrics = ["score", "cost", "turns", "wall_time", "gameplay", "code_quality"]
   1038         for metric in metrics:
   1039             try:
   1040                 effects = analyze_main_effects(str(results_dir), metric)
   1041                 (analysis_dir / f"main_effects_{metric}.json").write_text(
   1042                     json.dumps(effects, indent=2)
   1043                 )
   1044                 print(f"  Saved main_effects_{metric}.json")
   1045             except Exception as e:
   1046                 print(f"  Error analyzing {metric}: {e}")
   1047 
   1048     # Auto-commit and push results
   1049     if completed > 0:
   1050         print()
   1051         print("Committing results...")
   1052         try:
   1053             subprocess.run(
   1054                 ["git", "add", "-A", "results/", "artifacts/"],
   1055                 cwd=str(PROJECT_DIR), capture_output=True, timeout=30,
   1056             )
   1057             total_runs = len(list((results_dir / "runs").iterdir()))
   1058             msg = (
   1059                 f"Add {completed} new runs ({total_runs} total)\n\n"
   1060                 f"Profile: {profile}\n"
   1061                 f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}"
   1062             )
   1063             subprocess.run(
   1064                 ["git", "commit", "-m", msg],
   1065                 cwd=str(PROJECT_DIR), capture_output=True,
   1066             )
   1067             result = subprocess.run(
   1068                 ["git", "push"],
   1069                 cwd=str(PROJECT_DIR), capture_output=True, text=True,
   1070             )
   1071             if result.returncode == 0:
   1072                 print("Results committed and pushed.")
   1073             else:
   1074                 print(f"Push failed: {result.stderr.strip()}")
   1075                 print("Results committed locally. Push manually with: git push")
   1076         except Exception as e:
   1077             print(f"Auto-commit failed: {e}")
   1078             print("Commit manually with: git add -A results/ artifacts/ && git commit && git push")
   1079 
   1080 
# Run the harness only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README