run.py (46826B)
1 #!/usr/bin/env python3 2 """Loop Benchmarking Harness - Main orchestrator. 3 4 Computes the experiment grid, creates isolated workspaces, invokes claude, 5 runs evaluation, and stores results. 6 7 Usage: 8 python3 run.py [grid_file] [profile_or_design] [-j N] 9 10 profile_or_design can be: 11 - A profile name from grid.yaml (e.g., smoke, core, full) 12 - A DOE design: main_effects, plackett_burman 13 - interaction_hunt:axis1,axis2,axis3 14 15 -j N: run N experiments in parallel (default 1) 16 --model MODEL: set baseline model for main_effects sweep 17 --provider PROVIDER: required -- anthropic or zai 18 --reeval: re-evaluate all existing runs with latest eval scripts 19 --analyze: run analysis and save results to results/analysis/ 20 --full-pipeline: reeval + analyze after sweep completes 21 """ 22 23 import hashlib 24 import json 25 import os 26 import shlex 27 import signal 28 import shutil 29 import subprocess 30 import sys 31 import tarfile 32 import tempfile 33 import threading 34 import time 35 from concurrent.futures import ThreadPoolExecutor, as_completed 36 from datetime import datetime, timezone 37 from pathlib import Path 38 39 SCRIPT_DIR = Path(__file__).resolve().parent 40 PROJECT_DIR = SCRIPT_DIR.parent 41 sys.path.insert(0, str(SCRIPT_DIR / "lib")) 42 43 from compute_grid import load_grid, compute_cells 44 from experiment_design import ( 45 main_effects_plan, 46 plackett_burman_plan, 47 interaction_hunt_plan, 48 analyze_main_effects, 49 ) 50 51 # Prompt snippets appended based on axis values. Order matters. 
52 PROMPT_SNIPPET_ORDER = [ 53 "language", "renderer", "architecture", "design_guidance", 54 "tests_provided", "strategy", "error_checking", "playwright", 55 ] 56 57 PROMPT_SNIPPETS = { 58 # language (existing, moved from build_prompt body) 59 ("language", "typescript"): "Use TypeScript.", 60 ("language", "javascript"): "Use JavaScript (no TypeScript).", 61 62 # renderer 63 ("renderer", "canvas"): "Render the game using HTML5 Canvas.", 64 ("renderer", "svg"): "Render the game using SVG elements.", 65 ("renderer", "dom"): "Render the game using DOM elements (divs/tables) with CSS for styling. Do not use Canvas or SVG.", 66 ("renderer", "webgl"): "Render the game using WebGL.", 67 68 # architecture 69 ("architecture", "separation"): "Use separate files for every concern: game logic, rendering, input handling, scoring, and UI. No file should be longer than 150 lines.", 70 ("architecture", "best_practices"): "Follow software engineering best practices throughout: SOLID principles, clean code, proper error handling, no magic numbers, meaningful variable names, and JSDoc comments on public functions.", 71 72 # design_guidance 73 ("design_guidance", "vague"): "Make it look visually polished and professional. The game should feel like a finished product, not a prototype.", 74 ("design_guidance", "specific"): "Follow this design specification:\n- Dark background (#1a1a2e) with a subtle grid pattern\n- Neon-colored pieces with subtle glow effects (I=cyan, O=yellow, T=purple, S=green, Z=red, J=blue, L=orange)\n- Smooth drop and line-clear animations\n- Side panel with score, level, lines cleared, and next-piece preview\n- Use a monospace font for all numeric displays\n- Center the game board on the page with comfortable margins\n- Visible cell borders within the grid", 75 76 # tests_provided 77 ("tests_provided", "a_few"): "I have included a few Playwright tests in the tests/ directory. Your implementation should pass them. 
Do not modify the test files.", 78 ("tests_provided", "many"): "I have included a comprehensive Playwright test suite in the tests/ directory. Your implementation should pass all of them. Do not modify the test files.", 79 80 # strategy 81 ("strategy", "plan_first"): "Before writing any code, explore the requirements thoroughly and create a detailed implementation plan. Think through the architecture, data structures, and edge cases. Only start coding once you have a clear plan.", 82 ("strategy", "iterate"): "Build incrementally. Start with the simplest possible working version, then add features one at a time. After each change, test it to make sure everything still works. Do not stop iterating until the game is polished and complete.", 83 ("strategy", "creative_validate"): "Think outside the box for ways to validate your implementation works flawlessly. Do not just assume it works. Find creative, unusual ways to verify correctness, test edge cases, and stress-test the game.", 84 ("strategy", "use_subagents"): "Make use of sub-agents to help you work more effectively.", 85 ("strategy", "delegate"): "Use sub-agents strategically: spawn a research agent to explore requirements and plan the architecture, a separate implementation agent for the core game logic, and a testing agent to verify the result. Coordinate their work.", 86 ("strategy", "review"): "After building the game, spawn a separate sub-agent to thoroughly review your code for bugs, edge cases, and quality issues. Then fix every issue the reviewer finds.", 87 ("strategy", "split_work"): "Divide the work among specialized sub-agents: one agent builds the UI and rendering, another builds the game logic and state management. Before they start, a third agent defines the interfaces between their components. After both are done, the third agent integrates and tests the final product.", 88 89 # error_checking 90 ("error_checking", "self_verify"): "After building the game, run it yourself. 
Open the page, check the browser console for errors, verify that pieces move and rotate correctly, test that lines clear and the score updates, and fix any issues you find. Do not declare the task complete until you have verified the game works end-to-end.", 91 92 # playwright (only "instructed" adds a snippet) 93 ("playwright", "instructed"): "Use Playwright to test your work as you build it. Write and run tests frequently to catch issues early.", 94 } 95 96 PROMPT_SNIPPETS_ES = { 97 ("language", "typescript"): "Usa TypeScript.", 98 ("language", "javascript"): "Usa JavaScript (sin TypeScript).", 99 ("renderer", "canvas"): "Renderiza el juego usando HTML5 Canvas.", 100 ("renderer", "svg"): "Renderiza el juego usando elementos SVG.", 101 ("renderer", "dom"): "Renderiza el juego usando elementos DOM (divs/tablas) con CSS. No uses Canvas ni SVG.", 102 ("renderer", "webgl"): "Renderiza el juego usando WebGL.", 103 ("architecture", "separation"): "Usa archivos separados para cada responsabilidad: logica del juego, renderizado, manejo de entrada, puntuacion e interfaz. Ningun archivo debe superar las 150 lineas.", 104 ("architecture", "best_practices"): "Sigue las mejores practicas de ingenieria de software: principios SOLID, codigo limpio, manejo adecuado de errores, sin numeros magicos, nombres significativos y comentarios JSDoc en funciones publicas.", 105 ("design_guidance", "vague"): "Hazlo visualmente pulido y profesional. 
El juego debe sentirse como un producto terminado, no un prototipo.", 106 ("design_guidance", "specific"): "Sigue esta especificacion de diseno:\n- Fondo oscuro (#1a1a2e) con un patron de cuadricula sutil\n- Piezas de colores neon con efectos de brillo sutil (I=cyan, O=amarillo, T=morado, S=verde, Z=rojo, J=azul, L=naranja)\n- Animaciones suaves de caida y limpieza de lineas\n- Panel lateral con puntuacion, nivel, lineas eliminadas y vista previa de la siguiente pieza\n- Usa una fuente monoespaciada para todas las visualizaciones numericas\n- Centra el tablero en la pagina con margenes comodos\n- Bordes de celda visibles dentro de la cuadricula", 107 ("tests_provided", "a_few"): "He incluido algunas pruebas de Playwright en el directorio tests/. Tu implementacion debe pasarlas. No modifiques los archivos de prueba.", 108 ("tests_provided", "many"): "He incluido una suite completa de pruebas Playwright en el directorio tests/. Tu implementacion debe pasar todas. No modifiques los archivos de prueba.", 109 ("strategy", "plan_first"): "Antes de escribir codigo, explora los requisitos a fondo y crea un plan de implementacion detallado. Piensa en la arquitectura, estructuras de datos y casos extremos. Solo comienza a codificar cuando tengas un plan claro.", 110 ("strategy", "iterate"): "Construye incrementalmente. Comienza con la version funcional mas simple posible, luego agrega funciones una por una. Despues de cada cambio, prueba para asegurarte de que todo sigue funcionando. No dejes de iterar hasta que el juego este pulido y completo.", 111 ("strategy", "creative_validate"): "Piensa de forma creativa en maneras de validar que tu implementacion funciona perfectamente. No asumas que funciona. 
Encuentra formas creativas e inusuales de verificar la correctitud, probar casos extremos y someter el juego a pruebas de estres.", 112 ("strategy", "use_subagents"): "Usa sub-agentes para ayudarte a trabajar de manera mas efectiva.", 113 ("strategy", "delegate"): "Usa sub-agentes estrategicamente: lanza un agente de investigacion para explorar requisitos y planificar la arquitectura, un agente de implementacion separado para la logica del juego, y un agente de pruebas para verificar el resultado. Coordina su trabajo.", 114 ("strategy", "review"): "Despues de construir el juego, lanza un sub-agente separado para revisar tu codigo en busca de errores, casos extremos y problemas de calidad. Luego corrige cada problema que encuentre el revisor.", 115 ("strategy", "split_work"): "Divide el trabajo entre sub-agentes especializados: uno construye la interfaz y el renderizado, otro construye la logica del juego y la gestion de estado. Antes de que comiencen, un tercer agente define las interfaces entre sus componentes. Despues de que ambos terminen, el tercer agente integra y prueba el producto final.", 116 ("error_checking", "self_verify"): "Despues de construir el juego, ejecutalo tu mismo. Abre la pagina, revisa la consola del navegador en busca de errores, verifica que las piezas se muevan y roten correctamente, prueba que las lineas se limpien y la puntuacion se actualice, y corrige cualquier problema que encuentres. No declares la tarea completa hasta que hayas verificado que el juego funciona de principio a fin.", 117 ("playwright", "instructed"): "Usa Playwright para probar tu trabajo mientras lo construyes. 
# Strategy values that enable the Agent tool
STRATEGIES_WITH_AGENTS = {"use_subagents", "delegate", "review", "compete", "split_work"}


def _run_setup_command(cmd: list, workspace: Path) -> None:
    """Run one npm/npx setup command inside *workspace*.

    Setup stays best-effort (a failed install does not abort the run), but a
    non-zero exit is reported on stderr instead of being silently dropped.
    """
    result = subprocess.run(
        cmd, cwd=workspace, capture_output=True, text=True, errors="replace",
    )
    if result.returncode != 0:
        detail = (result.stderr or result.stdout or "").strip()[-500:]
        print(
            f"  WARN: `{' '.join(cmd)}` exited {result.returncode} in {workspace}: {detail}",
            file=sys.stderr, flush=True,
        )


def _populate_workspace(workspace: Path, project_dir: Path, task: str, cell: dict) -> None:
    """Install tooling and copy fixtures into an already-created workspace."""
    language = cell.get("language", "typescript")
    linter = cell.get("linter", "off")
    playwright = cell.get("playwright", "off")

    setup_commands = [["npm", "init", "-y"]]
    # TypeScript is only pre-installed when explicitly requested, not for "unspecified".
    if language == "typescript":
        setup_commands.append(["npm", "install", "--save-dev", "typescript", "@types/node"])
    if linter == "on":
        setup_commands.append(["npm", "install", "--save-dev", "eslint", "@eslint/js"])
    if playwright in ("on", "available", "instructed"):
        setup_commands.append(["npm", "install", "--save-dev", "@playwright/test"])
        setup_commands.append(["npx", "playwright", "install", "chromium", "--with-deps"])
    for cmd in setup_commands:
        _run_setup_command(cmd, workspace)

    # Copy every fixture into the workspace root.
    # NOTE(review): this also copies the tests-few/ and tests-full/ variant
    # directories into every workspace, even when tests_provided is "none" -
    # confirm that exposing them to the agent is intended.
    fixtures_dir = project_dir / "tasks" / task / "fixtures"
    if fixtures_dir.is_dir():
        for item in fixtures_dir.iterdir():
            dest = workspace / item.name
            if item.is_dir():
                shutil.copytree(item, dest, dirs_exist_ok=True)
            else:
                shutil.copy2(item, dest)

    # Expose the selected test variant as tests/ when tests are provided.
    tests_provided = cell.get("tests_provided", "none")
    if tests_provided != "none":
        variant = "tests-few" if tests_provided == "a_few" else "tests-full"
        tests_src = fixtures_dir / variant
        if tests_src.is_dir():
            # dirs_exist_ok: a fixture may already have created tests/.
            shutil.copytree(tests_src, workspace / "tests", dirs_exist_ok=True)


def create_workspace(project_dir: Path, task: str, cell: dict) -> Path:
    """Create an isolated temp directory with appropriate setup.

    Runs ``npm init`` and installs TypeScript / ESLint / Playwright according
    to the cell's ``language``, ``linter`` and ``playwright`` values, then
    copies the task fixtures (and the selected test variant) into it.

    Args:
        project_dir: Repository root containing ``tasks/<task>/fixtures``.
        task: Task name (directory under ``tasks/``).
        cell: Experiment cell; reads ``language``, ``linter``, ``playwright``
            and ``tests_provided``.

    Returns:
        Path of the new workspace directory. The caller owns its deletion.

    Raises:
        OSError: If npm/npx cannot be started or a fixture copy fails. The
            partially built workspace is removed before re-raising.
    """
    workspace = Path(tempfile.mkdtemp(prefix="loop-bench-"))
    try:
        _populate_workspace(workspace, project_dir, task, cell)
    except BaseException:
        # Do not leak a half-built temp directory when setup fails.
        shutil.rmtree(workspace, ignore_errors=True)
        raise
    return workspace
def build_prompt(project_dir: Path, cell: dict) -> str:
    """Read the base prompt file and append axis-specific snippets.

    The base prompt is ``tasks/<task>/prompts/<prompt_style>.<human_language>.md``.
    For each axis in ``PROMPT_SNIPPET_ORDER`` whose value has an entry in the
    language-appropriate snippet table, that snippet is appended, separated by
    a blank line. When ``context_noise`` is not ``"clean"``, the noise text is
    prepended (not appended) in front of the real task.

    Args:
        project_dir: Repository root containing ``tasks/``.
        cell: Experiment cell; ``task``, ``prompt_style`` and
            ``human_language`` are required.

    Returns:
        The complete prompt text.

    Raises:
        KeyError: If a required cell key is missing.
        OSError: If the base prompt file cannot be read.
    """
    task = cell["task"]
    style = cell["prompt_style"]
    lang_code = cell["human_language"]
    task_dir = project_dir / "tasks" / task

    # Decode explicitly as UTF-8: prompt files (notably the Spanish ones) must
    # not depend on the locale's default encoding.
    base_prompt = (task_dir / "prompts" / f"{style}.{lang_code}.md").read_text(encoding="utf-8")

    # Select language-appropriate snippets
    snippets = PROMPT_SNIPPETS_ES if lang_code == "es" else PROMPT_SNIPPETS

    parts = [base_prompt]
    for axis in PROMPT_SNIPPET_ORDER:
        value = cell.get(axis)
        if value is None:
            continue
        snippet = snippets.get((axis, value))
        if snippet:
            parts.append(snippet)
    prompt = "\n\n".join(parts)

    # Context noise is prepended so the real task comes last.
    noise_type = cell.get("context_noise", "clean")
    if noise_type != "clean":
        noise_file = task_dir / "noise" / f"{noise_type}.txt"
        if noise_file.exists():
            noise_text = noise_file.read_text(encoding="utf-8")
            prompt = noise_text + "\n\n---\n\nNow for your actual task:\n\n" + prompt
        else:
            # Still run, but make it visible that this cell is effectively "clean".
            print(
                f"  WARN: noise file {noise_file} not found; prompt built without noise",
                file=sys.stderr, flush=True,
            )

    return prompt
def _claude_tools(cell: dict) -> list:
    """Return the tool names to allow for this cell, in CLI order."""
    # Bash is always available - it's the agent's escape hatch.
    tools = ["Bash"]
    default_on_axes = (
        ("tool_read", "Read"),
        ("tool_write", "Write"),
        ("tool_edit", "Edit"),
        ("tool_glob", "Glob"),
        ("tool_grep", "Grep"),
    )
    for axis, tool in default_on_axes:
        if cell.get(axis, "on") == "on":
            tools.append(tool)
    if cell.get("sub_agents") == "on":
        tools.append("Agent")
    if cell.get("web_search") == "on":
        tools.extend(["WebSearch", "WebFetch"])
    return tools


def _claude_env(provider_config: dict) -> dict:
    """Return the child environment with provider-specific overrides applied."""
    run_env = os.environ.copy()
    if not provider_config:
        run_env.pop("ANTHROPIC_BASE_URL", None)
        return run_env

    if provider_config.get("base_url"):
        run_env["ANTHROPIC_BASE_URL"] = provider_config["base_url"]
    else:
        run_env.pop("ANTHROPIC_BASE_URL", None)

    if provider_config.get("auth_token"):
        run_env["ANTHROPIC_AUTH_TOKEN"] = provider_config["auth_token"]
    elif provider_config.get("api_key_env"):
        key = os.environ.get(provider_config["api_key_env"])
        if key:
            run_env["ANTHROPIC_AUTH_TOKEN"] = key
    return run_env


def _write_private_temp(text: str, prefix: str, suffix: str) -> Path:
    """Atomically create a private temp file containing *text*; return its path."""
    fd, name = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        handle.write(text)
    return Path(name)


def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path, provider_config: dict = None) -> int:
    """Invoke claude CLI and capture output.

    Writes ``transcript.jsonl`` (two harness header events, an optional
    context event, then claude's stream-json stdout), ``claude_stderr.log``
    and ``claude_output.json`` (the last transcript line) into *run_dir*.

    Args:
        cell: Experiment cell (model, effort, budget, timeout, tool axes, ...).
        workspace: Directory claude runs in.
        run_dir: Directory receiving the transcript and logs.
        project_dir: Repository root containing ``tasks/``.
        provider_config: Optional provider settings (``cli_model_map``,
            ``base_url``, ``auth_token``, ``api_key_env``).

    Returns:
        claude's exit code, or 124 on timeout (same as timeout(1) convention).
    """
    prompt = build_prompt(project_dir, cell)
    model = cell["model"]
    # Map display model name to CLI arg (e.g., "sonnet-4.6" -> "sonnet")
    cli_model = (provider_config or {}).get("cli_model_map", {}).get(model, model)
    effort = cell.get("effort", "high")
    budget = cell.get("max_budget_usd", 0.50)
    timeout = cell.get("timeout_seconds", 600)
    tools_list = _claude_tools(cell)

    # Auth helper only for anthropic (reads OAuth from ~/.claude/.credentials.json).
    # Non-anthropic providers authenticate via ANTHROPIC_AUTH_TOKEN env var;
    # apiKeyHelper would override that env var and send the wrong credential.
    settings = {}
    if not provider_config or not provider_config.get("base_url"):
        settings["apiKeyHelper"] = str(SCRIPT_DIR / "lib" / "get-oauth-token.sh")

    # Build base command (prompt added separately for large prompts)
    cmd_base = [
        "claude",
        "--bare",
        "--model", cli_model,
        "--output-format", "stream-json",
        "--verbose",
        "--permission-mode", "dontAsk",
        "--max-budget-usd", str(budget),
        "--allowedTools", ",".join(tools_list),
        "--settings", json.dumps(settings),
    ]
    if effort:
        cmd_base.extend(["--effort", effort])

    # Context file: read once, used for both the CLI flag and the transcript.
    context_text = None
    if cell.get("context_file") == "provided":
        ctx_file = project_dir / "tasks" / cell["task"] / "context.md"
        if ctx_file.exists():
            context_text = ctx_file.read_text(encoding="utf-8")
            cmd_base.extend(["--append-system-prompt", context_text])

    transcript_path = run_dir / "transcript.jsonl"
    stderr_path = run_dir / "claude_stderr.log"
    temp_files = []
    try:
        # For large prompts (noise cells), write the prompt to a temp file and
        # load it through a shell wrapper script.
        # NOTE(review): the prompt is still passed to claude as a single argv
        # entry, so the OS per-argument size limit still applies - confirm this
        # wrapper actually helps for very large prompts.
        if len(prompt) > 100000:
            prompt_file = _write_private_temp(prompt, prefix="prompt-", suffix=".txt")
            temp_files.append(prompt_file)
            wrapper_text = "\n".join([
                "#!/bin/bash",
                f"PROMPT=$(cat {shlex.quote(str(prompt_file))})",
                # exec: claude replaces bash, so a timeout kill reaches claude
                # itself instead of orphaning it behind a dead wrapper.
                "exec " + " ".join(shlex.quote(c) for c in cmd_base) + ' -p "$PROMPT"',
            ])
            wrapper = _write_private_temp(wrapper_text, prefix="run-claude-", suffix=".sh")
            temp_files.append(wrapper)
            cmd = ["bash", str(wrapper)]
        else:
            cmd = [*cmd_base, "-p", prompt]

        # Inject harness metadata, prompt, and context as the first transcript entries
        header_events = [
            {
                "type": "harness",
                "subtype": "config",
                "model": model,
                "effort": effort,
                "tools": tools_list,
                "max_budget_usd": budget,
                "timeout_seconds": timeout,
                "task": cell["task"],
                "language": cell.get("language"),
                "prompt_style": cell.get("prompt_style"),
            },
            {
                "type": "user",
                "subtype": "prompt",
                "message": {"role": "user", "content": prompt},
            },
        ]
        if context_text is not None:
            header_events.append({
                "type": "user",
                "subtype": "context",
                "message": {"role": "system", "content": context_text},
            })
        with open(transcript_path, "w", encoding="utf-8") as transcript_f:
            for event in header_events:
                transcript_f.write(json.dumps(event) + "\n")

        run_env = _claude_env(provider_config)

        with open(transcript_path, "a") as transcript_f, open(stderr_path, "w") as stderr_f:
            try:
                result = subprocess.run(
                    cmd,
                    cwd=workspace,
                    stdout=transcript_f,
                    stderr=stderr_f,
                    timeout=timeout,
                    env=run_env,
                )
                exit_code = result.returncode
            except subprocess.TimeoutExpired:
                exit_code = 124  # Same as timeout(1) convention
    finally:
        # The prompt/wrapper temp files are only needed while claude runs.
        for temp_file in temp_files:
            try:
                temp_file.unlink()
            except OSError:
                pass

    # Extract final result line (the last line claude streamed, if any).
    output_path = run_dir / "claude_output.json"
    try:
        transcript_text = transcript_path.read_text(encoding="utf-8", errors="replace")
        output_path.write_text(transcript_text.strip().split("\n")[-1], encoding="utf-8")
    except OSError:
        output_path.write_text("{}")

    return exit_code
def run_eval_script(script: Path, workspace: Path, language: str) -> str:
    """Run a bash eval script and return its stdout.

    Args:
        script: Path of the bash script to run.
        workspace: Workspace directory, passed as the script's first argument.
        language: Language name, passed as the script's second argument.

    Returns:
        The script's stripped stdout. If the script cannot be run, times out
        (120 s), or exits non-zero without printing anything, a JSON error
        object (``{"pass": false, "error": ...}``) is returned instead so the
        failure reason is not lost.
    """
    try:
        result = subprocess.run(
            ["bash", str(script), str(workspace), language],
            capture_output=True, text=True, timeout=120,
        )
    except Exception as e:
        # Boundary: any launch/timeout/decoding failure becomes an error record.
        return json.dumps({"pass": False, "error": str(e)})

    stdout = result.stdout.strip()
    if not stdout and result.returncode != 0:
        # Keep the tail of stderr so the failure is diagnosable.
        detail = (result.stderr or "").strip()[-500:]
        return json.dumps({"pass": False, "error": f"exit {result.returncode}: {detail}"})
    return stdout


def safe_parse_json(text: str, fallback_key: str = "error") -> dict:
    """Parse JSON, returning an error dict if parsing fails.

    Args:
        text: Raw output expected to hold a single JSON object.
        fallback_key: Key under which the failure description is stored in
            the error dict (defaults to ``"error"``).

    Returns:
        The parsed object when *text* is a JSON object. Otherwise
        ``{"pass": False, fallback_key: <reason>}``, where the reason is
        ``"no output"`` for empty text or the first 500 characters of *text*
        when it is not valid JSON or is valid JSON that is not an object
        (callers rely on dict methods such as ``.get``).
    """
    if not text:
        return {"pass": False, fallback_key: "no output"}
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {"pass": False, fallback_key: text[:500]}
    if not isinstance(parsed, dict):
        return {"pass": False, fallback_key: text[:500]}
    return parsed
_GAMEPLAY_BOT_TIMEOUT_SECONDS = 900


def _terminate_process_group(proc) -> None:
    """Best-effort SIGTERM to the whole process group started for *proc*."""
    try:
        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
    except (OSError, AttributeError):
        # Group already gone, or killpg is unavailable on this platform.
        pass


def _summarize_bot_report(report_path: Path, timed_out: bool = False) -> dict:
    """Convert the gameplay bot's JSON report into a result entry."""
    report_data = json.loads(report_path.read_text())
    summary = report_data.get("summary", {})
    entry = {
        "pass": summary.get("failed", 1) == 0,
        "score": summary.get("score", 0),
        "total": summary.get("total", 0),
        "passed": summary.get("passed", 0),
        "failed": summary.get("failed", 0),
        "report": report_data,
    }
    if timed_out:
        entry["timed_out"] = True
    return entry


def _run_python_eval(script: Path, args: list, timeout: int, timeout_message: str = None) -> dict:
    """Run a python3 eval script and parse its stdout as a JSON result."""
    try:
        result = subprocess.run(
            ["python3", str(script), *args],
            capture_output=True, text=True, timeout=timeout,
        )
        return safe_parse_json(result.stdout.strip())
    except subprocess.TimeoutExpired as e:
        return {"error": timeout_message or str(e), "score": 0}
    except Exception as e:
        return {"error": str(e), "score": 0}


def _run_gameplay_bot(bot_entry: Path, workspace: Path, run_dir: Path) -> dict:
    """Run the Playwright gameplay bot against the workspace and summarize it."""
    # Pre-check: is there an HTML file to test (outside node_modules)?
    has_html = any(
        "node_modules" not in html.relative_to(workspace).parts
        for html in workspace.rglob("*.html")
    )
    if not has_html:
        return {
            "pass": False,
            "score": 0,
            "error": "no HTML files in workspace - game was not built",
        }

    report_path = run_dir / "gameplay-bot-report.json"
    playwright_config = bot_entry.parent / "playwright.config.ts"
    try:
        # Drop any report left by an earlier evaluation of this run so a bot
        # that dies before writing cannot be scored from stale data.
        report_path.unlink(missing_ok=True)

        bot_env = os.environ.copy()
        bot_env["WORKSPACE_PATH"] = str(workspace)
        bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
        bot_proc = subprocess.Popen(
            ["npx", "playwright", "test", "--config", str(playwright_config)],
            cwd=str(PROJECT_DIR),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=bot_env,
            start_new_session=True,  # own process group, so children can be killed too
        )
        try:
            stderr = ""
            timed_out = False
            try:
                _, stderr = bot_proc.communicate(timeout=_GAMEPLAY_BOT_TIMEOUT_SECONDS)
            except subprocess.TimeoutExpired:
                timed_out = True
                # Kill entire process group (playwright + child serve processes)
                _terminate_process_group(bot_proc)
                bot_proc.kill()
                bot_proc.wait()

            # A report written before a timeout is still used, flagged as timed out.
            if report_path.exists():
                return _summarize_bot_report(report_path, timed_out=timed_out)
            if timed_out:
                return {
                    "pass": False,
                    "score": 0,
                    "error": f"Gameplay bot timed out after {_GAMEPLAY_BOT_TIMEOUT_SECONDS} seconds",
                }
            return {
                "pass": False,
                "score": 0,
                "error": f"Report file not created. Exit code: {bot_proc.returncode}. "
                         f"stderr: {stderr[:1000] if stderr else ''}",
            }
        finally:
            # Always clean up the process group to prevent orphaned serve processes
            _terminate_process_group(bot_proc)
    except FileNotFoundError:
        return {
            "pass": False,
            "score": 0,
            "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
        }
    except Exception as e:
        return {"pass": False, "score": 0, "error": str(e)}


def _compute_outcome_score(task_dir: Path, results: dict, run_label: str) -> None:
    """Fill ``outcome_score``/``score`` in *results* from scoring.yaml weights."""
    scoring_file = task_dir / "scoring.yaml"
    if not scoring_file.exists():
        return
    try:
        import yaml
        scoring = yaml.safe_load(scoring_file.read_text()) or {}
        outcome_weights = scoring.get("outcome_weights", {})

        score = 0.0
        total_weight = 0.0
        for category, weight in outcome_weights.items():
            cat_data = results.get(category)
            # Only categories that produced a numeric score contribute.
            if isinstance(cat_data, dict) and isinstance(cat_data.get("score"), (int, float)):
                score += cat_data["score"] * weight
                total_weight += weight

        results["outcome_score"] = round(score / total_weight, 4) if total_weight > 0 else 0
        # Alias so existing code that reads "score" keeps working
        results["score"] = results["outcome_score"]
    except Exception as e:
        # Scoring stays best-effort, but the reason is no longer hidden.
        print(f"  WARN: could not compute outcome score for {run_label}: {e}",
              file=sys.stderr, flush=True)


def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
    """Run all evaluation scripts and write eval_results.json.

    Each evaluator under ``task_dir/eval`` is optional and runs only if its
    script exists: ``structural.sh``, ``quality.sh``, ``code-analysis.py``
    (falling back to ``code-analysis.sh``), ``transcript-analysis.py``, the
    gameplay bot (V2 preferred over V1) and ``sonarqube-scan.py``. If
    ``task_dir/scoring.yaml`` exists, a weighted ``outcome_score`` is computed
    from its ``outcome_weights`` and aliased as ``score``.

    Args:
        task_dir: Task directory containing ``eval/`` and ``scoring.yaml``.
        workspace: Directory holding the agent's output.
        cell: Experiment cell; only ``language`` is read.
        run_dir: Run directory; receives ``eval_results.json`` and the
            gameplay bot report.
    """
    language = cell.get("language", "typescript")
    eval_dir = task_dir / "eval"

    results = {
        "structural": None,
        "quality": None,
        "code_analysis": None,
        "transcript_analysis": None,
        "gameplay_bot": None,
        "outcome_score": None,
        "score": None,
    }

    # Structural, then quality (lint, typecheck, bundle size)
    for key, script_name in (("structural", "structural.sh"), ("quality", "quality.sh")):
        script = eval_dir / script_name
        if script.exists():
            results[key] = safe_parse_json(run_eval_script(script, workspace, language))

    # Code analysis (file count, LOC, unnecessary files, dependencies, quality metrics)
    code_analysis_py = eval_dir / "code-analysis.py"
    code_analysis_sh = eval_dir / "code-analysis.sh"
    if code_analysis_py.exists():
        results["code_analysis"] = _run_python_eval(
            code_analysis_py, [str(workspace), language], timeout=120)
    elif code_analysis_sh.exists():
        results["code_analysis"] = safe_parse_json(
            run_eval_script(code_analysis_sh, workspace, language))

    # Transcript analysis (agent efficiency, wasted turns, self-testing)
    transcript_py = eval_dir / "transcript-analysis.py"
    if transcript_py.exists():
        results["transcript_analysis"] = _run_python_eval(
            transcript_py, [str(run_dir)], timeout=30)

    # Gameplay bot (Playwright-based interactive testing, e.g. Tetris)
    # Use V2 bot (two-tier architecture) if available, fall back to V1
    gameplay_bot_v2 = eval_dir / "gameplay-bot-v2" / "index.ts"
    gameplay_bot_v1 = eval_dir / "gameplay-bot" / "index.ts"
    gameplay_bot_entry = gameplay_bot_v2 if gameplay_bot_v2.exists() else gameplay_bot_v1
    if gameplay_bot_entry.exists():
        results["gameplay_bot"] = _run_gameplay_bot(gameplay_bot_entry, workspace, run_dir)

    # SonarQube analysis (if the scan script is present)
    sonar_script = eval_dir / "sonarqube-scan.py"
    if sonar_script.exists():
        # Project key is derived from the run directory name.
        project_key = f"tetris-{run_dir.name}"[:250].replace("=", "-").replace("/", "-")
        results["sonarqube"] = _run_python_eval(
            sonar_script, [str(workspace), project_key], timeout=90,
            timeout_message="SonarQube scan timed out",
        )

    # Compute outcome score from scoring.yaml
    _compute_outcome_score(task_dir, results, run_dir.name)

    (run_dir / "eval_results.json").write_text(json.dumps(results, indent=2))
def archive_workspace(workspace: Path, run_dir: Path):
    """Archive workspace, copy it to the artifacts directory, then delete it.

    Three best-effort steps; a failure in one is logged and does not stop the
    others:

    1. Write ``run_dir/workspace.tar.gz`` (excluding ``node_modules``).
    2. Copy the workspace to ``PROJECT_DIR/artifacts/<run_id>`` for preview,
       ensuring an ``index.html`` exists at its root.
    3. Delete the workspace.

    Args:
        workspace: Workspace directory to archive and remove.
        run_dir: Run directory; its name is used as the run id.
    """
    def _skip_node_modules(info):
        # Match a real path component, not any name containing the substring.
        return None if "node_modules" in Path(info.name).parts else info

    archive_path = run_dir / "workspace.tar.gz"
    try:
        with tarfile.open(archive_path, "w:gz") as tar:
            tar.add(workspace, arcname=workspace.name, filter=_skip_node_modules)
    except Exception as e:
        log(f"  WARN: could not archive {workspace} to {archive_path}: {e}")

    # Copy to the artifacts directory for iframe preview
    run_id = run_dir.name
    artifacts_dir = PROJECT_DIR / "artifacts" / run_id
    try:
        if artifacts_dir.exists():
            shutil.rmtree(artifacts_dir)
        # Copy workspace contents (excluding node_modules and other bulk/noise)
        shutil.copytree(
            workspace, artifacts_dir,
            ignore=shutil.ignore_patterns("node_modules", ".git", "__pycache__", "report"),
        )
        # Ensure index.html exists at root; sorted() keeps the fallback choice
        # deterministic when several HTML files exist.
        if not (artifacts_dir / "index.html").exists():
            first_html = next(iter(sorted(artifacts_dir.rglob("*.html"))), None)
            if first_html is not None:
                shutil.copy2(first_html, artifacts_dir / "index.html")
    except Exception as e:
        log(f"  WARN: could not copy artifacts for {run_id}: {e}")

    # The workspace is removed even if archiving failed.
    shutil.rmtree(workspace, ignore_errors=True)


_print_lock = threading.Lock()
_index_lock = threading.Lock()
_counter_lock = threading.Lock()


def log(msg: str):
    """Print *msg* while holding the print lock so concurrent runs do not interleave lines."""
    with _print_lock:
        print(msg, flush=True)
def is_valid_run(run_dir: Path) -> bool:
    """Check whether a completed run directory contains valid results.

    Returns False (invalid) only for unambiguous failures:
      - transcript.jsonl missing, unreadable, or shorter than 3 lines
      - claude_output.json missing, unreadable, or not a JSON object
      - num_turns is 0 or null (no work done at all)
      - "Invalid API key" in the result field

    Does NOT reject: turns=1 (GLM models complete in 1 turn),
    cost=0 with turns>0.

    Args:
        run_dir: Directory of a single run.

    Returns:
        True if the run looks usable, False if it should be re-run.
    """
    # Transcript must exist and hold at least three lines.
    try:
        transcript = (run_dir / "transcript.jsonl").read_text(
            encoding="utf-8", errors="replace")
    except OSError:
        return False
    if len(transcript.strip().split("\n")) < 3:
        return False

    # claude_output.json must exist and parse as a JSON object.
    try:
        output = json.loads(
            (run_dir / "claude_output.json").read_text(encoding="utf-8", errors="replace"))
    except (json.JSONDecodeError, OSError):
        return False
    if not isinstance(output, dict):
        # e.g. a bare string/array line; previously this raised AttributeError.
        return False

    # num_turns: 0 or null = no work done
    num_turns = output.get("num_turns")
    if num_turns is None or num_turns == 0:
        return False

    # "Invalid API key" in result field
    result_text = output.get("result", "")
    if isinstance(result_text, str) and "Invalid API key" in result_text:
        return False

    return True
def _claude_reported_no_turns(output_path: Path) -> bool:
    """Return True only if claude_output.json parses and reports zero/absent turns."""
    try:
        output = json.loads(output_path.read_text())
    except (OSError, ValueError):
        # Missing or unparseable output is not treated as "0 turns" here.
        return False
    return isinstance(output, dict) and (output.get("num_turns") or 0) == 0


def run_single(
    cell: dict,
    run_num: int,
    results_dir: Path,
    project_dir: Path,
    claude_version: str,
    providers_config: dict = None,
) -> str:
    """Execute a single experiment run. Returns 'completed', 'skipped', or 'failed'.

    Steps: resume check, write ``meta.json``, create the workspace, invoke
    claude, discard runs that report zero turns, evaluate, append to
    ``index.jsonl`` and archive the workspace.

    Args:
        cell: Experiment cell; ``cell_id``, ``task``, ``model`` and
            ``prompt_style`` are required.
        run_num: 1-based repetition number within the cell.
        results_dir: Results root containing ``runs/`` and ``index.jsonl``.
        project_dir: Repository root containing ``tasks/``.
        claude_version: Version string recorded in the run metadata.
        providers_config: Optional mapping of provider name to its config.

    Returns:
        ``'skipped'`` if a valid earlier result exists, ``'completed'`` if
        evaluation results were written, otherwise ``'failed'``.
    """
    cell_id = cell["cell_id"]
    task = cell["task"]
    model = cell["model"]
    prompt_style = cell["prompt_style"]
    run_id = f"{cell_id}_run{run_num}"
    run_dir = results_dir / "runs" / run_id
    meta_path = run_dir / "meta.json"

    # Resume support: skip only if the run completed AND is valid
    if (run_dir / "eval_results.json").exists():
        if is_valid_run(run_dir):
            log(f"SKIP: {run_id}")
            return "skipped"
        log(f"INVALID: {run_id} - deleting and re-running")
        shutil.rmtree(run_dir)

    # Resolve provider config
    provider_name = cell.get("provider", "anthropic")
    provider_config = (providers_config or {}).get(provider_name) or {}

    log(f"START: {task} | {model} | {prompt_style} | run{run_num}")

    run_dir.mkdir(parents=True, exist_ok=True)

    # Save meta
    meta = {
        **cell,
        "run_id": run_id,
        "short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8],
        "short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8],
        "run_number": run_num,
        "actual_model": model,
        "claude_version": claude_version,
        "started_at": datetime.now(timezone.utc).isoformat(),
    }
    meta_path.write_text(json.dumps(meta, indent=2))

    # Create workspace
    try:
        workspace = create_workspace(project_dir, task, cell)
    except Exception as e:
        log(f"  ERROR creating workspace for {run_id}: {e}")
        return "failed"

    try:
        # Invoke claude
        start_time = time.monotonic()
        exit_code = invoke_claude(cell, workspace, run_dir, project_dir, provider_config)
        wall_time = int(time.monotonic() - start_time)
        status = "ok" if exit_code == 0 else f"exit {exit_code}"

        # Update meta with timing
        meta["wall_time_seconds"] = wall_time
        meta["exit_code"] = exit_code
        meta["completed_at"] = datetime.now(timezone.utc).isoformat()
        meta_path.write_text(json.dumps(meta, indent=2))

        # Guard: if claude produced nothing (0 turns), discard the run.
        # NOTE(review): a timed-out run (exit 124) presumably has no final
        # result event, so it would also be discarded here - confirm intended.
        if _claude_reported_no_turns(run_dir / "claude_output.json"):
            log(f"  DISCARD: {run_id} - 0 turns (no work done)")
            shutil.rmtree(run_dir, ignore_errors=True)
            shutil.rmtree(workspace, ignore_errors=True)
            return "failed"

        # Evaluate
        evaluate(project_dir / "tasks" / task, workspace, cell, run_dir)
    except Exception as e:
        # Honor the documented return contract and never leak the workspace
        # when claude invocation or evaluation blows up.
        log(f"  ERROR running {run_id}: {e}")
        shutil.rmtree(workspace, ignore_errors=True)
        return "failed"

    result = "completed" if (run_dir / "eval_results.json").exists() else "failed"

    # Append to index (thread-safe), only for runs that produced results
    if result == "completed":
        index_entry = {
            "run_id": run_id,
            "task": task,
            "model": model,
            "cell_id": cell_id,
            "short_id": meta["short_id"],
            "short_cell_id": meta["short_cell_id"],
            "completed_at": meta["completed_at"],
        }
        with _index_lock:
            with open(results_dir / "index.jsonl", "a") as f:
                f.write(json.dumps(index_entry) + "\n")

    # Archive and cleanup
    archive_workspace(workspace, run_dir)

    log(f"  DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
    return result
"--commit-every" and i + 1 < len(args): 818 commit_every = int(args[i + 1]) 819 i += 2 820 elif args[i] == "--runs-per-cell" and i + 1 < len(args): 821 override_runs_per_cell = int(args[i + 1]) 822 i += 2 823 elif args[i] == "--reeval": 824 do_reeval = True 825 i += 1 826 elif args[i] == "--analyze": 827 do_analyze = True 828 i += 1 829 elif args[i] == "--full-pipeline": 830 do_reeval = True 831 do_analyze = True 832 i += 1 833 else: 834 positional.append(args[i]) 835 i += 1 836 837 if len(positional) >= 1: 838 grid_file = positional[0] 839 if len(positional) >= 2: 840 profile = positional[1] 841 842 results_dir = PROJECT_DIR / "results" 843 results_dir.mkdir(exist_ok=True) 844 (results_dir / "runs").mkdir(exist_ok=True) 845 846 # Preflight 847 if shutil.which("claude") is None: 848 print("ERROR: claude CLI not found in PATH.") 849 sys.exit(1) 850 851 # Capture claude version for metadata 852 claude_version = "unknown" 853 try: 854 result = subprocess.run( 855 ["claude", "--version"], capture_output=True, text=True, timeout=5 856 ) 857 claude_version = result.stdout.strip() or result.stderr.strip() 858 except Exception: 859 pass 860 861 print("=" * 40) 862 print("Loop Benchmarking Harness") 863 print("=" * 40) 864 print(f"Grid file: {grid_file}") 865 print(f"Profile: {profile}") 866 print(f"Provider: {provider_filter}") 867 print(f"Parallel: {parallel}") 868 print(f"Results: {results_dir}") 869 print("=" * 40) 870 871 # --provider is required 872 if not provider_filter: 873 print("ERROR: --provider is required. Use --provider anthropic or --provider zai") 874 sys.exit(1) 875 876 grid = load_grid(grid_file) 877 providers_config = grid.get("providers", {}) 878 879 if provider_filter not in providers_config and provider_filter not in [v for spec in grid["axes"].values() for v in spec.get("values", [])]: 880 valid = grid["axes"].get("provider", {}).get("values", []) 881 print(f"ERROR: unknown provider '{provider_filter}'. 
Valid: {valid}") 882 sys.exit(1) 883 884 # Build baseline override from --model flag 885 # For non-anthropic providers, accept actual model names (e.g., glm-4.5-air) 886 # and reverse-map to the Claude arg (e.g., haiku) 887 baseline = None 888 if baseline_model: 889 axes = {name: spec["values"] for name, spec in grid["axes"].items()} 890 baseline = {name: values[0] for name, values in axes.items()} 891 baseline["model"] = baseline_model 892 if provider_filter: 893 baseline["provider"] = provider_filter 894 895 # Determine cell generation strategy 896 if profile == "main_effects": 897 cells = main_effects_plan(grid, baseline=baseline) 898 print(f"Design: main effects sweep" + (f" (baseline model: {baseline_model})" if baseline_model else "")) 899 elif profile == "plackett_burman": 900 cells = plackett_burman_plan(grid) 901 print(f"Design: Plackett-Burman screening") 902 elif profile.startswith("interaction_hunt:"): 903 top_axes = profile.split(":", 1)[1].split(",") 904 cells = interaction_hunt_plan(grid, top_axes) 905 print(f"Design: interaction hunt on {top_axes}") 906 else: 907 cells = compute_cells(grid, profile) 908 print(f"Profile: {profile}") 909 910 # Filter cells to requested provider 911 cells = [c for c in cells if c.get("provider", "anthropic") == provider_filter] 912 913 # Build the full list of (cell, run_num) jobs 914 jobs = [] 915 for cell in cells: 916 runs_per_cell = override_runs_per_cell or cell.get("runs_per_cell", 3) 917 for run_num in range(1, runs_per_cell + 1): 918 jobs.append((cell, run_num)) 919 920 if max_runs and len(jobs) > max_runs: 921 jobs = jobs[:max_runs] 922 print(f"Total jobs: {len(jobs)} (limited by -n {max_runs})") 923 else: 924 print(f"Total jobs: {len(jobs)}") 925 print() 926 927 # Periodic commit helper 928 _last_commit_count = [0] # mutable for closure 929 930 def periodic_commit(completed_so_far): 931 if not commit_every or completed_so_far - _last_commit_count[0] < commit_every: 932 return 933 _last_commit_count[0] = 
completed_so_far 934 log(f" --- Checkpoint: analyzing and pushing {completed_so_far} completed runs ---") 935 # Run analysis 936 analysis_dir = results_dir / "analysis" 937 analysis_dir.mkdir(exist_ok=True) 938 for metric in ["score", "cost", "turns", "wall_time", "gameplay", "sonarqube", "code_quality", "structural", "transcript", "build_quality"]: 939 try: 940 effects = analyze_main_effects(str(results_dir), metric) 941 (analysis_dir / f"main_effects_{metric}.json").write_text(json.dumps(effects, indent=2)) 942 except Exception: 943 pass 944 # Commit and push 945 try: 946 subprocess.run(["git", "add", "-A", "results/", "artifacts/"], cwd=str(PROJECT_DIR), capture_output=True, timeout=30) 947 total_runs = len(list((results_dir / "runs").iterdir())) 948 msg = f"Checkpoint: {completed_so_far} runs ({total_runs} total)" 949 subprocess.run(["git", "commit", "-m", msg], cwd=str(PROJECT_DIR), capture_output=True, timeout=30) 950 subprocess.run(["git", "push"], cwd=str(PROJECT_DIR), capture_output=True, timeout=60) 951 log(f" --- Pushed checkpoint ---") 952 except Exception as e: 953 log(f" --- Checkpoint push failed: {e} ---") 954 955 # Start auth keepalive in background (refreshes OAuth token every 5 min) 956 auth_keepalive = subprocess.Popen( 957 ["bash", str(SCRIPT_DIR / "lib" / "keep-auth-alive.sh"), "300"], 958 stdout=subprocess.DEVNULL, 959 stderr=subprocess.DEVNULL, 960 ) 961 print(f"Auth keepalive started (PID {auth_keepalive.pid})") 962 963 completed = 0 964 skipped = 0 965 failed = 0 966 967 if parallel <= 1: 968 # Sequential 969 for cell, run_num in jobs: 970 result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config) 971 if result == "completed": 972 completed += 1 973 elif result == "skipped": 974 skipped += 1 975 else: 976 failed += 1 977 periodic_commit(completed) 978 else: 979 # Parallel with rolling concurrency 980 with ThreadPoolExecutor(max_workers=parallel) as executor: 981 futures = { 982 executor.submit( 983 
run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config 984 ): (cell, run_num) 985 for cell, run_num in jobs 986 } 987 988 for future in as_completed(futures): 989 try: 990 result = future.result() 991 except Exception as e: 992 log(f" ERROR: {e}") 993 result = "failed" 994 995 with _counter_lock: 996 if result == "completed": 997 completed += 1 998 elif result == "skipped": 999 skipped += 1 1000 else: 1001 failed += 1 1002 1003 total_done = completed + skipped + failed 1004 log(f" Progress: {total_done}/{len(jobs)} ({completed} completed, {skipped} skipped, {failed} failed)") 1005 periodic_commit(completed) 1006 1007 # Stop auth keepalive 1008 auth_keepalive.terminate() 1009 auth_keepalive.wait() 1010 1011 print() 1012 print("=" * 40) 1013 print("All runs complete.") 1014 print(f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}") 1015 print("=" * 40) 1016 1017 # Re-evaluate all runs with latest eval scripts (only when explicitly requested) 1018 if do_reeval: 1019 print() 1020 print("Re-evaluating ALL runs with latest eval scripts...") 1021 reeval_result = subprocess.run( 1022 ["python3", str(SCRIPT_DIR / "reeval.py"), str(results_dir), "-j", str(max(parallel, 4))], 1023 cwd=str(PROJECT_DIR), 1024 ) 1025 if reeval_result.returncode == 0: 1026 print("Re-evaluation complete.") 1027 else: 1028 print("Re-evaluation had errors (continuing).") 1029 1030 # Run analysis and save results 1031 if do_analyze or completed > 0: 1032 print() 1033 print("Running analysis...") 1034 analysis_dir = results_dir / "analysis" 1035 analysis_dir.mkdir(exist_ok=True) 1036 1037 metrics = ["score", "cost", "turns", "wall_time", "gameplay", "code_quality"] 1038 for metric in metrics: 1039 try: 1040 effects = analyze_main_effects(str(results_dir), metric) 1041 (analysis_dir / f"main_effects_{metric}.json").write_text( 1042 json.dumps(effects, indent=2) 1043 ) 1044 print(f" Saved main_effects_{metric}.json") 1045 except Exception as e: 1046 print(f" 
Error analyzing {metric}: {e}") 1047 1048 # Auto-commit and push results 1049 if completed > 0: 1050 print() 1051 print("Committing results...") 1052 try: 1053 subprocess.run( 1054 ["git", "add", "-A", "results/", "artifacts/"], 1055 cwd=str(PROJECT_DIR), capture_output=True, timeout=30, 1056 ) 1057 total_runs = len(list((results_dir / "runs").iterdir())) 1058 msg = ( 1059 f"Add {completed} new runs ({total_runs} total)\n\n" 1060 f"Profile: {profile}\n" 1061 f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}" 1062 ) 1063 subprocess.run( 1064 ["git", "commit", "-m", msg], 1065 cwd=str(PROJECT_DIR), capture_output=True, 1066 ) 1067 result = subprocess.run( 1068 ["git", "push"], 1069 cwd=str(PROJECT_DIR), capture_output=True, text=True, 1070 ) 1071 if result.returncode == 0: 1072 print("Results committed and pushed.") 1073 else: 1074 print(f"Push failed: {result.stderr.strip()}") 1075 print("Results committed locally. Push manually with: git push") 1076 except Exception as e: 1077 print(f"Auto-commit failed: {e}") 1078 print("Commit manually with: git add -A results/ artifacts/ && git commit && git push") 1079 1080 1081 if __name__ == "__main__": 1082 main()