clean-and-reeval.py (6740B)
#!/usr/bin/env python3
"""Clean bad runs, re-evaluate everything, run analysis, commit and push.

Usage:
    python3 harness/clean-and-reeval.py [-j N] [--no-push]
"""

import json
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Optional

PROJECT_DIR = Path(__file__).resolve().parent.parent
RESULTS_DIR = PROJECT_DIR / "results"
ARTIFACTS_DIR = PROJECT_DIR / "artifacts"

# Makes the project-local `experiment_design` module (imported lazily in
# run_analysis) importable.
sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))


def _load_json_object(path):
    """Parse *path* as UTF-8 JSON and return it, requiring a top-level object.

    Raises:
        OSError: the file cannot be read.
        ValueError: the content is not valid UTF-8/JSON (``JSONDecodeError``
            and ``UnicodeDecodeError`` are both ``ValueError`` subclasses) or
            the top-level value is not a JSON object.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(
            f"{path.name}: expected a JSON object, got {type(data).__name__}"
        )
    return data


def _bad_run_reason(run_dir, artifact_dir):
    """Return why the run in *run_dir* should be deleted, or None if it is fine.

    A run is bad when its output/meta files are missing or unreadable, when it
    recorded no turns together with no (or zero) cost, when its result text
    mentions an invalid API key, or when its artifact directory is missing or
    contains no HTML file outside ``node_modules``.
    """
    output_path = run_dir / "claude_output.json"
    meta_path = run_dir / "meta.json"
    if not output_path.exists() or not meta_path.exists():
        return "missing output or meta"

    try:
        output = _load_json_object(output_path)
        # meta.json is parsed only to confirm it is readable and well-formed.
        _load_json_object(meta_path)

        cost = output.get("total_cost_usd")
        turns = output.get("num_turns")
        result_text = output.get("result", "")

        if turns in (None, 0):
            if cost is None:
                return "no cost and no turns"
            if cost == 0:
                return f"turns={turns}, cost={cost}"
        if "Invalid API key" in str(result_text):
            return "invalid API key"

        # No HTML = game wasn't built
        if not artifact_dir.exists():
            return "no artifact directory"
        # Compare path components *relative to the artifact directory* so that
        # a checkout living under some unrelated ".../node_modules/..." path
        # does not make every run look HTML-less.
        has_html = any(
            "node_modules" not in html.relative_to(artifact_dir).parts
            for html in artifact_dir.rglob("*.html")
        )
        if not has_html:
            return "no HTML files"
    except (ValueError, OSError) as exc:
        return str(exc)

    return None


def clean_bad_runs(
    results_dir: Optional[Path] = None,
    artifacts_dir: Optional[Path] = None,
) -> int:
    """Delete runs that are invalid, incomplete, or have no HTML output.

    Args:
        results_dir: Directory containing ``runs/``; defaults to RESULTS_DIR.
        artifacts_dir: Directory holding one artifact directory per run name;
            defaults to ARTIFACTS_DIR.

    Returns:
        The number of runs selected for deletion. Removal is best-effort
        (``ignore_errors=True``), so a run is counted even if part of its
        directory could not be removed.
    """
    results_root = RESULTS_DIR if results_dir is None else Path(results_dir)
    artifacts_root = ARTIFACTS_DIR if artifacts_dir is None else Path(artifacts_dir)

    runs_dir = results_root / "runs"
    if not runs_dir.is_dir():
        return 0

    deleted = 0
    for run_dir in sorted(runs_dir.iterdir()):
        if not run_dir.is_dir():
            continue

        artifact_dir = artifacts_root / run_dir.name
        reason = _bad_run_reason(run_dir, artifact_dir)
        if reason is None:
            continue

        print(f" DELETE: {run_dir.name[:60]}... ({reason})")
        shutil.rmtree(run_dir, ignore_errors=True)
        if artifact_dir.exists():
            shutil.rmtree(artifact_dir, ignore_errors=True)
        deleted += 1

    return deleted


def rebuild_index(results_dir: Optional[Path] = None) -> int:
    """Rebuild ``index.jsonl`` from the runs that remain on disk.

    Only run directories that contain both ``meta.json`` and
    ``eval_results.json`` are indexed. A run whose ``meta.json`` cannot be
    read is skipped with a message instead of aborting the rebuild.

    Args:
        results_dir: Directory containing ``runs/`` and receiving
            ``index.jsonl``; defaults to RESULTS_DIR. It is created if missing.

    Returns:
        The number of index entries written (0 when ``runs/`` does not exist).
    """
    results_root = RESULTS_DIR if results_dir is None else Path(results_dir)
    runs_dir = results_root / "runs"
    index_path = results_root / "index.jsonl"

    run_dirs = []
    if runs_dir.is_dir():
        run_dirs = sorted(p for p in runs_dir.iterdir() if p.is_dir())

    results_root.mkdir(parents=True, exist_ok=True)
    count = 0
    # Mode "w" truncates any previous index, so no explicit unlink is needed.
    with open(index_path, "w", encoding="utf-8") as f:
        for run_dir in run_dirs:
            meta_path = run_dir / "meta.json"
            eval_path = run_dir / "eval_results.json"
            if not (meta_path.exists() and eval_path.exists()):
                continue
            try:
                meta = _load_json_object(meta_path)
            except (ValueError, OSError) as exc:
                print(f" SKIP: {run_dir.name[:60]}... ({exc})")
                continue
            entry = {
                "run_id": meta.get("run_id", run_dir.name),
                "task": meta.get("task"),
                "model": meta.get("model"),
                "cell_id": meta.get("cell_id"),
                "completed_at": meta.get("completed_at"),
            }
            f.write(json.dumps(entry) + "\n")
            count += 1

    return count


def run_analysis():
    """Run main effects analysis for all metrics.

    Writes one ``main_effects_<metric>.json`` file per metric into
    ``RESULTS_DIR / "analysis"``, creating that directory if needed.
    """
    # Project-local module, found via the sys.path entry added at import time.
    from experiment_design import analyze_main_effects

    analysis_dir = RESULTS_DIR / "analysis"
    analysis_dir.mkdir(parents=True, exist_ok=True)

    metrics = [
        "score", "cost", "turns", "wall_time",
        "gameplay", "sonarqube", "code_quality",
        "structural", "transcript", "build_quality",
    ]
    for metric in metrics:
        effects = analyze_main_effects(str(RESULTS_DIR), metric)
        (analysis_dir / f"main_effects_{metric}.json").write_text(
            json.dumps(effects, indent=2), encoding="utf-8"
        )
    print(f" Analysis updated for {len(metrics)} metrics")


def parse_args(args):
    """Parse command-line arguments into ``(parallel, do_push)``.

    Recognises ``-j N`` (parallelism, default 4) and ``--no-push``. Unknown
    arguments, and a trailing ``-j`` without a value, are ignored.

    Raises:
        ValueError: the value following ``-j`` is not an integer.
    """
    parallel = 4
    do_push = True

    i = 0
    while i < len(args):
        if args[i] == "-j" and i + 1 < len(args):
            parallel = int(args[i + 1])
            i += 2
            continue
        if args[i] == "--no-push":
            do_push = False
        i += 1

    return parallel, do_push


def count_models(runs_dir):
    """Count run directories under *runs_dir* per ``model`` value in meta.json.

    A missing or null model is counted under ``"?"``; keys are always strings
    so the result can be sorted. Returns an empty dict when *runs_dir* does
    not exist; unreadable meta files are skipped.
    """
    counts = {}
    if not runs_dir.is_dir():
        return counts

    for run_dir in runs_dir.iterdir():
        meta_path = run_dir / "meta.json"
        if not (run_dir.is_dir() and meta_path.exists()):
            continue
        try:
            meta = _load_json_object(meta_path)
        except (ValueError, OSError):
            continue
        model = str(meta.get("model") or "?")
        counts[model] = counts.get(model, 0) + 1

    return counts


def build_commit_message(models):
    """Return the commit message summarising run counts per model."""
    total = sum(models.values())
    breakdown = ", ".join(f"{n} {m}" for m, n in sorted(models.items()))
    return f"Re-eval {total} runs ({breakdown})"


def _git(*git_args):
    """Run ``git`` with *git_args* in the project directory, capturing output."""
    return subprocess.run(
        ["git", *git_args],
        cwd=str(PROJECT_DIR), capture_output=True, text=True,
    )


def main():
    """Clean, re-index, re-evaluate, analyse, then commit (and optionally push)."""
    parallel, do_push = parse_args(sys.argv[1:])

    print("=" * 50)
    print("Clean and Re-evaluate")
    print("=" * 50)

    # Step 1: Clean
    print("\n1. Cleaning bad/incomplete runs...")
    deleted = clean_bad_runs()
    print(f" Deleted {deleted} runs")

    # Step 2: Rebuild index
    print("\n2. Rebuilding index...")
    count = rebuild_index()
    print(f" {count} valid runs indexed")

    # Step 3: Count by model
    models = count_models(RESULTS_DIR / "runs")
    for model, n in sorted(models.items()):
        print(f" {model}: {n}")

    # Step 4: Re-evaluate
    print(f"\n3. Re-evaluating all runs (parallel={parallel})...")
    reeval_result = subprocess.run(
        # Use the interpreter running this script so the child sees the same
        # environment; fall back to "python3" if it cannot be determined.
        [sys.executable or "python3",
         str(PROJECT_DIR / "harness" / "reeval.py"),
         str(RESULTS_DIR), "-j", str(parallel)],
        cwd=str(PROJECT_DIR),
    )
    if reeval_result.returncode != 0:
        print(" WARNING: Re-evaluation had errors")

    # Step 5: Analysis
    print("\n4. Running analysis...")
    run_analysis()

    # Step 6: Commit and push
    print("\n5. Committing results...")
    add_result = _git("add", "-A", "results/", "artifacts/")
    if add_result.returncode != 0:
        print(f" WARNING: git add failed: {add_result.stderr.strip()}")

    commit_result = _git("commit", "-m", build_commit_message(models))
    committed = commit_result.returncode == 0
    if not committed:
        # git reports "nothing to commit" on stdout, real errors on stderr.
        detail = (commit_result.stderr or commit_result.stdout).strip()
        print(f" Nothing committed: {detail}")

    if do_push:
        # Still attempted after a failed commit: earlier local commits may
        # be waiting to be pushed.
        result = _git("push")
        if result.returncode == 0:
            print(" Pushed.")
        else:
            print(f" Push failed: {result.stderr.strip()}")
    elif committed:
        print(" Committed locally (--no-push)")

    print("\nDone.")


if __name__ == "__main__":
    main()