reeval-calibration.py (2402B)
#!/usr/bin/env python3
"""Re-run eval scripts against the 17 calibration runs only.

Keeps agent-generated artifacts in place and rewrites eval_results.json
and gameplay-bot-report.json.

The set of runs is whatever the calibration directory lists: one run id is
read from the "run_id" key of every ``*.json`` file found there.

Usage:
    python3 harness/reeval-calibration.py [-j N]
"""

import argparse
import json
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent

# ``reeval`` lives next to this script, so make the script directory
# importable before pulling it in.
sys.path.insert(0, str(SCRIPT_DIR))
from reeval import reeval_single  # noqa: E402


def _parse_parallel(argv):
    """Return the worker count requested with ``-j N`` (default 1).

    Unrecognised arguments are ignored, as before.  A missing or
    non-integer value for ``-j`` now ends the script with a usage error
    instead of being silently dropped or raising a bare ValueError.
    """
    parser = argparse.ArgumentParser(
        description="Re-run eval scripts against the calibration runs only."
    )
    parser.add_argument(
        "-j",
        dest="parallel",
        type=int,
        default=1,
        metavar="N",
        help="number of runs to re-evaluate concurrently (default: 1)",
    )
    known, _unknown = parser.parse_known_args(argv)
    return known.parallel


def _safe_reeval(run_dir):
    """Re-evaluate one run dir, turning any exception into an "error" result.

    Used by both the sequential and the threaded path so that a single
    failing run is reported and counted instead of aborting the batch.
    """
    try:
        return reeval_single(run_dir, PROJECT_DIR)
    except Exception as e:  # top-level boundary: report and keep going
        print(f" ERROR {run_dir.name}: {e}")
        return "error"


def _bucket(result):
    """Map a ``reeval_single`` result onto a tally key.

    "completed" counts as completed, any string starting with "skip"
    counts as skipped, and everything else (including non-string values
    such as None) counts as an error.
    """
    if result == "completed":
        return "completed"
    if isinstance(result, str) and result.startswith("skip"):
        return "skipped"
    return "errors"


def main():
    """Re-evaluate every calibration run and print a summary line.

    Exits with status 1 before doing any work if a run directory named by
    the calibration files does not exist under ``results/runs``.
    """
    parallel = _parse_parallel(sys.argv[1:])

    calib_dir = PROJECT_DIR / "tasks" / "tetris" / "eval" / "gameplay-bot" / "calibration"
    run_ids = [
        json.loads(f.read_text(encoding="utf-8"))["run_id"]
        for f in sorted(calib_dir.glob("*.json"))
    ]
    run_dirs = [PROJECT_DIR / "results" / "runs" / rid for rid in run_ids]

    missing = [rd for rd in run_dirs if not rd.exists()]
    if missing:
        print(f"ERROR: {len(missing)} calibration run dirs missing:")
        for rd in missing:
            print(f" {rd.name}")
        sys.exit(1)

    print(f"Re-evaluating {len(run_dirs)} calibration runs (parallel={parallel})")
    print()

    tally = {"completed": 0, "skipped": 0, "errors": 0}

    if parallel <= 1:
        for rd in run_dirs:
            tally[_bucket(_safe_reeval(rd))] += 1
    else:
        with ThreadPoolExecutor(max_workers=parallel) as ex:
            futures = [ex.submit(_safe_reeval, rd) for rd in run_dirs]
            for f in as_completed(futures):
                # _safe_reeval never raises, so result() is safe here.
                tally[_bucket(f.result())] += 1

    print()
    print(
        f"Done. Completed: {tally['completed']} | "
        f"Skipped: {tally['skipped']} | Errors: {tally['errors']}"
    )


if __name__ == "__main__":
    main()