reeval-calibration.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

reeval-calibration.py (2402B)

      1 #!/usr/bin/env python3
      2 """Re-run eval scripts against the 17 calibration runs only.
      3 
      4 Keeps agent-generated artifacts in place and rewrites eval_results.json
      5 and gameplay-bot-report.json.
      6 
      7 Usage:
      8     python3 harness/reeval-calibration.py [-j N]
      9 """
     10 
     11 import json
     12 import sys
     13 from concurrent.futures import ThreadPoolExecutor, as_completed
     14 from pathlib import Path
     15 
# Absolute directory containing this script, and its parent directory, which
# the rest of the script uses as the project root.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent

# Put the script's own directory first on sys.path *before* importing
# `reeval`; the order of these two lines matters.
# NOTE(review): presumably reeval.py lives alongside this script -- confirm.
sys.path.insert(0, str(SCRIPT_DIR))
from reeval import reeval_single
     21 
     22 
     23 def main():
     24     parallel = 1
     25     args = sys.argv[1:]
     26     i = 0
     27     while i < len(args):
     28         if args[i] == "-j" and i + 1 < len(args):
     29             parallel = int(args[i + 1])
     30             i += 2
     31         else:
     32             i += 1
     33 
     34     calib_dir = PROJECT_DIR / "tasks" / "tetris" / "eval" / "gameplay-bot" / "calibration"
     35     run_ids = [json.loads(f.read_text())["run_id"] for f in sorted(calib_dir.glob("*.json"))]
     36     run_dirs = [PROJECT_DIR / "results" / "runs" / rid for rid in run_ids]
     37 
     38     missing = [rd for rd in run_dirs if not rd.exists()]
     39     if missing:
     40         print(f"ERROR: {len(missing)} calibration run dirs missing:")
     41         for rd in missing:
     42             print(f"  {rd.name}")
     43         sys.exit(1)
     44 
     45     print(f"Re-evaluating {len(run_dirs)} calibration runs (parallel={parallel})")
     46     print()
     47 
     48     completed = skipped = errors = 0
     49 
     50     if parallel <= 1:
     51         for rd in run_dirs:
     52             result = reeval_single(rd, PROJECT_DIR)
     53             if result == "completed":
     54                 completed += 1
     55             elif result.startswith("skip"):
     56                 skipped += 1
     57             else:
     58                 errors += 1
     59     else:
     60         with ThreadPoolExecutor(max_workers=parallel) as ex:
     61             futures = {ex.submit(reeval_single, rd, PROJECT_DIR): rd for rd in run_dirs}
     62             for f in as_completed(futures):
     63                 try:
     64                     result = f.result()
     65                 except Exception as e:
     66                     print(f"  ERROR {futures[f].name}: {e}")
     67                     result = "error"
     68                 if result == "completed":
     69                     completed += 1
     70                 elif result.startswith("skip"):
     71                     skipped += 1
     72                 else:
     73                     errors += 1
     74 
     75     print()
     76     print(f"Done. Completed: {completed} | Skipped: {skipped} | Errors: {errors}")
     77 
     78 
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README