reeval.py (4379B)
#!/usr/bin/env python3
"""Re-evaluate existing runs with current eval scripts.

Extracts workspace archives, runs all eval scripts against them,
and overwrites eval_results.json. Does NOT re-run claude.

Usage:
    python3 reeval.py [results_dir] [-j N]
"""

import argparse
import json
import shutil
import subprocess
import sys
import tarfile
import tempfile
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent

# The evaluation helpers live in the sibling "lib" directory.
sys.path.insert(0, str(SCRIPT_DIR / "lib"))
from run import evaluate, run_eval_script, safe_parse_json


def _has_entries(directory: Path) -> bool:
    """Return True when *directory* is an existing, non-empty directory."""
    return directory.is_dir() and any(directory.iterdir())


def _extract_workspace(archive_path: Path, dest: Path) -> Path:
    """Unpack a gzip'd workspace archive into *dest* and return the workspace root.

    If the extracted tree contains at least one top-level directory, the
    (alphabetically) first one is returned; otherwise *dest* itself is.
    Sorting makes the choice deterministic, since ``iterdir`` order is not.
    """
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(dest, filter="data")
    subdirs = sorted(entry for entry in dest.iterdir() if entry.is_dir())
    return subdirs[0] if subdirs else dest


def reeval_single(run_dir: Path, project_dir: Path) -> str:
    """Re-evaluate a single run.

    Args:
        run_dir: Directory of one run; expected to hold ``meta.json`` and
            optionally ``workspace.tar.gz``.
        project_dir: Project root containing ``tasks/`` and ``artifacts/``.

    Returns:
        One of ``"completed"``, ``"error"``, ``"skip_no_meta"``,
        ``"skip_no_task"`` or ``"skip_no_workspace"``.

    Exceptions raised by ``evaluate`` propagate to the caller; any temporary
    extraction directory is removed first.
    """
    meta_path = run_dir / "meta.json"
    archive_path = run_dir / "workspace.tar.gz"

    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return "skip_no_meta"
    except (OSError, ValueError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        print(f" ERROR {run_dir.name}: failed to read meta.json: {e}")
        return "error"
    if not isinstance(meta, dict):
        print(f" ERROR {run_dir.name}: meta.json is not a JSON object")
        return "error"

    task = meta.get("task", "")
    run_id = meta.get("run_id", run_dir.name)
    task_dir = project_dir / "tasks" / task

    # An empty task name would resolve to the "tasks" directory itself,
    # which exists; treat it as missing instead of evaluating against it.
    if not task or not task_dir.exists():
        print(f" SKIP {run_id}: task '{task}' not found")
        return "skip_no_task"

    artifact_dir = project_dir / "artifacts" / run_dir.name
    # Root of the temporary extraction, if one is created. Tracked separately
    # from `workspace` because the workspace may be a subdirectory of it.
    temp_root = None

    try:
        if _has_entries(artifact_dir):
            # Use existing artifact directory as workspace
            workspace = artifact_dir
        elif archive_path.exists():
            # Extract from archive to temp dir
            temp_root = Path(tempfile.mkdtemp(prefix="reeval-"))
            try:
                workspace = _extract_workspace(archive_path, temp_root)
            except Exception as e:
                # Best-effort boundary: a bad archive fails this run only.
                print(f" ERROR {run_id}: failed to extract archive: {e}")
                return "error"
        else:
            print(f" SKIP {run_id}: no workspace or archive")
            return "skip_no_workspace"

        print(f" EVAL {run_id} ({task}, {meta.get('model', '?')})")

        # The cell passed to evaluate() is a copy of the run's metadata.
        evaluate(task_dir, workspace, dict(meta), run_dir)
        return "completed"
    finally:
        # Clean up the whole temp extraction (never artifact dirs), even when
        # extraction or evaluation failed.
        if temp_root is not None:
            shutil.rmtree(temp_root, ignore_errors=True)


def _reeval_guarded(run_dir: Path, project_dir: Path) -> str:
    """Run ``reeval_single`` and convert any exception into an ``"error"`` result.

    Used by both the sequential and the parallel path so that one failing
    run never aborts the whole batch.
    """
    try:
        return reeval_single(run_dir, project_dir)
    except Exception as e:
        print(f" ERROR {run_dir.name}: {e}")
        return "error"


def _bucket(result: str) -> str:
    """Map a ``reeval_single`` status onto its summary counter name."""
    if result == "completed":
        return "completed"
    if result.startswith("skip"):
        return "skipped"
    return "errors"


def _parse_args(argv: list[str] | None) -> argparse.Namespace:
    """Parse ``[results_dir] [-j N]`` from *argv* (``sys.argv[1:]`` if None)."""
    parser = argparse.ArgumentParser(
        description="Re-evaluate existing runs with current eval scripts.",
    )
    parser.add_argument(
        "results_dir",
        nargs="?",
        type=Path,
        default=PROJECT_DIR / "results",
        help="results directory containing a 'runs' subdirectory",
    )
    parser.add_argument(
        "-j",
        dest="parallel",
        type=int,
        default=1,
        metavar="N",
        help="number of runs to evaluate in parallel (default: 1)",
    )
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> None:
    """Re-evaluate every run directory under ``<results_dir>/runs``.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = _parse_args(argv)
    parallel = args.parallel

    runs_dir = args.results_dir / "runs"
    if not runs_dir.exists():
        print("No runs directory found.")
        return

    run_dirs = sorted(d for d in runs_dir.iterdir() if d.is_dir())
    print(f"Re-evaluating {len(run_dirs)} runs (parallel={parallel})")
    print()

    if parallel <= 1:
        counts = Counter(
            _bucket(_reeval_guarded(run_dir, PROJECT_DIR)) for run_dir in run_dirs
        )
    else:
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            futures = [
                executor.submit(_reeval_guarded, run_dir, PROJECT_DIR)
                for run_dir in run_dirs
            ]
            counts = Counter(
                _bucket(future.result()) for future in as_completed(futures)
            )

    print()
    print(
        f"Done. Completed: {counts['completed']} | "
        f"Skipped: {counts['skipped']} | Errors: {counts['errors']}"
    )


if __name__ == "__main__":
    main()