reeval.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

reeval.py (4379B)
      1 #!/usr/bin/env python3
      2 """Re-evaluate existing runs with current eval scripts.
      3 
      4 Extracts workspace archives, runs all eval scripts against them,
      5 and overwrites eval_results.json. Does NOT re-run claude.
      6 
      7 Usage:
      8     python3 reeval.py [results_dir] [-j N]
      9 """
     10 
     11 import json
     12 import shutil
     13 import subprocess
     14 import sys
     15 import tarfile
     16 import tempfile
     17 from concurrent.futures import ThreadPoolExecutor, as_completed
     18 from pathlib import Path
     19 
     20 SCRIPT_DIR = Path(__file__).resolve().parent
     21 PROJECT_DIR = SCRIPT_DIR.parent
     22 
     23 sys.path.insert(0, str(SCRIPT_DIR / "lib"))
     24 from run import evaluate, run_eval_script, safe_parse_json
     25 
     26 
     27 def reeval_single(run_dir: Path, project_dir: Path) -> str:
     28     """Re-evaluate a single run."""
     29     meta_path = run_dir / "meta.json"
     30     archive_path = run_dir / "workspace.tar.gz"
     31 
     32     if not meta_path.exists():
     33         return "skip_no_meta"
     34 
     35     meta = json.loads(meta_path.read_text())
     36     task = meta.get("task", "")
     37     language = meta.get("language", "typescript")
     38     run_id = meta.get("run_id", run_dir.name)
     39     task_dir = project_dir / "tasks" / task
     40 
     41     if not task_dir.exists():
     42         print(f"  SKIP {run_id}: task '{task}' not found")
     43         return "skip_no_task"
     44 
     45     # Extract workspace from archive if available
     46     workspace = None
     47     artifact_dir = project_dir / "artifacts" / run_dir.name
     48 
     49     if artifact_dir.exists() and any(artifact_dir.iterdir()):
     50         # Use existing artifact directory as workspace
     51         workspace = artifact_dir
     52     elif archive_path.exists():
     53         # Extract from archive to temp dir
     54         workspace = Path(tempfile.mkdtemp(prefix="reeval-"))
     55         try:
     56             with tarfile.open(archive_path, "r:gz") as tar:
     57                 tar.extractall(workspace, filter="data")
     58             # The archive has a subdirectory, find it
     59             subdirs = [d for d in workspace.iterdir() if d.is_dir()]
     60             if subdirs:
     61                 workspace = subdirs[0]
     62         except Exception as e:
     63             print(f"  ERROR {run_id}: failed to extract archive: {e}")
     64             shutil.rmtree(workspace, ignore_errors=True)
     65             return "error"
     66 
     67     if workspace is None:
     68         print(f"  SKIP {run_id}: no workspace or archive")
     69         return "skip_no_workspace"
     70 
     71     print(f"  EVAL {run_id} ({task}, {meta.get('model', '?')})")
     72 
     73     # Build cell dict from meta
     74     cell = dict(meta)
     75 
     76     # Run evaluation
     77     evaluate(task_dir, workspace, cell, run_dir)
     78 
     79     # Clean up temp workspace (but not artifact dirs)
     80     if not str(workspace).startswith(str(project_dir / "artifacts")):
     81         shutil.rmtree(workspace, ignore_errors=True)
     82 
     83     return "completed"
     84 
     85 
     86 def main():
     87     args = sys.argv[1:]
     88     parallel = 1
     89     results_dir = PROJECT_DIR / "results"
     90 
     91     i = 0
     92     while i < len(args):
     93         if args[i] == "-j" and i + 1 < len(args):
     94             parallel = int(args[i + 1])
     95             i += 2
     96         else:
     97             results_dir = Path(args[i])
     98             i += 1
     99 
    100     runs_dir = results_dir / "runs"
    101     if not runs_dir.exists():
    102         print("No runs directory found.")
    103         return
    104 
    105     run_dirs = sorted([d for d in runs_dir.iterdir() if d.is_dir()])
    106     print(f"Re-evaluating {len(run_dirs)} runs (parallel={parallel})")
    107     print()
    108 
    109     completed = 0
    110     skipped = 0
    111     errors = 0
    112 
    113     if parallel <= 1:
    114         for run_dir in run_dirs:
    115             result = reeval_single(run_dir, PROJECT_DIR)
    116             if result == "completed":
    117                 completed += 1
    118             elif result.startswith("skip"):
    119                 skipped += 1
    120             else:
    121                 errors += 1
    122     else:
    123         with ThreadPoolExecutor(max_workers=parallel) as executor:
    124             futures = {
    125                 executor.submit(reeval_single, rd, PROJECT_DIR): rd
    126                 for rd in run_dirs
    127             }
    128             for future in as_completed(futures):
    129                 try:
    130                     result = future.result()
    131                 except Exception as e:
    132                     print(f"  ERROR: {e}")
    133                     result = "error"
    134                 if result == "completed":
    135                     completed += 1
    136                 elif result.startswith("skip"):
    137                     skipped += 1
    138                 else:
    139                     errors += 1
    140 
    141     print()
    142     print(f"Done. Completed: {completed} | Skipped: {skipped} | Errors: {errors}")
    143 
    144 
    145 if __name__ == "__main__":
    146     main()
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README