clean-and-reeval.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

clean-and-reeval.py (6740B)
      1 #!/usr/bin/env python3
      2 """Clean bad runs, re-evaluate everything, run analysis, commit and push.
      3 
      4 Usage:
      5     python3 harness/clean-and-reeval.py [-j N] [--no-push]
      6 """
      7 
      8 import json
      9 import shutil
     10 import subprocess
     11 import sys
     12 from pathlib import Path
     13 
     14 PROJECT_DIR = Path(__file__).resolve().parent.parent
     15 RESULTS_DIR = PROJECT_DIR / "results"
     16 ARTIFACTS_DIR = PROJECT_DIR / "artifacts"
     17 
     18 sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))
     19 
     20 
     21 def clean_bad_runs():
     22     """Delete runs that are invalid, incomplete, or have no HTML output."""
     23     runs_dir = RESULTS_DIR / "runs"
     24     if not runs_dir.exists():
     25         return 0
     26 
     27     deleted = 0
     28     for run_dir in sorted(runs_dir.iterdir()):
     29         if not run_dir.is_dir():
     30             continue
     31 
     32         output_path = run_dir / "claude_output.json"
     33         meta_path = run_dir / "meta.json"
     34         artifact_dir = ARTIFACTS_DIR / run_dir.name
     35 
     36         bad = False
     37         reason = ""
     38 
     39         if not output_path.exists() or not meta_path.exists():
     40             bad, reason = True, "missing output or meta"
     41         else:
     42             try:
     43                 output = json.loads(output_path.read_text())
     44                 meta = json.loads(meta_path.read_text())
     45 
     46                 cost = output.get("total_cost_usd")
     47                 turns = output.get("num_turns")
     48                 exit_code = meta.get("exit_code")
     49                 result_text = output.get("result", "")
     50 
     51                 if cost is None and turns in (None, 0):
     52                     bad, reason = True, "no cost and no turns"
     53                 elif turns in (None, 0) and (cost is None or cost == 0):
     54                     bad, reason = True, f"turns={turns}, cost={cost}"
     55                 elif "Invalid API key" in str(result_text):
     56                     bad, reason = True, "invalid API key"
     57 
     58                 # No HTML = game wasn't built
     59                 if not bad and artifact_dir.exists():
     60                     htmls = [
     61                         f for f in artifact_dir.rglob("*.html")
     62                         if "node_modules" not in str(f)
     63                     ]
     64                     if not htmls:
     65                         bad, reason = True, "no HTML files"
     66                 elif not bad and not artifact_dir.exists():
     67                     bad, reason = True, "no artifact directory"
     68 
     69             except (json.JSONDecodeError, OSError) as e:
     70                 bad, reason = True, str(e)
     71 
     72         if bad:
     73             print(f"  DELETE: {run_dir.name[:60]}... ({reason})")
     74             shutil.rmtree(run_dir, ignore_errors=True)
     75             if artifact_dir.exists():
     76                 shutil.rmtree(artifact_dir, ignore_errors=True)
     77             deleted += 1
     78 
     79     return deleted
     80 
     81 
     82 def rebuild_index():
     83     """Rebuild results/index.jsonl from remaining runs."""
     84     index_path = RESULTS_DIR / "index.jsonl"
     85     if index_path.exists():
     86         index_path.unlink()
     87 
     88     runs_dir = RESULTS_DIR / "runs"
     89     count = 0
     90     with open(index_path, "w") as f:
     91         for run_dir in sorted(runs_dir.iterdir()):
     92             meta_path = run_dir / "meta.json"
     93             eval_path = run_dir / "eval_results.json"
     94             if meta_path.exists() and eval_path.exists():
     95                 meta = json.loads(meta_path.read_text())
     96                 entry = {
     97                     "run_id": meta.get("run_id", run_dir.name),
     98                     "task": meta.get("task"),
     99                     "model": meta.get("model"),
    100                     "cell_id": meta.get("cell_id"),
    101                     "completed_at": meta.get("completed_at"),
    102                 }
    103                 f.write(json.dumps(entry) + "\n")
    104                 count += 1
    105 
    106     return count
    107 
    108 
    109 def run_analysis():
    110     """Run main effects analysis for all metrics."""
    111     from experiment_design import analyze_main_effects
    112 
    113     analysis_dir = RESULTS_DIR / "analysis"
    114     analysis_dir.mkdir(exist_ok=True)
    115 
    116     metrics = [
    117         "score", "cost", "turns", "wall_time",
    118         "gameplay", "sonarqube", "code_quality",
    119         "structural", "transcript", "build_quality",
    120     ]
    121     for metric in metrics:
    122         effects = analyze_main_effects(str(RESULTS_DIR), metric)
    123         (analysis_dir / f"main_effects_{metric}.json").write_text(
    124             json.dumps(effects, indent=2)
    125         )
    126     print(f"  Analysis updated for {len(metrics)} metrics")
    127 
    128 
    129 def main():
    130     args = sys.argv[1:]
    131     parallel = 4
    132     do_push = True
    133 
    134     i = 0
    135     while i < len(args):
    136         if args[i] == "-j" and i + 1 < len(args):
    137             parallel = int(args[i + 1])
    138             i += 2
    139         elif args[i] == "--no-push":
    140             do_push = False
    141             i += 1
    142         else:
    143             i += 1
    144 
    145     print("=" * 50)
    146     print("Clean and Re-evaluate")
    147     print("=" * 50)
    148 
    149     # Step 1: Clean
    150     print("\n1. Cleaning bad/incomplete runs...")
    151     deleted = clean_bad_runs()
    152     print(f"   Deleted {deleted} runs")
    153 
    154     # Step 2: Rebuild index
    155     print("\n2. Rebuilding index...")
    156     count = rebuild_index()
    157     print(f"   {count} valid runs indexed")
    158 
    159     # Step 3: Count by model
    160     runs_dir = RESULTS_DIR / "runs"
    161     models: dict[str, int] = {}
    162     for run_dir in runs_dir.iterdir():
    163         if run_dir.is_dir():
    164             meta_path = run_dir / "meta.json"
    165             if meta_path.exists():
    166                 model = json.loads(meta_path.read_text()).get("model", "?")
    167                 models[model] = models.get(model, 0) + 1
    168     for model, n in sorted(models.items()):
    169         print(f"   {model}: {n}")
    170 
    171     # Step 4: Re-evaluate
    172     print(f"\n3. Re-evaluating all runs (parallel={parallel})...")
    173     reeval_result = subprocess.run(
    174         ["python3", str(PROJECT_DIR / "harness" / "reeval.py"),
    175          str(RESULTS_DIR), "-j", str(parallel)],
    176         cwd=str(PROJECT_DIR),
    177     )
    178     if reeval_result.returncode != 0:
    179         print("   WARNING: Re-evaluation had errors")
    180 
    181     # Step 5: Analysis
    182     print("\n4. Running analysis...")
    183     run_analysis()
    184 
    185     # Step 6: Commit and push
    186     print("\n5. Committing results...")
    187     subprocess.run(
    188         ["git", "add", "-A", "results/", "artifacts/"],
    189         cwd=str(PROJECT_DIR), capture_output=True,
    190     )
    191 
    192     total = sum(models.values())
    193     msg = f"Re-eval {total} runs ({', '.join(f'{n} {m}' for m, n in sorted(models.items()))})"
    194     subprocess.run(
    195         ["git", "commit", "-m", msg],
    196         cwd=str(PROJECT_DIR), capture_output=True,
    197     )
    198 
    199     if do_push:
    200         result = subprocess.run(
    201             ["git", "push"],
    202             cwd=str(PROJECT_DIR), capture_output=True, text=True,
    203         )
    204         if result.returncode == 0:
    205             print("   Pushed.")
    206         else:
    207             print(f"   Push failed: {result.stderr.strip()}")
    208     else:
    209         print("   Committed locally (--no-push)")
    210 
    211     print("\nDone.")
    212 
    213 
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README