loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 64c6b20e80b1e88b3b7649aa8941428f92699ab1
parent a17094f75481fdd28a4029943e449f81dcce0273
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 10:30:17 +0200

Add clean-and-reeval command

Single command to clean bad runs, re-evaluate, analyze, commit, push:
  python3 harness/clean-and-reeval.py -j 4

Flags: -j N (parallel), --no-push (commit locally only)

Steps: delete bad runs -> rebuild index -> reeval all -> analyze -> commit -> push

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A harness/clean-and-reeval.py | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 216 insertions(+), 0 deletions(-)

#!/usr/bin/env python3
"""Clean bad runs, re-evaluate everything, run analysis, commit and push.

Usage:
    python3 harness/clean-and-reeval.py [-j N] [--no-push]
"""

import argparse
import json
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Dict, Optional

PROJECT_DIR = Path(__file__).resolve().parent.parent
RESULTS_DIR = PROJECT_DIR / "results"
ARTIFACTS_DIR = PROJECT_DIR / "artifacts"

# Make harness/lib importable; run_analysis() imports `experiment_design`
# lazily (presumably from that directory -- confirm against the repo layout).
sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))


def _read_json_object(path: Path) -> dict:
    """Parse *path* as JSON and return it, insisting on a top-level object.

    Raises:
        OSError: the file cannot be read.
        ValueError: the content is not valid UTF-8/JSON, or is valid JSON
            but not an object (callers rely on ``.get``).
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(
            f"{path.name}: expected a JSON object, got {type(data).__name__}"
        )
    return data


def _has_html(artifact_dir: Path) -> bool:
    """Return True if *artifact_dir* holds an .html file outside node_modules."""
    # Compare path components below artifact_dir so that only a real
    # node_modules directory is excluded, not any path that merely contains
    # the substring.
    return any(
        "node_modules" not in html.relative_to(artifact_dir).parts
        for html in artifact_dir.rglob("*.html")
    )


def _bad_run_reason(run_dir: Path, artifact_dir: Path) -> Optional[str]:
    """Return why the run in *run_dir* should be deleted, or None to keep it."""
    output_path = run_dir / "claude_output.json"
    meta_path = run_dir / "meta.json"
    if not output_path.exists() or not meta_path.exists():
        return "missing output or meta"

    try:
        output = _read_json_object(output_path)
        meta = _read_json_object(meta_path)

        cost = output.get("total_cost_usd")
        turns = output.get("num_turns")
        if cost in (None, 0):
            return f"cost={cost}"
        if turns in (None, 1):
            return f"turns={turns}"
        if meta.get("exit_code") == 124:
            return "timeout"
        if "Invalid API key" in str(output.get("result", "")):
            return "invalid API key"

        # No HTML = game wasn't built
        if not artifact_dir.exists():
            return "no artifact directory"
        if not _has_html(artifact_dir):
            return "no HTML files"
    except (ValueError, OSError) as exc:
        # ValueError covers JSONDecodeError, UnicodeDecodeError and the
        # "not a JSON object" case raised by _read_json_object.
        return str(exc)

    return None


def clean_bad_runs():
    """Delete runs that are invalid, incomplete, or have no HTML output.

    A run is removed together with its artifact directory when its
    ``claude_output.json`` / ``meta.json`` are missing or unreadable, when it
    reports no cost or only a single turn, when it exited with code 124
    (reported as "timeout"), when its result mentions "Invalid API key", or
    when no HTML file was produced.

    Returns:
        The number of runs selected for deletion (0 if there is no runs
        directory). Removal itself is best-effort: filesystem errors during
        deletion are ignored.
    """
    runs_dir = RESULTS_DIR / "runs"
    if not runs_dir.is_dir():
        return 0

    deleted = 0
    for run_dir in sorted(runs_dir.iterdir()):
        if not run_dir.is_dir():
            continue

        artifact_dir = ARTIFACTS_DIR / run_dir.name
        reason = _bad_run_reason(run_dir, artifact_dir)
        if reason is None:
            continue

        print(f"   DELETE: {run_dir.name[:60]}... ({reason})")
        shutil.rmtree(run_dir, ignore_errors=True)
        if artifact_dir.exists():
            shutil.rmtree(artifact_dir, ignore_errors=True)
        deleted += 1

    return deleted


def rebuild_index():
    """Rebuild results/index.jsonl from remaining runs.

    Only runs that have both ``meta.json`` and ``eval_results.json`` are
    indexed. A run whose ``meta.json`` cannot be parsed is skipped with a
    message instead of aborting the whole rebuild, and a missing runs
    directory yields an empty index.

    Returns:
        The number of runs written to the index.
    """
    index_path = RESULTS_DIR / "index.jsonl"
    runs_dir = RESULTS_DIR / "runs"
    run_dirs = sorted(runs_dir.iterdir()) if runs_dir.is_dir() else []

    index_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    # Mode "w" truncates any previous index, so no explicit unlink is needed.
    with open(index_path, "w", encoding="utf-8") as f:
        for run_dir in run_dirs:
            meta_path = run_dir / "meta.json"
            eval_path = run_dir / "eval_results.json"
            if not (meta_path.exists() and eval_path.exists()):
                continue
            try:
                meta = _read_json_object(meta_path)
            except (ValueError, OSError) as exc:
                print(f"   SKIP: {run_dir.name[:60]}... ({exc})")
                continue
            entry = {
                "run_id": meta.get("run_id", run_dir.name),
                "task": meta.get("task"),
                "model": meta.get("model"),
                "cell_id": meta.get("cell_id"),
                "completed_at": meta.get("completed_at"),
            }
            f.write(json.dumps(entry) + "\n")
            count += 1

    return count


def _count_runs_by_model() -> Dict[str, int]:
    """Count remaining run directories per model name from their meta.json.

    Runs with a missing or null model are counted under "?"; runs whose
    meta.json is absent or unreadable are not counted.
    """
    runs_dir = RESULTS_DIR / "runs"
    models: Dict[str, int] = {}
    if not runs_dir.is_dir():
        return models

    for run_dir in runs_dir.iterdir():
        meta_path = run_dir / "meta.json"
        if not (run_dir.is_dir() and meta_path.exists()):
            continue
        try:
            model = _read_json_object(meta_path).get("model")
        except (ValueError, OSError):
            continue
        # Keys must all be strings so the counts can be sorted for display.
        key = "?" if model is None else str(model)
        models[key] = models.get(key, 0) + 1
    return models


def run_analysis():
    """Run main effects analysis for all metrics.

    Writes one ``main_effects_<metric>.json`` file per metric into
    ``results/analysis``, creating that directory if needed.
    """
    from experiment_design import analyze_main_effects

    analysis_dir = RESULTS_DIR / "analysis"
    analysis_dir.mkdir(parents=True, exist_ok=True)

    metrics = [
        "score", "cost", "turns", "wall_time",
        "gameplay", "code_quality", "structural", "transcript",
    ]
    for metric in metrics:
        effects = analyze_main_effects(str(RESULTS_DIR), metric)
        (analysis_dir / f"main_effects_{metric}.json").write_text(
            json.dumps(effects, indent=2)
        )
    print(f"   Analysis updated for {len(metrics)} metrics")


def _positive_int(text: str) -> int:
    """argparse type: parse *text* as an integer >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"not an integer: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
    return value


def _parse_args(argv=None) -> argparse.Namespace:
    """Parse the command line; ``jobs`` defaults to 4 and ``push`` to True."""
    parser = argparse.ArgumentParser(
        description="Clean bad runs, re-evaluate, analyze, commit and push.",
    )
    parser.add_argument(
        "-j", dest="jobs", type=_positive_int, default=4, metavar="N",
        help="number of parallel re-evaluation workers (default: 4)",
    )
    parser.add_argument(
        "--no-push", dest="push", action="store_false",
        help="commit locally only",
    )
    # Unknown options are rejected: silently ignoring a mistyped --no-push
    # would push results the user meant to keep local.
    return parser.parse_args(argv)


def _git(*args: str) -> subprocess.CompletedProcess:
    """Run ``git <args>`` in the project directory, capturing text output."""
    return subprocess.run(
        ["git", *args],
        cwd=str(PROJECT_DIR), capture_output=True, text=True,
    )


def main(argv=None):
    """Run the full pipeline: clean, index, re-evaluate, analyze, commit, push.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.
    """
    opts = _parse_args(argv)

    print("=" * 50)
    print("Clean and Re-evaluate")
    print("=" * 50)

    # Step 1: Clean
    print("\n1. Cleaning bad/incomplete runs...")
    deleted = clean_bad_runs()
    print(f"   Deleted {deleted} runs")

    # Step 2: Rebuild index
    print("\n2. Rebuilding index...")
    count = rebuild_index()
    print(f"   {count} valid runs indexed")

    # Step 3: Count by model
    models = _count_runs_by_model()
    for model, n in sorted(models.items()):
        print(f"   {model}: {n}")

    # Step 4: Re-evaluate
    print(f"\n3. Re-evaluating all runs (parallel={opts.jobs})...")
    reeval_result = subprocess.run(
        # sys.executable keeps the child on the same interpreter/venv.
        [sys.executable, str(PROJECT_DIR / "harness" / "reeval.py"),
         str(RESULTS_DIR), "-j", str(opts.jobs)],
        cwd=str(PROJECT_DIR),
    )
    if reeval_result.returncode != 0:
        print("   WARNING: Re-evaluation had errors")

    # Step 5: Analysis
    print("\n4. Running analysis...")
    run_analysis()

    # Step 6: Commit and push
    print("\n5. Committing results...")
    added = _git("add", "-A", "results/", "artifacts/")
    if added.returncode != 0:
        # e.g. a pathspec that matches nothing makes git stage nothing at all.
        print(f"   WARNING: git add failed: {added.stderr.strip()}")

    total = sum(models.values())
    msg = f"Re-eval {total} runs ({', '.join(f'{n} {m}' for m, n in sorted(models.items()))})"
    committed = _git("commit", "-m", msg)
    if committed.returncode != 0:
        # git reports "nothing to commit" on stdout, real errors on stderr.
        detail = (committed.stderr or committed.stdout).strip().splitlines()
        print(f"   Nothing committed: {detail[-1] if detail else 'git commit failed'}")

    if opts.push:
        pushed = _git("push")
        if pushed.returncode == 0:
            print("   Pushed.")
        else:
            print(f"   Push failed: {pushed.stderr.strip()}")
    elif committed.returncode == 0:
        print("   Committed locally (--no-push)")

    print("\nDone.")


if __name__ == "__main__":
    main()

Impressum · Datenschutz