Add clean-and-reeval command - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 64c6b20e80b1e88b3b7649aa8941428f92699ab1
parent a17094f75481fdd28a4029943e449f81dcce0273
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 10:30:17 +0200

Add clean-and-reeval command

Single command to clean bad runs, re-evaluate, analyze, commit, push:
  python3 harness/clean-and-reeval.py -j 4

Flags: -j N (parallel), --no-push (commit locally only)

Steps: delete bad runs -> rebuild index -> reeval all -> analyze -> commit -> push

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A harness/clean-and-reeval.py  | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 216 insertions(+), 0 deletions(-)
diff --git a/harness/clean-and-reeval.py b/harness/clean-and-reeval.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Clean bad runs, re-evaluate everything, run analysis, commit and push.
+
+Usage:
+    python3 harness/clean-and-reeval.py [-j N] [--no-push]
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+PROJECT_DIR = Path(__file__).resolve().parent.parent  # repo root: two levels above this file (which the diff places in harness/)
+RESULTS_DIR = PROJECT_DIR / "results"  # holds runs/, index.jsonl and analysis/ as used by the functions below
+ARTIFACTS_DIR = PROJECT_DIR / "artifacts"  # one sub-directory per run, named like the run directory
+
+sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))  # lets run_analysis() import experiment_design (presumably located in harness/lib -- confirm)
+
+
+def _bad_run_reason(run_dir, artifact_dir):
+    """Return a short reason why a run should be deleted, or None if it looks valid.
+
+    Checks are applied in order and the first failure wins:
+    missing/unreadable ``claude_output.json`` or ``meta.json``, zero or
+    missing cost, missing or single-turn ``num_turns``, exit code 124
+    (reported as a timeout), an "Invalid API key" result, and finally a
+    missing artifact directory or one without any HTML file.
+    """
+    output_path = run_dir / "claude_output.json"
+    meta_path = run_dir / "meta.json"
+    if not output_path.exists() or not meta_path.exists():
+        return "missing output or meta"
+
+    try:
+        output = json.loads(output_path.read_text())
+        meta = json.loads(meta_path.read_text())
+    except (ValueError, OSError) as e:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        return str(e) or type(e).__name__
+    if not isinstance(output, dict) or not isinstance(meta, dict):
+        # Valid JSON that is not an object would otherwise crash on .get().
+        return "output or meta is not a JSON object"
+
+    cost = output.get("total_cost_usd")
+    turns = output.get("num_turns")
+    if cost in (None, 0):
+        return f"cost={cost}"
+    if turns in (None, 1):
+        return f"turns={turns}"
+    if meta.get("exit_code") == 124:
+        return "timeout"
+    if "Invalid API key" in str(output.get("result", "")):
+        return "invalid API key"
+
+    if not artifact_dir.exists():
+        return "no artifact directory"
+    try:
+        # Match "node_modules" as a path component *below* the artifact
+        # directory, so look-alike names and parent directories of the
+        # checkout cannot hide every HTML file.
+        has_html = any(
+            "node_modules" not in html.relative_to(artifact_dir).parts
+            for html in artifact_dir.rglob("*.html")
+        )
+    except OSError as e:
+        return str(e) or type(e).__name__
+    # No HTML = game wasn't built
+    if not has_html:
+        return "no HTML files"
+    return None
+
+
+def clean_bad_runs():
+    """Delete runs that are invalid, incomplete, or have no HTML output.
+
+    Every sub-directory of ``results/runs`` is classified with
+    ``_bad_run_reason``; bad runs are removed together with their
+    ``artifacts/<run name>`` directory.
+
+    Returns:
+        The number of run directories that were actually removed. A run
+        whose directory is still present after the removal attempt is
+        reported with a warning and not counted.
+    """
+    runs_dir = RESULTS_DIR / "runs"
+    if not runs_dir.exists():
+        return 0
+
+    deleted = 0
+    for run_dir in sorted(runs_dir.iterdir()):
+        if not run_dir.is_dir():
+            continue
+
+        artifact_dir = ARTIFACTS_DIR / run_dir.name
+        reason = _bad_run_reason(run_dir, artifact_dir)
+        if reason is None:
+            continue
+
+        print(f"  DELETE: {run_dir.name[:60]}... ({reason})")
+        shutil.rmtree(run_dir, ignore_errors=True)
+        if artifact_dir.exists():
+            shutil.rmtree(artifact_dir, ignore_errors=True)
+
+        # rmtree(ignore_errors=True) fails silently; verify before counting.
+        if run_dir.exists():
+            print(f"  WARNING: could not fully remove {run_dir.name[:60]}...")
+        else:
+            deleted += 1
+
+    return deleted
+
+
+def rebuild_index():
+    """Rebuild results/index.jsonl from remaining runs.
+
+    A run is indexed when its directory contains both ``meta.json`` and
+    ``eval_results.json``. Each index line is a JSON object with the keys
+    ``run_id`` (falling back to the directory name), ``task``, ``model``,
+    ``cell_id`` and ``completed_at`` taken from ``meta.json``.
+
+    Runs whose ``meta.json`` cannot be read or is not a JSON object are
+    skipped with a message instead of aborting the rebuild. A missing
+    ``results/runs`` directory yields an empty index.
+
+    Returns:
+        The number of runs written to the index.
+    """
+    index_path = RESULTS_DIR / "index.jsonl"
+    runs_dir = RESULTS_DIR / "runs"
+
+    # A missing runs directory simply means there is nothing to index.
+    run_dirs = sorted(runs_dir.iterdir()) if runs_dir.is_dir() else []
+
+    lines = []
+    for run_dir in run_dirs:
+        meta_path = run_dir / "meta.json"
+        eval_path = run_dir / "eval_results.json"
+        if not (meta_path.exists() and eval_path.exists()):
+            continue
+
+        try:
+            meta = json.loads(meta_path.read_text())
+        except (ValueError, OSError) as e:
+            print(f"  SKIP: {run_dir.name[:60]}... (unreadable meta.json: {e})")
+            continue
+        if not isinstance(meta, dict):
+            print(f"  SKIP: {run_dir.name[:60]}... (meta.json is not a JSON object)")
+            continue
+
+        entry = {
+            "run_id": meta.get("run_id", run_dir.name),
+            "task": meta.get("task"),
+            "model": meta.get("model"),
+            "cell_id": meta.get("cell_id"),
+            "completed_at": meta.get("completed_at"),
+        }
+        lines.append(json.dumps(entry) + "\n")
+
+    # Replace the old index only after every entry has been built, so a
+    # failure while reading runs cannot leave a truncated index behind.
+    index_path.parent.mkdir(parents=True, exist_ok=True)
+    index_path.write_text("".join(lines))
+
+    return len(lines)
+
+
+def run_analysis():
+    """Compute main effects for each metric and store them as JSON.
+
+    For every metric name, ``analyze_main_effects`` is called with the
+    results directory and the metric, and its return value is written to
+    ``results/analysis/main_effects_<metric>.json`` (indented by 2).
+    """
+    # Imported lazily: the module is resolved through the sys.path entry
+    # added at import time of this script.
+    from experiment_design import analyze_main_effects
+
+    out_dir = RESULTS_DIR / "analysis"
+    out_dir.mkdir(exist_ok=True)
+
+    metric_names = (
+        "score",
+        "cost",
+        "turns",
+        "wall_time",
+        "gameplay",
+        "code_quality",
+        "structural",
+        "transcript",
+    )
+    results_root = str(RESULTS_DIR)
+    for name in metric_names:
+        target = out_dir / f"main_effects_{name}.json"
+        payload = json.dumps(analyze_main_effects(results_root, name), indent=2)
+        target.write_text(payload)
+
+    print(f"  Analysis updated for {len(metric_names)} metrics")
+
+
+def _count_runs_by_model(runs_dir):
+    """Return {model name: run count} for run directories with a readable meta.json."""
+    counts: dict[str, int] = {}
+    if not runs_dir.is_dir():
+        return counts
+    for run_dir in runs_dir.iterdir():
+        meta_path = run_dir / "meta.json"
+        if not run_dir.is_dir() or not meta_path.exists():
+            continue
+        try:
+            meta = json.loads(meta_path.read_text())
+        except (ValueError, OSError):
+            continue
+        model = meta.get("model") if isinstance(meta, dict) else None
+        # Bucket a missing or null model under "?" so all keys are strings
+        # and the sorted() calls below cannot hit a None/str comparison.
+        key = "?" if model is None else str(model)
+        counts[key] = counts.get(key, 0) + 1
+    return counts
+
+
+def _git(*git_args):
+    """Run a git command in the project root and return the completed process (text output captured)."""
+    return subprocess.run(
+        ["git", *git_args],
+        cwd=str(PROJECT_DIR), capture_output=True, text=True,
+    )
+
+
+def main():
+    """Run the whole pipeline: clean, index, re-evaluate, analyze, commit, push.
+
+    Command-line flags (read from ``sys.argv``):
+        -j N        parallelism passed to harness/reeval.py (default 4)
+        --no-push   commit locally but do not run ``git push``
+
+    Unknown arguments are ignored with a warning. A failing re-evaluation,
+    ``git add``, ``git commit`` or ``git push`` is reported but does not
+    abort the remaining steps.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Clean bad runs, re-evaluate everything, run analysis, "
+                    "commit and push.",
+    )
+    parser.add_argument(
+        "-j", dest="parallel", type=int, default=4, metavar="N",
+        help="number of parallel re-evaluation jobs (default: 4)",
+    )
+    parser.add_argument(
+        "--no-push", dest="do_push", action="store_false",
+        help="commit locally only, do not push",
+    )
+    # parse_known_args keeps the previous tolerance for unrecognised arguments.
+    opts, ignored = parser.parse_known_args(sys.argv[1:])
+    parallel = opts.parallel
+    do_push = opts.do_push
+
+    print("=" * 50)
+    print("Clean and Re-evaluate")
+    print("=" * 50)
+    if ignored:
+        print(f"WARNING: ignoring unknown arguments: {' '.join(ignored)}")
+
+    # Step 1: Clean
+    print("\n1. Cleaning bad/incomplete runs...")
+    deleted = clean_bad_runs()
+    print(f"   Deleted {deleted} runs")
+
+    # Step 2: Rebuild index
+    print("\n2. Rebuilding index...")
+    count = rebuild_index()
+    print(f"   {count} valid runs indexed")
+
+    # Step 3: Count by model
+    models = _count_runs_by_model(RESULTS_DIR / "runs")
+    for model, n in sorted(models.items()):
+        print(f"   {model}: {n}")
+
+    # Step 4: Re-evaluate
+    print(f"\n3. Re-evaluating all runs (parallel={parallel})...")
+    reeval_result = subprocess.run(
+        # Use the interpreter running this script so the child process sees
+        # the same environment; fall back to "python3" if it is unknown.
+        [sys.executable or "python3",
+         str(PROJECT_DIR / "harness" / "reeval.py"),
+         str(RESULTS_DIR), "-j", str(parallel)],
+        cwd=str(PROJECT_DIR),
+    )
+    if reeval_result.returncode != 0:
+        print("   WARNING: Re-evaluation had errors")
+
+    # Step 5: Analysis
+    print("\n4. Running analysis...")
+    run_analysis()
+
+    # Step 6: Commit and push
+    print("\n5. Committing results...")
+    add_result = _git("add", "-A", "results/", "artifacts/")
+    if add_result.returncode != 0:
+        # e.g. a pathspec that matches nothing makes git stage nothing at all.
+        print(f"   WARNING: git add failed: {add_result.stderr.strip()}")
+
+    total = sum(models.values())
+    breakdown = ", ".join(f"{n} {m}" for m, n in sorted(models.items()))
+    msg = f"Re-eval {total} runs ({breakdown})"
+    commit_result = _git("commit", "-m", msg)
+    committed = commit_result.returncode == 0
+    if not committed:
+        # git reports "nothing to commit" on stdout, real errors on stderr.
+        detail = (commit_result.stderr or commit_result.stdout).strip()
+        last_line = detail.splitlines()[-1] if detail else "unknown error"
+        print(f"   Nothing committed: {last_line}")
+
+    if do_push:
+        result = _git("push")
+        if result.returncode == 0:
+            print("   Pushed.")
+        else:
+            print(f"   Push failed: {result.stderr.strip()}")
+    elif committed:
+        print("   Committed locally (--no-push)")
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script, not on import
+    main()

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README