commit 64c6b20e80b1e88b3b7649aa8941428f92699ab1
parent a17094f75481fdd28a4029943e449f81dcce0273
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 10:30:17 +0200
Add clean-and-reeval command
Single command to clean bad runs, re-evaluate, analyze, commit, push:
python3 harness/clean-and-reeval.py -j 4
Flags: -j N (parallel), --no-push (commit locally only)
Steps: delete bad runs -> rebuild index -> reeval all -> analyze -> commit -> push
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 216 insertions(+), 0 deletions(-)
diff --git a/harness/clean-and-reeval.py b/harness/clean-and-reeval.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Clean bad runs, re-evaluate everything, run analysis, commit and push.
+
+Usage:
+ python3 harness/clean-and-reeval.py [-j N] [--no-push]
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+# Repository root: this script sits in <root>/harness/, hence two parents up.
+PROJECT_DIR = Path(__file__).resolve().parent.parent
+# Root for run data; the functions below use runs/, index.jsonl and analysis/ under it.
+RESULTS_DIR = PROJECT_DIR / "results"
+# Artifact root; each run's artifacts live in a subdirectory named after its run directory.
+ARTIFACTS_DIR = PROJECT_DIR / "artifacts"
+
+# Put harness/lib first on the import path (presumably where the
+# experiment_design module imported by run_analysis lives — confirm).
+sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))
+
+
+def _bad_run_reason(run_dir, artifact_dir):
+    """Return why a run must be deleted, or "" when the run looks valid.
+
+    Args:
+        run_dir: Path of the run directory under results/runs.
+        artifact_dir: Path of the matching directory under artifacts/.
+
+    Returns:
+        A short human-readable reason string for a bad run, "" otherwise.
+    """
+    output_path = run_dir / "claude_output.json"
+    meta_path = run_dir / "meta.json"
+    if not output_path.exists() or not meta_path.exists():
+        return "missing output or meta"
+
+    try:
+        output = json.loads(output_path.read_text())
+        meta = json.loads(meta_path.read_text())
+    except (ValueError, OSError) as e:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        return str(e) or type(e).__name__
+
+    # Valid JSON that is not an object (e.g. a list) has no .get(); treat it
+    # as a corrupt run instead of crashing the whole cleanup.
+    if not isinstance(output, dict) or not isinstance(meta, dict):
+        return "output or meta is not a JSON object"
+
+    cost = output.get("total_cost_usd")
+    turns = output.get("num_turns")
+    if cost in (None, 0):
+        return f"cost={cost}"
+    if turns in (None, 1):
+        return f"turns={turns}"
+    if meta.get("exit_code") == 124:
+        return "timeout"
+    if "Invalid API key" in str(output.get("result", "")):
+        return "invalid API key"
+
+    # No HTML = game wasn't built
+    if not artifact_dir.exists():
+        return "no artifact directory"
+    try:
+        # Compare path components *relative to the artifact directory*: a
+        # substring test on the absolute path would reject every file when the
+        # checkout itself lives under a path containing "node_modules".
+        has_html = any(
+            "node_modules" not in html.relative_to(artifact_dir).parts
+            for html in artifact_dir.rglob("*.html")
+        )
+    except OSError as e:
+        return str(e) or type(e).__name__
+    if not has_html:
+        return "no HTML files"
+
+    return ""
+
+
+def clean_bad_runs():
+    """Delete runs that are invalid, incomplete, or have no HTML output.
+
+    Every directory under results/runs is checked; a run is bad when its
+    claude_output.json or meta.json is missing or unreadable, when it reports
+    no cost, a missing or single turn, exit code 124 or an "Invalid API key"
+    result, or when its artifact directory is absent or contains no HTML file
+    outside node_modules. Bad runs are removed together with their artifact
+    directory. Removal is best-effort: filesystem errors are ignored.
+
+    Returns:
+        The number of runs that were flagged and deleted (0 when the runs
+        directory does not exist).
+    """
+    runs_dir = RESULTS_DIR / "runs"
+    if not runs_dir.is_dir():
+        return 0
+
+    deleted = 0
+    for run_dir in sorted(runs_dir.iterdir()):
+        if not run_dir.is_dir():
+            continue
+
+        artifact_dir = ARTIFACTS_DIR / run_dir.name
+        reason = _bad_run_reason(run_dir, artifact_dir)
+        if not reason:
+            continue
+
+        print(f" DELETE: {run_dir.name[:60]}... ({reason})")
+        shutil.rmtree(run_dir, ignore_errors=True)
+        if artifact_dir.exists():
+            shutil.rmtree(artifact_dir, ignore_errors=True)
+        deleted += 1
+
+    return deleted
+
+
+def rebuild_index():
+    """Rebuild results/index.jsonl from the runs that remain on disk.
+
+    One JSON line is written per run directory that contains both meta.json
+    and eval_results.json; the line carries run_id (falling back to the
+    directory name), task, model, cell_id and completed_at from meta.json.
+    Runs whose meta.json cannot be read or is not a JSON object are skipped
+    with a message rather than aborting the rebuild.
+
+    Returns:
+        The number of entries written to the index.
+    """
+    index_path = RESULTS_DIR / "index.jsonl"
+    runs_dir = RESULTS_DIR / "runs"
+
+    # A missing runs directory means "no runs", matching clean_bad_runs, which
+    # returns 0 in that case. Listing before opening the index also keeps the
+    # old index intact if the listing itself fails.
+    run_dirs = sorted(runs_dir.iterdir()) if runs_dir.is_dir() else []
+
+    count = 0
+    # Mode "w" truncates any previous index, so no separate unlink is needed.
+    with open(index_path, "w") as f:
+        for run_dir in run_dirs:
+            meta_path = run_dir / "meta.json"
+            eval_path = run_dir / "eval_results.json"
+            if not (meta_path.exists() and eval_path.exists()):
+                continue
+
+            try:
+                meta = json.loads(meta_path.read_text())
+            except (ValueError, OSError) as e:
+                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+                print(f" SKIP: {run_dir.name[:60]}... (unreadable meta.json: {e})")
+                continue
+            if not isinstance(meta, dict):
+                print(f" SKIP: {run_dir.name[:60]}... (meta.json is not a JSON object)")
+                continue
+
+            entry = {
+                "run_id": meta.get("run_id", run_dir.name),
+                "task": meta.get("task"),
+                "model": meta.get("model"),
+                "cell_id": meta.get("cell_id"),
+                "completed_at": meta.get("completed_at"),
+            }
+            f.write(json.dumps(entry) + "\n")
+            count += 1
+
+    return count
+
+
+def run_analysis():
+    """Compute the main-effects analysis for every metric and save it as JSON.
+
+    For each metric name, analyze_main_effects is called with the results
+    directory (as a string) and the metric, and its return value is written,
+    indented, to results/analysis/main_effects_<metric>.json. A one-line
+    summary is printed at the end.
+    """
+    # Imported lazily; presumably resolved through the harness/lib entry that
+    # the module adds to sys.path — confirm.
+    from experiment_design import analyze_main_effects
+
+    out_dir = RESULTS_DIR / "analysis"
+    out_dir.mkdir(exist_ok=True)
+
+    metrics = [
+        "score", "cost", "turns", "wall_time",
+        "gameplay", "code_quality", "structural", "transcript",
+    ]
+    results_root = str(RESULTS_DIR)
+    for name in metrics:
+        target = out_dir / f"main_effects_{name}.json"
+        payload = json.dumps(analyze_main_effects(results_root, name), indent=2)
+        target.write_text(payload)
+
+    print(f" Analysis updated for {len(metrics)} metrics")
+
+
+def _parse_cli(argv):
+    """Parse the command line into (parallel, do_push).
+
+    Accepts ``-j N`` (positive integer, default 4) and ``--no-push``.
+    Unrecognized arguments are reported and otherwise ignored.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        prog="clean-and-reeval.py",
+        description="Clean bad runs, re-evaluate everything, run analysis, "
+                    "commit and push.",
+    )
+    parser.add_argument(
+        "-j", dest="parallel", type=int, default=4, metavar="N",
+        help="number of parallel re-evaluation jobs (default: 4)",
+    )
+    parser.add_argument(
+        "--no-push", dest="do_push", action="store_false",
+        help="commit locally only, do not push",
+    )
+    opts, unknown = parser.parse_known_args(argv)
+    if opts.parallel < 1:
+        parser.error("-j expects a positive integer")
+    if unknown:
+        print(f" WARNING: ignoring unrecognized arguments: {' '.join(unknown)}")
+    return opts.parallel, opts.do_push
+
+
+def _count_runs_by_model(runs_dir):
+    """Return a {model: run count} dict for the run directories in runs_dir.
+
+    Only directories containing meta.json are counted. Runs whose metadata is
+    unreadable or has no model are counted under "?".
+    """
+    models: dict[str, int] = {}
+    if not runs_dir.is_dir():
+        return models
+
+    for run_dir in runs_dir.iterdir():
+        meta_path = run_dir / "meta.json"
+        if not run_dir.is_dir() or not meta_path.exists():
+            continue
+        try:
+            meta = json.loads(meta_path.read_text())
+        except (ValueError, OSError):
+            meta = None
+        model = meta.get("model") if isinstance(meta, dict) else None
+        # Always use string keys so sorting never compares None with str.
+        key = "?" if model is None else str(model)
+        models[key] = models.get(key, 0) + 1
+
+    return models
+
+
+def _git(*git_args):
+    """Run a git command in the project root and return the completed process."""
+    return subprocess.run(
+        ["git", *git_args],
+        cwd=str(PROJECT_DIR), capture_output=True, text=True,
+    )
+
+
+def main():
+    """Run the whole pipeline: clean, index, re-evaluate, analyze, commit, push.
+
+    Command-line flags are read from sys.argv: ``-j N`` sets the parallelism
+    passed to harness/reeval.py, ``--no-push`` keeps the commit local.
+    A failing re-evaluation only produces a warning; analysis and commit
+    still run. Failures of git add / commit / push are reported on stdout.
+    """
+    parallel, do_push = _parse_cli(sys.argv[1:])
+
+    print("=" * 50)
+    print("Clean and Re-evaluate")
+    print("=" * 50)
+
+    # Step 1: Clean
+    print("\n1. Cleaning bad/incomplete runs...")
+    deleted = clean_bad_runs()
+    print(f" Deleted {deleted} runs")
+
+    # Step 2: Rebuild index
+    # NOTE(review): the index only lists runs that already have
+    # eval_results.json, and it is rebuilt *before* re-evaluation. Runs that
+    # get their first eval results in step 3 are therefore missing from it
+    # unless reeval.py maintains the index itself — confirm.
+    print("\n2. Rebuilding index...")
+    count = rebuild_index()
+    print(f" {count} valid runs indexed")
+
+    # Per-model run counts (printed here, reused for the commit message).
+    models = _count_runs_by_model(RESULTS_DIR / "runs")
+    for model, n in sorted(models.items()):
+        print(f" {model}: {n}")
+
+    # Step 3: Re-evaluate
+    print(f"\n3. Re-evaluating all runs (parallel={parallel})...")
+    reeval_result = subprocess.run(
+        # Use the interpreter running this script so the child sees the same
+        # environment; fall back to PATH lookup if it is unknown.
+        [sys.executable or "python3", str(PROJECT_DIR / "harness" / "reeval.py"),
+         str(RESULTS_DIR), "-j", str(parallel)],
+        cwd=str(PROJECT_DIR),
+    )
+    if reeval_result.returncode != 0:
+        print(" WARNING: Re-evaluation had errors")
+
+    # Step 4: Analysis
+    print("\n4. Running analysis...")
+    run_analysis()
+
+    # Step 5: Commit and push
+    print("\n5. Committing results...")
+    add = _git("add", "-A", "results/", "artifacts/")
+    if add.returncode != 0:
+        # Stop here: committing after a failed add could commit unrelated
+        # changes that happened to be staged already.
+        print(f" git add failed: {add.stderr.strip()}")
+        return
+
+    total = sum(models.values())
+    msg = f"Re-eval {total} runs ({', '.join(f'{n} {m}' for m, n in sorted(models.items()))})"
+    commit = _git("commit", "-m", msg)
+    committed = commit.returncode == 0
+    if not committed:
+        # git reports "nothing to commit" on stdout and real errors on stderr.
+        detail = (commit.stderr or commit.stdout).strip()
+        last_line = detail.splitlines()[-1] if detail else "unknown error"
+        print(f" Nothing committed: {last_line}")
+
+    if do_push:
+        result = _git("push")
+        if result.returncode == 0:
+            print(" Pushed.")
+        else:
+            print(f" Push failed: {result.stderr.strip()}")
+    elif committed:
+        print(" Committed locally (--no-push)")
+
+    print("\nDone.")
+
+
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ main()