Add --reeval, --analyze, --full-pipeline flags to harness - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit c025b7ca4c96967759955589f80afbfe941906ab
parent c05152b062d03bc89f211f476b00aebcdd9b6d00
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 10:46:28 +0200

Add --reeval, --analyze, --full-pipeline flags to harness

Full automated pipeline:
  python3 harness/run.py grid.yaml main_effects -j 6 --full-pipeline

This runs the sweep, then:
1. Re-evaluates all runs with the latest eval scripts
2. Runs main-effects analysis for score, cost, turns, wall_time,
   gameplay, and code_quality, and saves one JSON file per metric
   to results/analysis/
3. Commits and pushes everything

Individual flags:
  --reeval     Re-evaluate only (no sweep)
  --analyze    Run analysis only (no sweep)
  --full-pipeline  Re-evaluate and analyze after the sweep completes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-

1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/harness/run.py b/harness/run.py
@@ -13,6 +13,10 @@ Usage:
       - interaction_hunt:axis1,axis2,axis3
 
     -j N: run N experiments in parallel (default 1)
+    --model MODEL: set baseline model for main_effects sweep
+    --reeval: re-evaluate all existing runs with latest eval scripts
+    --analyze: run analysis and save results to results/analysis/
+    --full-pipeline: reeval + analyze after sweep completes
 """
 
 import json
@@ -592,6 +596,9 @@ def main():
     grid_file = str(PROJECT_DIR / "grid.yaml")
     profile = "smoke"
 
+    do_reeval = False
+    do_analyze = False
+
     i = 0
     positional = []
     while i < len(args):
@@ -601,6 +608,16 @@ def main():
         elif args[i] == "--model" and i + 1 < len(args):
             baseline_model = args[i + 1]
             i += 2
+        elif args[i] == "--reeval":
+            do_reeval = True
+            i += 1
+        elif args[i] == "--analyze":
+            do_analyze = True
+            i += 1
+        elif args[i] == "--full-pipeline":
+            do_reeval = True
+            do_analyze = True
+            i += 1
         else:
             positional.append(args[i])
             i += 1
@@ -732,6 +749,37 @@ def main():
     print(f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}")
     print("=" * 40)
 
+    # Re-evaluate all runs with latest eval scripts
+    if do_reeval or (completed > 0 and do_analyze):
+        print()
+        print("Re-evaluating all runs with latest eval scripts...")
+        reeval_result = subprocess.run(
+            ["python3", str(SCRIPT_DIR / "reeval.py"), str(results_dir), "-j", str(max(parallel, 4))],
+            cwd=str(PROJECT_DIR),
+        )
+        if reeval_result.returncode == 0:
+            print("Re-evaluation complete.")
+        else:
+            print("Re-evaluation had errors (continuing).")
+
+    # Run analysis and save results
+    if do_analyze:
+        print()
+        print("Running analysis...")
+        analysis_dir = results_dir / "analysis"
+        analysis_dir.mkdir(exist_ok=True)
+
+        metrics = ["score", "cost", "turns", "wall_time", "gameplay", "code_quality"]
+        for metric in metrics:
+            try:
+                effects = analyze_main_effects(str(results_dir), metric)
+                (analysis_dir / f"main_effects_{metric}.json").write_text(
+                    json.dumps(effects, indent=2)
+                )
+                print(f"  Saved main_effects_{metric}.json")
+            except Exception as e:
+                print(f"  Error analyzing {metric}: {e}")
+
     # Auto-commit and push results
     if completed > 0:
         print()
@@ -739,7 +787,7 @@ def main():
         try:
             subprocess.run(
                 ["git", "add", "results/", "dashboard/public/artifacts/"],
-                cwd=str(PROJECT_DIR), capture_output=True,
+                cwd=str(PROJECT_DIR), capture_output=True, timeout=30,
             )
             total_runs = len(list((results_dir / "runs").iterdir()))
             msg = (

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README