commit c025b7ca4c96967759955589f80afbfe941906ab
parent c05152b062d03bc89f211f476b00aebcdd9b6d00
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 10:46:28 +0200
Add --reeval, --analyze, --full-pipeline flags to harness
Full automated pipeline:
python3 harness/run.py grid.yaml main_effects -j 6 --full-pipeline
This runs the sweep, then:
1. Re-evaluates all runs with latest eval scripts
2. Runs main effects analysis for score, cost, turns, wall_time,
gameplay, and code_quality, and saves each result to results/analysis/main_effects_<metric>.json
3. Commits and pushes everything
Individual flags:
--reeval Re-evaluate only (no sweep)
--analyze Run analysis only (no sweep)
--full-pipeline Re-evaluate and analyze after the sweep completes
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
| M | harness/run.py | | | 50 | +++++++++++++++++++++++++++++++++++++++++++++++++- |
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/harness/run.py b/harness/run.py
@@ -13,6 +13,10 @@ Usage:
- interaction_hunt:axis1,axis2,axis3
-j N: run N experiments in parallel (default 1)
+ --model MODEL: set baseline model for main_effects sweep
+ --reeval: re-evaluate all existing runs with latest eval scripts
+ --analyze: run analysis and save results to results/analysis/
+ --full-pipeline: reeval + analyze after sweep completes
"""
import json
@@ -592,6 +596,9 @@ def main():
grid_file = str(PROJECT_DIR / "grid.yaml")
profile = "smoke"
+ do_reeval = False
+ do_analyze = False
+
i = 0
positional = []
while i < len(args):
@@ -601,6 +608,16 @@ def main():
elif args[i] == "--model" and i + 1 < len(args):
baseline_model = args[i + 1]
i += 2
+ elif args[i] == "--reeval":
+ do_reeval = True
+ i += 1
+ elif args[i] == "--analyze":
+ do_analyze = True
+ i += 1
+ elif args[i] == "--full-pipeline":
+ do_reeval = True
+ do_analyze = True
+ i += 1
else:
positional.append(args[i])
i += 1
@@ -732,6 +749,37 @@ def main():
print(f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}")
print("=" * 40)
+ # Re-evaluate all runs with latest eval scripts
+ if do_reeval or (completed > 0 and do_analyze):
+ print()
+ print("Re-evaluating all runs with latest eval scripts...")
+ reeval_result = subprocess.run(
+ ["python3", str(SCRIPT_DIR / "reeval.py"), str(results_dir), "-j", str(max(parallel, 4))],
+ cwd=str(PROJECT_DIR),
+ )
+ if reeval_result.returncode == 0:
+ print("Re-evaluation complete.")
+ else:
+ print("Re-evaluation had errors (continuing).")
+
+ # Run analysis and save results
+ if do_analyze:
+ print()
+ print("Running analysis...")
+ analysis_dir = results_dir / "analysis"
+ analysis_dir.mkdir(exist_ok=True)
+
+ metrics = ["score", "cost", "turns", "wall_time", "gameplay", "code_quality"]
+ for metric in metrics:
+ try:
+ effects = analyze_main_effects(str(results_dir), metric)
+ (analysis_dir / f"main_effects_{metric}.json").write_text(
+ json.dumps(effects, indent=2)
+ )
+ print(f" Saved main_effects_{metric}.json")
+ except Exception as e:
+ print(f" Error analyzing {metric}: {e}")
+
# Auto-commit and push results
if completed > 0:
print()
@@ -739,7 +787,7 @@ def main():
try:
subprocess.run(
["git", "add", "results/", "dashboard/public/artifacts/"],
- cwd=str(PROJECT_DIR), capture_output=True,
+ cwd=str(PROJECT_DIR), capture_output=True, timeout=30,
)
total_runs = len(list((results_dir / "runs").iterdir()))
msg = (