loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit c025b7ca4c96967759955589f80afbfe941906ab
parent c05152b062d03bc89f211f476b00aebcdd9b6d00
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 10:46:28 +0200

Add --reeval, --analyze, --full-pipeline flags to harness

Full automated pipeline:
  python3 harness/run.py grid.yaml main_effects -j 6 --full-pipeline

This runs the sweep, then:
1. Re-evaluates all runs with latest eval scripts
2. Runs main effects analysis for score, cost, turns, wall_time,
   gameplay, code_quality and saves to results/analysis/
3. Commits and pushes everything

Individual flags:
  --reeval     Re-evaluate only (no sweep)
  --analyze    Run analysis only (no sweep)
  --full-pipeline  Run both re-evaluation and analysis after the sweep completes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/harness/run.py b/harness/run.py @@ -13,6 +13,10 @@ Usage: - interaction_hunt:axis1,axis2,axis3 -j N: run N experiments in parallel (default 1) + --model MODEL: set baseline model for main_effects sweep + --reeval: re-evaluate all existing runs with latest eval scripts + --analyze: run analysis and save results to results/analysis/ + --full-pipeline: reeval + analyze after sweep completes """ import json @@ -592,6 +596,9 @@ def main(): grid_file = str(PROJECT_DIR / "grid.yaml") profile = "smoke" + do_reeval = False + do_analyze = False + i = 0 positional = [] while i < len(args): @@ -601,6 +608,16 @@ def main(): elif args[i] == "--model" and i + 1 < len(args): baseline_model = args[i + 1] i += 2 + elif args[i] == "--reeval": + do_reeval = True + i += 1 + elif args[i] == "--analyze": + do_analyze = True + i += 1 + elif args[i] == "--full-pipeline": + do_reeval = True + do_analyze = True + i += 1 else: positional.append(args[i]) i += 1 @@ -732,6 +749,37 @@ def main(): print(f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}") print("=" * 40) + # Re-evaluate all runs with latest eval scripts + if do_reeval or (completed > 0 and do_analyze): + print() + print("Re-evaluating all runs with latest eval scripts...") + reeval_result = subprocess.run( + ["python3", str(SCRIPT_DIR / "reeval.py"), str(results_dir), "-j", str(max(parallel, 4))], + cwd=str(PROJECT_DIR), + ) + if reeval_result.returncode == 0: + print("Re-evaluation complete.") + else: + print("Re-evaluation had errors (continuing).") + + # Run analysis and save results + if do_analyze: + print() + print("Running analysis...") + analysis_dir = results_dir / "analysis" + analysis_dir.mkdir(exist_ok=True) + + metrics = ["score", "cost", "turns", "wall_time", "gameplay", "code_quality"] + for metric in metrics: + try: + effects = analyze_main_effects(str(results_dir), metric) + (analysis_dir / f"main_effects_{metric}.json").write_text( + json.dumps(effects, indent=2) + ) + print(f" Saved 
main_effects_{metric}.json") + except Exception as e: + print(f" Error analyzing {metric}: {e}") + # Auto-commit and push results if completed > 0: print() @@ -739,7 +787,7 @@ def main(): try: subprocess.run( ["git", "add", "results/", "dashboard/public/artifacts/"], - cwd=str(PROJECT_DIR), capture_output=True, + cwd=str(PROJECT_DIR), capture_output=True, timeout=30, ) total_runs = len(list((results_dir / "runs").iterdir())) msg = (

Impressum · Datenschutz