loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 7b4564c84f68f63d9b0de348c5fa77c396fe7650
parent 9151010083c5ed6be0381775a8dd0c8c3f55a792
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 20:03:37 +0200

Add --commit-every N flag for periodic analyze+push

Runs analysis and pushes results every N completed runs.
Useful for long sweeps where you want to see results incrementally.

Usage: python3 harness/run.py grid.yaml main_effects --provider zai -j 4 --commit-every 20

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+), 0 deletions(-)

diff --git a/harness/run.py b/harness/run.py
@@ -747,6 +747,7 @@ def main():
     baseline_model = None
     provider_filter = None
     max_runs = None
+    commit_every = None
     grid_file = str(PROJECT_DIR / "grid.yaml")
     profile = "smoke"

@@ -768,6 +769,9 @@ def main():
         elif args[i] in ("-n", "--max-runs") and i + 1 < len(args):
             max_runs = int(args[i + 1])
             i += 2
+        elif args[i] == "--commit-every" and i + 1 < len(args):
+            commit_every = int(args[i + 1])
+            i += 2
         elif args[i] == "--reeval":
             do_reeval = True
             i += 1
@@ -870,6 +874,34 @@ def main():
     print(f"Total jobs: {len(jobs)}")
     print()

+    # Periodic commit helper
+    _last_commit_count = [0]  # mutable for closure
+
+    def periodic_commit(completed_so_far):
+        if not commit_every or completed_so_far - _last_commit_count[0] < commit_every:
+            return
+        _last_commit_count[0] = completed_so_far
+        log(f" --- Checkpoint: analyzing and pushing {completed_so_far} completed runs ---")
+        # Run analysis
+        analysis_dir = results_dir / "analysis"
+        analysis_dir.mkdir(exist_ok=True)
+        for metric in ["score", "cost", "turns", "wall_time", "gameplay", "sonarqube", "code_quality", "structural", "transcript", "build_quality"]:
+            try:
+                effects = analyze_main_effects(str(results_dir), metric)
+                (analysis_dir / f"main_effects_{metric}.json").write_text(json.dumps(effects, indent=2))
+            except Exception:
+                pass
+        # Commit and push
+        try:
+            subprocess.run(["git", "add", "-A", "results/", "artifacts/"], cwd=str(PROJECT_DIR), capture_output=True, timeout=30)
+            total_runs = len(list((results_dir / "runs").iterdir()))
+            msg = f"Checkpoint: {completed_so_far} runs ({total_runs} total)"
+            subprocess.run(["git", "commit", "-m", msg], cwd=str(PROJECT_DIR), capture_output=True, timeout=30)
+            subprocess.run(["git", "push"], cwd=str(PROJECT_DIR), capture_output=True, timeout=60)
+            log(f" --- Pushed checkpoint ---")
+        except Exception as e:
+            log(f" --- Checkpoint push failed: {e} ---")
+
     # Start auth keepalive in background (refreshes OAuth token every 5 min)
     auth_keepalive = subprocess.Popen(
         ["bash", str(SCRIPT_DIR / "lib" / "keep-auth-alive.sh"), "300"],
@@ -892,6 +924,7 @@ def main():
                 skipped += 1
             else:
                 failed += 1
+            periodic_commit(completed)
     else:
         # Parallel with rolling concurrency
         with ThreadPoolExecutor(max_workers=parallel) as executor:
@@ -919,6 +952,7 @@ def main():

                 total_done = completed + skipped + failed
                 log(f" Progress: {total_done}/{len(jobs)} ({completed} completed, {skipped} skipped, {failed} failed)")
+                periodic_commit(completed)

     # Stop auth keepalive
     auth_keepalive.terminate()

Impressum · Datenschutz