loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit d09bac8d4e2e29323b0de19e7977e8efd940b46e
parent e1fb0176902a996fded91653db4ed069d06b96a0
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 19:01:17 +0200

Require --provider flag for run.py

Must specify --provider anthropic or --provider zai to prevent
accidentally running against the wrong backend. Cells are filtered
to only the requested provider.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py | 19 +++++++++++++++++++
1 file changed, 19 insertions(+), 0 deletions(-)

diff --git a/harness/run.py b/harness/run.py
@@ -14,6 +14,7 @@ Usage:
 
     -j N: run N experiments in parallel (default 1)
     --model MODEL: set baseline model for main_effects sweep
+    --provider PROVIDER: required -- anthropic or zai
     --reeval: re-evaluate all existing runs with latest eval scripts
     --analyze: run analysis and save results to results/analysis/
     --full-pipeline: reeval + analyze after sweep completes
@@ -747,6 +748,7 @@ def main():
     args = sys.argv[1:]
     parallel = 1
     baseline_model = None
+    provider_filter = None
     grid_file = str(PROJECT_DIR / "grid.yaml")
     profile = "smoke"
 
@@ -762,6 +764,9 @@ def main():
         elif args[i] == "--model" and i + 1 < len(args):
             baseline_model = args[i + 1]
             i += 2
+        elif args[i] == "--provider" and i + 1 < len(args):
+            provider_filter = args[i + 1]
+            i += 2
         elif args[i] == "--reeval":
             do_reeval = True
             i += 1
@@ -805,13 +810,24 @@ def main():
     print("=" * 40)
     print(f"Grid file: {grid_file}")
     print(f"Profile: {profile}")
+    print(f"Provider: {provider_filter}")
     print(f"Parallel: {parallel}")
     print(f"Results: {results_dir}")
     print("=" * 40)
 
+    # --provider is required
+    if not provider_filter:
+        print("ERROR: --provider is required. Use --provider anthropic or --provider zai")
+        sys.exit(1)
+
     grid = load_grid(grid_file)
     providers_config = grid.get("providers", {})
 
+    if provider_filter not in providers_config and provider_filter not in [v for spec in grid["axes"].values() for v in spec.get("values", [])]:
+        valid = grid["axes"].get("provider", {}).get("values", [])
+        print(f"ERROR: unknown provider '{provider_filter}'. Valid: {valid}")
+        sys.exit(1)
+
     # Build baseline override from --model flag
     baseline = None
     if baseline_model:
@@ -834,6 +850,9 @@ def main():
     cells = compute_cells(grid, profile)
     print(f"Profile: {profile}")
 
+    # Filter cells to requested provider
+    cells = [c for c in cells if c.get("provider", "anthropic") == provider_filter]
+
     # Build the full list of (cell, run_num) jobs
     jobs = []
     for cell in cells:

Impressum · Datenschutz