Require --provider flag for run.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit d09bac8d4e2e29323b0de19e7977e8efd940b46e
parent e1fb0176902a996fded91653db4ed069d06b96a0
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 19:01:17 +0200

Require --provider flag for run.py

run.py now requires --provider anthropic or --provider zai, to prevent
accidentally running against the wrong backend. Cells are filtered
to only those for the requested provider.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 19 +++++++++++++++++++

1 file changed, 19 insertions(+), 0 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -14,6 +14,7 @@ Usage:
 
     -j N: run N experiments in parallel (default 1)
     --model MODEL: set baseline model for main_effects sweep
+    --provider PROVIDER: required -- anthropic or zai
     --reeval: re-evaluate all existing runs with latest eval scripts
     --analyze: run analysis and save results to results/analysis/
     --full-pipeline: reeval + analyze after sweep completes
@@ -747,6 +748,7 @@ def main():
     args = sys.argv[1:]
     parallel = 1
     baseline_model = None
+    provider_filter = None
     grid_file = str(PROJECT_DIR / "grid.yaml")
     profile = "smoke"
 
@@ -762,6 +764,9 @@ def main():
         elif args[i] == "--model" and i + 1 < len(args):
             baseline_model = args[i + 1]
             i += 2
+        elif args[i] == "--provider" and i + 1 < len(args):
+            provider_filter = args[i + 1]
+            i += 2
         elif args[i] == "--reeval":
             do_reeval = True
             i += 1
@@ -805,13 +810,24 @@ def main():
     print("=" * 40)
     print(f"Grid file:  {grid_file}")
     print(f"Profile:    {profile}")
+    print(f"Provider:   {provider_filter}")
     print(f"Parallel:   {parallel}")
     print(f"Results:    {results_dir}")
     print("=" * 40)
 
+    # --provider is required
+    if not provider_filter:
+        print("ERROR: --provider is required. Use --provider anthropic or --provider zai")
+        sys.exit(1)
+
     grid = load_grid(grid_file)
     providers_config = grid.get("providers", {})
 
+    if provider_filter not in providers_config and provider_filter not in [v for spec in grid["axes"].values() for v in spec.get("values", [])]:
+        valid = grid["axes"].get("provider", {}).get("values", [])
+        print(f"ERROR: unknown provider '{provider_filter}'. Valid: {valid}")
+        sys.exit(1)
+
     # Build baseline override from --model flag
     baseline = None
     if baseline_model:
@@ -834,6 +850,9 @@ def main():
         cells = compute_cells(grid, profile)
         print(f"Profile:    {profile}")
 
+    # Filter cells to requested provider
+    cells = [c for c in cells if c.get("provider", "anthropic") == provider_filter]
+
     # Build the full list of (cell, run_num) jobs
     jobs = []
     for cell in cells:

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README