Add provider axis for Z.AI (GLM) model support - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit e1fb0176902a996fded91653db4ed069d06b96a0
parent 724f4222b877c7ff2bd457c2bac60ec69eb48ab9
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 18:54:20 +0200

Add provider axis for Z.AI (GLM) model support

New provider axis: anthropic (default) or zai. The Z.AI gateway maps
Claude model names to GLM models (haiku->glm-4.5-air, sonnet->glm-4.7,
opus->glm-4.7).

- Provider config in grid.yaml with base_url, api_key_env, model_map
- invoke_claude sets ANTHROPIC_BASE_URL and, when the provider's api_key_env
  variable is set, ANTHROPIC_AUTH_TOKEN per-subprocess (isolated, no leakage)
- Explicitly clears ANTHROPIC_BASE_URL for anthropic runs (safety)
- Records actual_model in meta.json (what really ran vs what was requested)
- Dashboard shows provider axis, skips actual_model in analysis
- Exclusion: opus+zai (maps to same glm-4.7 as sonnet)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/BumpChart.tsx  | 1 +
M dashboard/src/components/CorrelationMatrix.tsx  | 1 +
M dashboard/src/components/Filters.tsx  | 1 +
M dashboard/src/components/HeatmapMatrix.tsx  | 1 +
M dashboard/src/components/TornadoChart.tsx  | 1 +
M dashboard/src/components/Variability.tsx  | 1 +
M dashboard/src/lib/analysis.ts  | 1 +
M dashboard/src/lib/data.ts  | 2 ++
M dashboard/src/lib/types.ts  | 4 ++++
M dashboard/src/pages/compare.astro  | 1 +
M grid.yaml  | 21 +++++++++++++++++++++
M harness/lib/compute_grid.py  | 1 +
M harness/lib/experiment_design.py  | 2 +-
M harness/run.py  | 37 +++++++++++++++++++++++++++++++------

14 files changed, 68 insertions(+), 7 deletions(-)
diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx
@@ -48,6 +48,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
   error_checking: "Error Checking",
   context_noise: "Context Noise",
   renderer: "Renderer",
+  provider: "Provider",
 };
 
 // All axes except "model" since we rank by model
diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx
@@ -27,6 +27,7 @@ const CONFIG_AXES = [
   { key: "error_checking", label: "Error Checking" },
   { key: "context_noise", label: "Context Noise" },
   { key: "renderer", label: "Renderer" },
+  { key: "provider", label: "Provider" },
 ] as const;
 
 type MetricExtractor = (run: Run) => number | null;
diff --git a/dashboard/src/components/Filters.tsx b/dashboard/src/components/Filters.tsx
@@ -30,6 +30,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
   error_checking: "Error Checking",
   context_noise: "Context Noise",
   renderer: "Renderer",
+  provider: "Provider",
 };
 
 export default function Filters({
diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx
@@ -30,6 +30,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
   error_checking: "Error Checking",
   context_noise: "Context Noise",
   renderer: "Renderer",
+  provider: "Provider",
 };
 
 interface CellData {
diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx
@@ -28,6 +28,7 @@ const AXIS_LABELS: Record<string, string> = {
   error_checking: "Error Checking",
   context_noise: "Context Noise",
   renderer: "Renderer",
+  provider: "Provider",
 };
 
 export default function TornadoChart({ effects, metric }: TornadoChartProps) {
diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx
@@ -31,6 +31,7 @@ const AXIS_LABELS: Record<string, string> = {
   error_checking: "Error Checking",
   context_noise: "Context Noise",
   renderer: "Renderer",
+  provider: "Provider",
 };
 
 /* ---------- helpers ---------- */
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -62,6 +62,7 @@ const SKIP_KEYS = new Set([
   "short_cell_id",
   "claude_version",
   "sub_agents",
+  "actual_model",
 ]);
 
 type MetricExtractor = (run: Run) => number | null;
diff --git a/dashboard/src/lib/data.ts b/dashboard/src/lib/data.ts
@@ -51,6 +51,8 @@ export function loadAllRuns(): Run[] {
       meta.error_checking = meta.error_checking || "none";
       meta.context_noise = meta.context_noise || "clean";
       meta.renderer = meta.renderer || "none";
+      meta.provider = meta.provider || "anthropic";
+      meta.actual_model = meta.actual_model || meta.model;
 
       // Compute short IDs if not in meta (backwards compat)
       if (!meta.short_id && meta.run_id) {
diff --git a/dashboard/src/lib/types.ts b/dashboard/src/lib/types.ts
@@ -26,6 +26,8 @@ export interface RunMeta {
   error_checking: string;
   context_noise: string;
   renderer: string;
+  provider: string;
+  actual_model: string;
   short_id?: string;
   short_cell_id?: string;
   max_budget: string;
@@ -101,6 +103,7 @@ export type AxisName = keyof Pick<
   | "error_checking"
   | "context_noise"
   | "renderer"
+  | "provider"
 >;
 
 export const AXIS_NAMES: AxisName[] = [
@@ -126,4 +129,5 @@ export const AXIS_NAMES: AxisName[] = [
   "error_checking",
   "context_noise",
   "renderer",
+  "provider",
 ];
diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro
@@ -100,6 +100,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
   error_checking: "Error Checking",
   context_noise: "Context Noise",
   renderer: "Renderer",
+  provider: "Provider",
 };
 
 // Pre-compute all cell stats once
diff --git a/grid.yaml b/grid.yaml
@@ -52,6 +52,19 @@ axes:
     values: ["clean", "wikipedia_25", "wikipedia_50", "wikipedia_75", "lorem_25", "lorem_50", "lorem_75"]
   renderer:
     values: ["none", "canvas", "svg", "dom", "webgl"]
+  provider:
+    values: ["anthropic", "zai"]
+
+providers:
+  anthropic:
+    # Default -- no overrides, uses standard Claude Code auth (NOTE: a key with no value parses as null, not {})
+  zai:
+    base_url: "https://api.z.ai/api/anthropic"
+    api_key_env: "ZAI_API_KEY"
+    model_map:
+      haiku: "glm-4.5-air"
+      sonnet: "glm-4.7"
+      opus: "glm-4.7"
 
 exclusions:
   # Haiku does not support extended thinking
@@ -66,6 +79,10 @@ exclusions:
       playwright: "off"
   - when:
       strategy: compete
+  # On zai, opus maps to glm-4.7 (the same model as sonnet), so opus+zai runs would be redundant
+  - when:
+      provider: zai
+      model: opus
 
 tasks:
   - tetris
@@ -96,6 +113,7 @@ profiles:
       error_checking: ["none"]
       context_noise: ["clean"]
       renderer: ["none"]
+      provider: ["anthropic"]
     runs_per_cell: 1
 
   core:
@@ -123,6 +141,7 @@ profiles:
       error_checking: ["none"]
       context_noise: ["clean"]
       renderer: ["none"]
+      provider: ["anthropic"]
     runs_per_cell: 3
 
   all-on:
@@ -150,6 +169,7 @@ profiles:
       error_checking: ["self_verify"]
       context_noise: ["clean"]
       renderer: ["canvas"]
+      provider: ["anthropic"]
     runs_per_cell: 3
 
   all-off:
@@ -177,6 +197,7 @@ profiles:
       error_checking: ["none"]
       context_noise: ["clean"]
       renderer: ["none"]
+      provider: ["anthropic"]
     runs_per_cell: 3
 
   full:
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -42,6 +42,7 @@ AXIS_ABBREV = {
     "error_checking": "echk",
     "context_noise": "noise",
     "renderer": "rndr",
+    "provider": "prov",
 }
 
 # Short value names for cell_id to keep paths under 255 chars
diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py
@@ -310,7 +310,7 @@ def analyze_main_effects(results_dir, metric="score"):
         "task", "cell_id", "run_id", "run_number", "runs_per_cell",
         "max_budget_usd", "timeout_seconds", "base_tools",
         "started_at", "completed_at", "wall_time_seconds", "exit_code",
-        "short_id", "short_cell_id", "claude_version",
+        "short_id", "short_cell_id", "claude_version", "actual_model",
     }
     axis_names = sorted(meta_keys - skip_keys)
 
diff --git a/harness/run.py b/harness/run.py
@@ -208,7 +208,7 @@ def build_prompt(project_dir: Path, cell: dict) -> str:
     return prompt
 
 
-def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path) -> int:
+def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path, provider_config: dict = None) -> int:
     """Invoke claude CLI and capture output."""
     prompt = build_prompt(project_dir, cell)
     model = cell["model"]
@@ -299,6 +299,20 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path)
                 }
                 transcript_f.write(json.dumps(ctx_event) + "\n")
 
+    # Provider-specific env overrides
+    run_env = os.environ.copy()
+    if provider_config:
+        if provider_config.get("base_url"):
+            run_env["ANTHROPIC_BASE_URL"] = provider_config["base_url"]
+        else:
+            run_env.pop("ANTHROPIC_BASE_URL", None)
+        if provider_config.get("api_key_env"):
+            key = os.environ.get(provider_config["api_key_env"])
+            if key:
+                run_env["ANTHROPIC_AUTH_TOKEN"] = key
+    else:
+        run_env.pop("ANTHROPIC_BASE_URL", None)
+
     with open(transcript_path, "a") as transcript_f, open(stderr_path, "w") as stderr_f:
         try:
             result = subprocess.run(
@@ -307,6 +321,7 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path)
                 stdout=transcript_f,
                 stderr=stderr_f,
                 timeout=timeout,
+                env=run_env,
             )
             exit_code = result.returncode
         except subprocess.TimeoutExpired:
@@ -638,6 +653,7 @@ def run_single(
     results_dir: Path,
     project_dir: Path,
     claude_version: str,
+    providers_config: dict = None,
 ) -> str:
     """Execute a single experiment run. Returns 'completed', 'skipped', or 'failed'."""
     cell_id = cell["cell_id"]
@@ -656,7 +672,14 @@ def run_single(
             log(f"INVALID: {run_id} - deleting and re-running")
             shutil.rmtree(run_dir)
 
-    log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
+    # Resolve provider and actual model
+    provider_name = cell.get("provider", "anthropic")
+    provider_config = (providers_config or {}).get(provider_name, {})
+    model_map = provider_config.get("model_map", {})
+    actual_model = model_map.get(model, model)
+    display_model = actual_model if provider_name != "anthropic" else model
+
+    log(f"START: {task} | {display_model} | {prompt_style} | run{run_num}")
 
     run_dir.mkdir(parents=True, exist_ok=True)
 
@@ -667,6 +690,7 @@ def run_single(
         "short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8],
         "short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8],
         "run_number": run_num,
+        "actual_model": actual_model,
         "claude_version": claude_version,
         "started_at": datetime.now(timezone.utc).isoformat(),
     }
@@ -681,7 +705,7 @@ def run_single(
 
     # Invoke claude
     start_time = time.time()
-    exit_code = invoke_claude(cell, workspace, run_dir, project_dir)
+    exit_code = invoke_claude(cell, workspace, run_dir, project_dir, provider_config)
     wall_time = int(time.time() - start_time)
 
     status = "ok" if exit_code == 0 else f"exit {exit_code}"
@@ -714,7 +738,7 @@ def run_single(
     archive_workspace(workspace, run_dir)
 
     result = "completed" if (run_dir / "eval_results.json").exists() else "failed"
-    log(f"  DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
+    log(f"  DONE: {task} | {display_model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
     return result
 
 
@@ -786,6 +810,7 @@ def main():
     print("=" * 40)
 
     grid = load_grid(grid_file)
+    providers_config = grid.get("providers", {})
 
     # Build baseline override from --model flag
     baseline = None
@@ -834,7 +859,7 @@ def main():
     if parallel <= 1:
         # Sequential
         for cell, run_num in jobs:
-            result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version)
+            result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config)
             if result == "completed":
                 completed += 1
             elif result == "skipped":
@@ -846,7 +871,7 @@ def main():
         with ThreadPoolExecutor(max_workers=parallel) as executor:
             futures = {
                 executor.submit(
-                    run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version
+                    run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config
                 ): (cell, run_num)
                 for cell, run_num in jobs
             }

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/components/BumpChart.tsx	\|	1	+
M	dashboard/src/components/CorrelationMatrix.tsx	\|	1	+
M	dashboard/src/components/Filters.tsx	\|	1	+
M	dashboard/src/components/HeatmapMatrix.tsx	\|	1	+
M	dashboard/src/components/TornadoChart.tsx	\|	1	+
M	dashboard/src/components/Variability.tsx	\|	1	+
M	dashboard/src/lib/analysis.ts	\|	1	+
M	dashboard/src/lib/data.ts	\|	2	++
M	dashboard/src/lib/types.ts	\|	4	++++
M	dashboard/src/pages/compare.astro	\|	1	+
M	grid.yaml	\|	21	+++++++++++++++++++++
M	harness/lib/compute_grid.py	\|	1	+
M	harness/lib/experiment_design.py	\|	2	+-
M	harness/run.py	\|	37	+++++++++++++++++++++++++++++++------