loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit e1fb0176902a996fded91653db4ed069d06b96a0
parent 724f4222b877c7ff2bd457c2bac60ec69eb48ab9
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 18:54:20 +0200

Add provider axis for Z.AI (GLM) model support

New provider axis: anthropic (default) or zai. The Z.AI gateway maps
Claude model names to GLM models (haiku->glm-4.5-air, sonnet->glm-4.7,
opus->glm-4.7).

- Provider config in grid.yaml with base_url, api_key_env, model_map
- invoke_claude sets ANTHROPIC_BASE_URL (and ANTHROPIC_AUTH_TOKEN, read from api_key_env) in a per-subprocess copy of the environment (isolated, no leakage)
- Explicitly clears ANTHROPIC_BASE_URL for anthropic runs (safety)
- Records actual_model in meta.json (what really ran vs what was requested)
- Dashboard shows provider axis, skips actual_model in analysis
- Exclusion: opus+zai (opus maps to the same glm-4.7 as sonnet, so those runs would be duplicates)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mdashboard/src/components/BumpChart.tsx | 1+
Mdashboard/src/components/CorrelationMatrix.tsx | 1+
Mdashboard/src/components/Filters.tsx | 1+
Mdashboard/src/components/HeatmapMatrix.tsx | 1+
Mdashboard/src/components/TornadoChart.tsx | 1+
Mdashboard/src/components/Variability.tsx | 1+
Mdashboard/src/lib/analysis.ts | 1+
Mdashboard/src/lib/data.ts | 2++
Mdashboard/src/lib/types.ts | 4++++
Mdashboard/src/pages/compare.astro | 1+
Mgrid.yaml | 21+++++++++++++++++++++
Mharness/lib/compute_grid.py | 1+
Mharness/lib/experiment_design.py | 2+-
Mharness/run.py | 37+++++++++++++++++++++++++++++++------
14 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx @@ -48,6 +48,7 @@ const AXIS_LABELS: Record<AxisName, string> = { error_checking: "Error Checking", context_noise: "Context Noise", renderer: "Renderer", + provider: "Provider", }; // All axes except "model" since we rank by model diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx @@ -27,6 +27,7 @@ const CONFIG_AXES = [ { key: "error_checking", label: "Error Checking" }, { key: "context_noise", label: "Context Noise" }, { key: "renderer", label: "Renderer" }, + { key: "provider", label: "Provider" }, ] as const; type MetricExtractor = (run: Run) => number | null; diff --git a/dashboard/src/components/Filters.tsx b/dashboard/src/components/Filters.tsx @@ -30,6 +30,7 @@ const AXIS_LABELS: Record<AxisName, string> = { error_checking: "Error Checking", context_noise: "Context Noise", renderer: "Renderer", + provider: "Provider", }; export default function Filters({ diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx @@ -30,6 +30,7 @@ const AXIS_LABELS: Record<AxisName, string> = { error_checking: "Error Checking", context_noise: "Context Noise", renderer: "Renderer", + provider: "Provider", }; interface CellData { diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx @@ -28,6 +28,7 @@ const AXIS_LABELS: Record<string, string> = { error_checking: "Error Checking", context_noise: "Context Noise", renderer: "Renderer", + provider: "Provider", }; export default function TornadoChart({ effects, metric }: TornadoChartProps) { diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx @@ -31,6 +31,7 @@ const AXIS_LABELS: Record<string, string> = { error_checking: "Error Checking", context_noise: "Context Noise", renderer: "Renderer", + provider: "Provider", }; /* ---------- helpers ---------- */ 
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts @@ -62,6 +62,7 @@ const SKIP_KEYS = new Set([ "short_cell_id", "claude_version", "sub_agents", + "actual_model", ]); type MetricExtractor = (run: Run) => number | null; diff --git a/dashboard/src/lib/data.ts b/dashboard/src/lib/data.ts @@ -51,6 +51,8 @@ export function loadAllRuns(): Run[] { meta.error_checking = meta.error_checking || "none"; meta.context_noise = meta.context_noise || "clean"; meta.renderer = meta.renderer || "none"; + meta.provider = meta.provider || "anthropic"; + meta.actual_model = meta.actual_model || meta.model; // Compute short IDs if not in meta (backwards compat) if (!meta.short_id && meta.run_id) { diff --git a/dashboard/src/lib/types.ts b/dashboard/src/lib/types.ts @@ -26,6 +26,8 @@ export interface RunMeta { error_checking: string; context_noise: string; renderer: string; + provider: string; + actual_model: string; short_id?: string; short_cell_id?: string; max_budget: string; @@ -101,6 +103,7 @@ export type AxisName = keyof Pick< | "error_checking" | "context_noise" | "renderer" + | "provider" >; export const AXIS_NAMES: AxisName[] = [ @@ -126,4 +129,5 @@ export const AXIS_NAMES: AxisName[] = [ "error_checking", "context_noise", "renderer", + "provider", ]; diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro @@ -100,6 +100,7 @@ const AXIS_LABELS: Record<AxisName, string> = { error_checking: "Error Checking", context_noise: "Context Noise", renderer: "Renderer", + provider: "Provider", }; // Pre-compute all cell stats once diff --git a/grid.yaml b/grid.yaml @@ -52,6 +52,19 @@ axes: values: ["clean", "wikipedia_25", "wikipedia_50", "wikipedia_75", "lorem_25", "lorem_50", "lorem_75"] renderer: values: ["none", "canvas", "svg", "dom", "webgl"] + provider: + values: ["anthropic", "zai"] + +providers: + anthropic: + # Default -- no overrides, uses standard Claude Code auth + zai: + base_url: "https://api.z.ai/api/anthropic" + 
api_key_env: "ZAI_API_KEY" + model_map: + haiku: "glm-4.5-air" + sonnet: "glm-4.7" + opus: "glm-4.7" exclusions: # Haiku does not support extended thinking @@ -66,6 +79,10 @@ exclusions: playwright: "off" - when: strategy: compete + # opus + zai both map to glm-4.7 (same as sonnet), wasteful + - when: + provider: zai + model: opus tasks: - tetris @@ -96,6 +113,7 @@ profiles: error_checking: ["none"] context_noise: ["clean"] renderer: ["none"] + provider: ["anthropic"] runs_per_cell: 1 core: @@ -123,6 +141,7 @@ profiles: error_checking: ["none"] context_noise: ["clean"] renderer: ["none"] + provider: ["anthropic"] runs_per_cell: 3 all-on: @@ -150,6 +169,7 @@ profiles: error_checking: ["self_verify"] context_noise: ["clean"] renderer: ["canvas"] + provider: ["anthropic"] runs_per_cell: 3 all-off: @@ -177,6 +197,7 @@ profiles: error_checking: ["none"] context_noise: ["clean"] renderer: ["none"] + provider: ["anthropic"] runs_per_cell: 3 full: diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py @@ -42,6 +42,7 @@ AXIS_ABBREV = { "error_checking": "echk", "context_noise": "noise", "renderer": "rndr", + "provider": "prov", } # Short value names for cell_id to keep paths under 255 chars diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py @@ -310,7 +310,7 @@ def analyze_main_effects(results_dir, metric="score"): "task", "cell_id", "run_id", "run_number", "runs_per_cell", "max_budget_usd", "timeout_seconds", "base_tools", "started_at", "completed_at", "wall_time_seconds", "exit_code", - "short_id", "short_cell_id", "claude_version", + "short_id", "short_cell_id", "claude_version", "actual_model", } axis_names = sorted(meta_keys - skip_keys) diff --git a/harness/run.py b/harness/run.py @@ -208,7 +208,7 @@ def build_prompt(project_dir: Path, cell: dict) -> str: return prompt -def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path) -> int: +def invoke_claude(cell: dict, workspace: Path, run_dir: Path, 
project_dir: Path, provider_config: dict = None) -> int: """Invoke claude CLI and capture output.""" prompt = build_prompt(project_dir, cell) model = cell["model"] @@ -299,6 +299,20 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path) } transcript_f.write(json.dumps(ctx_event) + "\n") + # Provider-specific env overrides + run_env = os.environ.copy() + if provider_config: + if provider_config.get("base_url"): + run_env["ANTHROPIC_BASE_URL"] = provider_config["base_url"] + else: + run_env.pop("ANTHROPIC_BASE_URL", None) + if provider_config.get("api_key_env"): + key = os.environ.get(provider_config["api_key_env"]) + if key: + run_env["ANTHROPIC_AUTH_TOKEN"] = key + else: + run_env.pop("ANTHROPIC_BASE_URL", None) + with open(transcript_path, "a") as transcript_f, open(stderr_path, "w") as stderr_f: try: result = subprocess.run( @@ -307,6 +321,7 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path) stdout=transcript_f, stderr=stderr_f, timeout=timeout, + env=run_env, ) exit_code = result.returncode except subprocess.TimeoutExpired: @@ -638,6 +653,7 @@ def run_single( results_dir: Path, project_dir: Path, claude_version: str, + providers_config: dict = None, ) -> str: """Execute a single experiment run. 
Returns 'completed', 'skipped', or 'failed'.""" cell_id = cell["cell_id"] @@ -656,7 +672,14 @@ def run_single( log(f"INVALID: {run_id} - deleting and re-running") shutil.rmtree(run_dir) - log(f"START: {task} | {model} | {prompt_style} | run{run_num}") + # Resolve provider and actual model + provider_name = cell.get("provider", "anthropic") + provider_config = (providers_config or {}).get(provider_name, {}) + model_map = provider_config.get("model_map", {}) + actual_model = model_map.get(model, model) + display_model = actual_model if provider_name != "anthropic" else model + + log(f"START: {task} | {display_model} | {prompt_style} | run{run_num}") run_dir.mkdir(parents=True, exist_ok=True) @@ -667,6 +690,7 @@ def run_single( "short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8], "short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8], "run_number": run_num, + "actual_model": actual_model, "claude_version": claude_version, "started_at": datetime.now(timezone.utc).isoformat(), } @@ -681,7 +705,7 @@ def run_single( # Invoke claude start_time = time.time() - exit_code = invoke_claude(cell, workspace, run_dir, project_dir) + exit_code = invoke_claude(cell, workspace, run_dir, project_dir, provider_config) wall_time = int(time.time() - start_time) status = "ok" if exit_code == 0 else f"exit {exit_code}" @@ -714,7 +738,7 @@ def run_single( archive_workspace(workspace, run_dir) result = "completed" if (run_dir / "eval_results.json").exists() else "failed" - log(f" DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}") + log(f" DONE: {task} | {display_model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}") return result @@ -786,6 +810,7 @@ def main(): print("=" * 40) grid = load_grid(grid_file) + providers_config = grid.get("providers", {}) # Build baseline override from --model flag baseline = None @@ -834,7 +859,7 @@ def main(): if parallel <= 1: # Sequential for cell, run_num in jobs: - result = 
run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version) + result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config) if result == "completed": completed += 1 elif result == "skipped": @@ -846,7 +871,7 @@ def main(): with ThreadPoolExecutor(max_workers=parallel) as executor: futures = { executor.submit( - run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version + run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config ): (cell, run_num) for cell, run_num in jobs }

Impressum · Datenschutz