loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 96ae9ecef7c6590632967d2aa49cf464c5c2a30f
parent 932df6b569f5de093993e826d9e5d3e5b2ba5b6d
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 19:26:26 +0200

Use real GLM model names directly, drop model_map

GLM models (glm-4.5-air, glm-4.7, glm-5.1) are now first-class values
in the model axis. The Z.AI gateway accepts them directly via --model flag.
No more confusing haiku/sonnet/opus mapping.

Exclusions enforce provider/model alignment: GLM models only with zai,
Anthropic models only with anthropic.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M grid.yaml | 21 +++++++++++----------
M harness/lib/compute_grid.py | 20 ++++++--------------
M harness/lib/experiment_design.py | 15 ++-------------
M harness/run.py | 21 ++++++---------------
4 files changed, 25 insertions(+), 52 deletions(-)

diff --git a/grid.yaml b/grid.yaml @@ -9,7 +9,7 @@ defaults: axes: model: - values: [haiku, sonnet, opus] + values: [haiku, sonnet, opus, "glm-4.5-air", "glm-4.7", "glm-5.1"] effort: values: [high, max] prompt_style: @@ -61,10 +61,7 @@ providers: zai: base_url: "https://api.z.ai/api/anthropic" api_key_env: "ZAI_API_KEY" - model_map: - haiku: "glm-4.5-air" - sonnet: "glm-4.7" - opus: "glm-4.7" + models: ["glm-4.5-air", "glm-4.7", "glm-5.1"] exclusions: # Haiku does not support extended thinking @@ -79,10 +76,14 @@ exclusions: playwright: "off" - when: strategy: compete - # opus + zai both map to glm-4.7 (same as sonnet), wasteful - - when: - provider: zai - model: opus + # GLM models only with zai provider + - when: { provider: anthropic, model: "glm-4.5-air" } + - when: { provider: anthropic, model: "glm-4.7" } + - when: { provider: anthropic, model: "glm-5.1" } + # Anthropic models only with anthropic provider + - when: { provider: zai, model: haiku } + - when: { provider: zai, model: sonnet } + - when: { provider: zai, model: opus } tasks: - tetris @@ -119,7 +120,7 @@ profiles: zai-smoke: description: "Quick validation for Z.AI GLM models" axes: - model: [haiku, sonnet] + model: ["glm-4.5-air", "glm-4.7", "glm-5.1"] effort: [high] prompt_style: [simple, detailed] language: [typescript] diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py @@ -68,6 +68,7 @@ VALUE_ABBREV = { "lorem_75": "lor75", "glm-4.5-air": "glm45air", "glm-4.7": "glm47", + "glm-5.1": "glm51", "anthropic": "anth", } @@ -149,20 +150,11 @@ def compute_cells(grid, profile_name): if excluded: continue - # Resolve actual_model from provider config - provider_name = cell.get("provider", "anthropic") - providers_config = grid.get("providers", {}) - model_map = (providers_config.get(provider_name) or {}).get("model_map", {}) - actual_model = model_map.get(cell.get("model", ""), cell.get("model", "")) - cell["actual_model"] = actual_model - - # Build cell ID using actual_model instead of 
model for clarity - cell_id_parts = [task] - for k in axis_names: - val = cell[k] - if k == "model": - val = actual_model # use resolved model name in cell_id - cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}") + # actual_model = model (no mapping needed, models are their real names) + cell["actual_model"] = cell.get("model", "") + + # Build cell ID from task + abbreviated axis values (deterministic, filename-safe) + cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names] cell_id = "_".join(cell_id_parts) # Resolve budget value diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py @@ -448,22 +448,11 @@ def _build_cell(task, cell, defaults, grid): from compute_grid import AXIS_ABBREV, VALUE_ABBREV axis_names = sorted(cell.keys()) - # Resolve actual_model from provider config - provider_name = cell.get("provider", "anthropic") - providers_config = grid.get("providers", {}) - model_map = (providers_config.get(provider_name) or {}).get("model_map", {}) - actual_model = model_map.get(cell.get("model", ""), cell.get("model", "")) - - cell_id_parts = [task] - for k in axis_names: - val = cell[k] - if k == "model": - val = actual_model - cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}") + cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names] result = dict(cell) result["task"] = task - result["actual_model"] = actual_model + result["actual_model"] = cell.get("model", "") result["cell_id"] = "_".join(cell_id_parts) result["runs_per_cell"] = defaults.get("runs_per_cell", 3) result["timeout_seconds"] = defaults.get("timeout_seconds", 600) diff --git a/harness/run.py b/harness/run.py @@ -673,14 +673,11 @@ def run_single( log(f"INVALID: {run_id} - deleting and re-running") shutil.rmtree(run_dir) - # Resolve provider and actual model + # Resolve provider config 
provider_name = cell.get("provider", "anthropic") - provider_config = (providers_config or {}).get(provider_name, {}) - model_map = provider_config.get("model_map", {}) - actual_model = model_map.get(model, model) - display_model = actual_model if provider_name != "anthropic" else model + provider_config = (providers_config or {}).get(provider_name) or {} - log(f"START: {task} | {display_model} | {prompt_style} | run{run_num}") + log(f"START: {task} | {model} | {prompt_style} | run{run_num}") run_dir.mkdir(parents=True, exist_ok=True) @@ -691,7 +688,7 @@ def run_single( "short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8], "short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8], "run_number": run_num, - "actual_model": actual_model, + "actual_model": model, "claude_version": claude_version, "started_at": datetime.now(timezone.utc).isoformat(), } @@ -739,7 +736,7 @@ def run_single( archive_workspace(workspace, run_dir) result = "completed" if (run_dir / "eval_results.json").exists() else "failed" - log(f" DONE: {task} | {display_model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}") + log(f" DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}") return result @@ -833,15 +830,9 @@ def main(): # and reverse-map to the Claude arg (e.g., haiku) baseline = None if baseline_model: - provider_cfg = (providers_config.get(provider_filter) or {}) - model_map = provider_cfg.get("model_map", {}) - reverse_map = {v: k for k, v in model_map.items()} - resolved_model = reverse_map.get(baseline_model, baseline_model) - if resolved_model != baseline_model: - print(f"Model: {baseline_model} (mapped to {resolved_model} for Claude CLI)") axes = {name: spec["values"] for name, spec in grid["axes"].items()} baseline = {name: values[0] for name, values in axes.items()} - baseline["model"] = resolved_model + baseline["model"] = baseline_model # Determine cell generation strategy if profile == "main_effects":

Impressum · Datenschutz