Use real GLM model names directly, drop model_map - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 96ae9ecef7c6590632967d2aa49cf464c5c2a30f
parent 932df6b569f5de093993e826d9e5d3e5b2ba5b6d
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 19:26:26 +0200

Use real GLM model names directly, drop model_map

GLM models (glm-4.5-air, glm-4.7, glm-5.1) are now first-class values
in the model axis. The Z.AI gateway accepts them directly via the --model
flag, so the confusing haiku/sonnet/opus mapping is no longer needed.

Exclusions enforce provider/model alignment: GLM models only with zai,
Anthropic models only with anthropic.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M grid.yaml  | 21 +++++++++++----------
M harness/lib/compute_grid.py  | 20 ++++++--------------
M harness/lib/experiment_design.py  | 15 ++-------------
M harness/run.py  | 21 ++++++---------------

4 files changed, 25 insertions(+), 52 deletions(-)
diff --git a/grid.yaml b/grid.yaml
@@ -9,7 +9,7 @@ defaults:
 
 axes:
   model:
-    values: [haiku, sonnet, opus]
+    values: [haiku, sonnet, opus, "glm-4.5-air", "glm-4.7", "glm-5.1"]
   effort:
     values: [high, max]
   prompt_style:
@@ -61,10 +61,7 @@ providers:
   zai:
     base_url: "https://api.z.ai/api/anthropic"
     api_key_env: "ZAI_API_KEY"
-    model_map:
-      haiku: "glm-4.5-air"
-      sonnet: "glm-4.7"
-      opus: "glm-4.7"
+    models: ["glm-4.5-air", "glm-4.7", "glm-5.1"]
 
 exclusions:
   # Haiku does not support extended thinking
@@ -79,10 +76,14 @@ exclusions:
       playwright: "off"
   - when:
       strategy: compete
-  # opus + zai both map to glm-4.7 (same as sonnet), wasteful
-  - when:
-      provider: zai
-      model: opus
+  # GLM models only with zai provider
+  - when: { provider: anthropic, model: "glm-4.5-air" }
+  - when: { provider: anthropic, model: "glm-4.7" }
+  - when: { provider: anthropic, model: "glm-5.1" }
+  # Anthropic models only with anthropic provider
+  - when: { provider: zai, model: haiku }
+  - when: { provider: zai, model: sonnet }
+  - when: { provider: zai, model: opus }
 
 tasks:
   - tetris
@@ -119,7 +120,7 @@ profiles:
   zai-smoke:
     description: "Quick validation for Z.AI GLM models"
     axes:
-      model: [haiku, sonnet]
+      model: ["glm-4.5-air", "glm-4.7", "glm-5.1"]
       effort: [high]
       prompt_style: [simple, detailed]
       language: [typescript]
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -68,6 +68,7 @@ VALUE_ABBREV = {
     "lorem_75": "lor75",
     "glm-4.5-air": "glm45air",
     "glm-4.7": "glm47",
+    "glm-5.1": "glm51",
     "anthropic": "anth",
 }
 
@@ -149,20 +150,11 @@ def compute_cells(grid, profile_name):
             if excluded:
                 continue
 
-            # Resolve actual_model from provider config
-            provider_name = cell.get("provider", "anthropic")
-            providers_config = grid.get("providers", {})
-            model_map = (providers_config.get(provider_name) or {}).get("model_map", {})
-            actual_model = model_map.get(cell.get("model", ""), cell.get("model", ""))
-            cell["actual_model"] = actual_model
-
-            # Build cell ID using actual_model instead of model for clarity
-            cell_id_parts = [task]
-            for k in axis_names:
-                val = cell[k]
-                if k == "model":
-                    val = actual_model  # use resolved model name in cell_id
-                cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}")
+            # actual_model = model (no mapping needed, models are their real names)
+            cell["actual_model"] = cell.get("model", "")
+
+            # Build cell ID from task + abbreviated axis values (deterministic, filename-safe)
+            cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
             cell_id = "_".join(cell_id_parts)
 
             # Resolve budget value
diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py
@@ -448,22 +448,11 @@ def _build_cell(task, cell, defaults, grid):
     from compute_grid import AXIS_ABBREV, VALUE_ABBREV
     axis_names = sorted(cell.keys())
 
-    # Resolve actual_model from provider config
-    provider_name = cell.get("provider", "anthropic")
-    providers_config = grid.get("providers", {})
-    model_map = (providers_config.get(provider_name) or {}).get("model_map", {})
-    actual_model = model_map.get(cell.get("model", ""), cell.get("model", ""))
-
-    cell_id_parts = [task]
-    for k in axis_names:
-        val = cell[k]
-        if k == "model":
-            val = actual_model
-        cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}")
+    cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
 
     result = dict(cell)
     result["task"] = task
-    result["actual_model"] = actual_model
+    result["actual_model"] = cell.get("model", "")
     result["cell_id"] = "_".join(cell_id_parts)
     result["runs_per_cell"] = defaults.get("runs_per_cell", 3)
     result["timeout_seconds"] = defaults.get("timeout_seconds", 600)
diff --git a/harness/run.py b/harness/run.py
@@ -673,14 +673,11 @@ def run_single(
             log(f"INVALID: {run_id} - deleting and re-running")
             shutil.rmtree(run_dir)
 
-    # Resolve provider and actual model
+    # Resolve provider config
     provider_name = cell.get("provider", "anthropic")
-    provider_config = (providers_config or {}).get(provider_name, {})
-    model_map = provider_config.get("model_map", {})
-    actual_model = model_map.get(model, model)
-    display_model = actual_model if provider_name != "anthropic" else model
+    provider_config = (providers_config or {}).get(provider_name) or {}
 
-    log(f"START: {task} | {display_model} | {prompt_style} | run{run_num}")
+    log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
 
     run_dir.mkdir(parents=True, exist_ok=True)
 
@@ -691,7 +688,7 @@ def run_single(
         "short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8],
         "short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8],
         "run_number": run_num,
-        "actual_model": actual_model,
+        "actual_model": model,
         "claude_version": claude_version,
         "started_at": datetime.now(timezone.utc).isoformat(),
     }
@@ -739,7 +736,7 @@ def run_single(
     archive_workspace(workspace, run_dir)
 
     result = "completed" if (run_dir / "eval_results.json").exists() else "failed"
-    log(f"  DONE: {task} | {display_model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
+    log(f"  DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
     return result
 
 
@@ -833,15 +830,9 @@ def main():
     # and reverse-map to the Claude arg (e.g., haiku)
     baseline = None
     if baseline_model:
-        provider_cfg = (providers_config.get(provider_filter) or {})
-        model_map = provider_cfg.get("model_map", {})
-        reverse_map = {v: k for k, v in model_map.items()}
-        resolved_model = reverse_map.get(baseline_model, baseline_model)
-        if resolved_model != baseline_model:
-            print(f"Model:      {baseline_model} (mapped to {resolved_model} for Claude CLI)")
         axes = {name: spec["values"] for name, spec in grid["axes"].items()}
         baseline = {name: values[0] for name, values in axes.items()}
-        baseline["model"] = resolved_model
+        baseline["model"] = baseline_model
 
     # Determine cell generation strategy
     if profile == "main_effects":

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README

M grid.yaml  | 21 +++++++++++----------
M harness/lib/compute_grid.py  | 20 ++++++--------------
M harness/lib/experiment_design.py  | 15 ++-------------
M harness/run.py  | 21 ++++++---------------