commit 96ae9ecef7c6590632967d2aa49cf464c5c2a30f
parent 932df6b569f5de093993e826d9e5d3e5b2ba5b6d
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 19:26:26 +0200
Use real GLM model names directly, drop model_map
GLM models (glm-4.5-air, glm-4.7, glm-5.1) are now first-class values
in the model axis. The Z.AI gateway accepts them directly via --model flag.
No more confusing haiku/sonnet/opus mapping.
Exclusions enforce provider/model alignment: GLM models only with zai,
Anthropic models only with anthropic.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 25 insertions(+), 52 deletions(-)
diff --git a/grid.yaml b/grid.yaml
@@ -9,7 +9,7 @@ defaults:
axes:
model:
- values: [haiku, sonnet, opus]
+ values: [haiku, sonnet, opus, "glm-4.5-air", "glm-4.7", "glm-5.1"]
effort:
values: [high, max]
prompt_style:
@@ -61,10 +61,7 @@ providers:
zai:
base_url: "https://api.z.ai/api/anthropic"
api_key_env: "ZAI_API_KEY"
- model_map:
- haiku: "glm-4.5-air"
- sonnet: "glm-4.7"
- opus: "glm-4.7"
+ models: ["glm-4.5-air", "glm-4.7", "glm-5.1"]
exclusions:
# Haiku does not support extended thinking
@@ -79,10 +76,14 @@ exclusions:
playwright: "off"
- when:
strategy: compete
- # opus + zai both map to glm-4.7 (same as sonnet), wasteful
- - when:
- provider: zai
- model: opus
+ # GLM models only with zai provider
+ - when: { provider: anthropic, model: "glm-4.5-air" }
+ - when: { provider: anthropic, model: "glm-4.7" }
+ - when: { provider: anthropic, model: "glm-5.1" }
+ # Anthropic models only with anthropic provider
+ - when: { provider: zai, model: haiku }
+ - when: { provider: zai, model: sonnet }
+ - when: { provider: zai, model: opus }
tasks:
- tetris
@@ -119,7 +120,7 @@ profiles:
zai-smoke:
description: "Quick validation for Z.AI GLM models"
axes:
- model: [haiku, sonnet]
+ model: ["glm-4.5-air", "glm-4.7", "glm-5.1"]
effort: [high]
prompt_style: [simple, detailed]
language: [typescript]
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -68,6 +68,7 @@ VALUE_ABBREV = {
"lorem_75": "lor75",
"glm-4.5-air": "glm45air",
"glm-4.7": "glm47",
+ "glm-5.1": "glm51",
"anthropic": "anth",
}
@@ -149,20 +150,11 @@ def compute_cells(grid, profile_name):
if excluded:
continue
- # Resolve actual_model from provider config
- provider_name = cell.get("provider", "anthropic")
- providers_config = grid.get("providers", {})
- model_map = (providers_config.get(provider_name) or {}).get("model_map", {})
- actual_model = model_map.get(cell.get("model", ""), cell.get("model", ""))
- cell["actual_model"] = actual_model
-
- # Build cell ID using actual_model instead of model for clarity
- cell_id_parts = [task]
- for k in axis_names:
- val = cell[k]
- if k == "model":
- val = actual_model # use resolved model name in cell_id
- cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}")
+ # actual_model mirrors model: axis values are already real model names, so no provider mapping is needed
+ cell["actual_model"] = cell.get("model", "")
+
+ # Build cell ID from task + abbreviated axis values (deterministic, filename-safe)
+ cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
cell_id = "_".join(cell_id_parts)
# Resolve budget value
diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py
@@ -448,22 +448,11 @@ def _build_cell(task, cell, defaults, grid):
from compute_grid import AXIS_ABBREV, VALUE_ABBREV
axis_names = sorted(cell.keys())
- # Resolve actual_model from provider config
- provider_name = cell.get("provider", "anthropic")
- providers_config = grid.get("providers", {})
- model_map = (providers_config.get(provider_name) or {}).get("model_map", {})
- actual_model = model_map.get(cell.get("model", ""), cell.get("model", ""))
-
- cell_id_parts = [task]
- for k in axis_names:
- val = cell[k]
- if k == "model":
- val = actual_model
- cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}")
+ cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
result = dict(cell)
result["task"] = task
- result["actual_model"] = actual_model
+ result["actual_model"] = cell.get("model", "")
result["cell_id"] = "_".join(cell_id_parts)
result["runs_per_cell"] = defaults.get("runs_per_cell", 3)
result["timeout_seconds"] = defaults.get("timeout_seconds", 600)
diff --git a/harness/run.py b/harness/run.py
@@ -673,14 +673,11 @@ def run_single(
log(f"INVALID: {run_id} - deleting and re-running")
shutil.rmtree(run_dir)
- # Resolve provider and actual model
+ # Resolve provider config
provider_name = cell.get("provider", "anthropic")
- provider_config = (providers_config or {}).get(provider_name, {})
- model_map = provider_config.get("model_map", {})
- actual_model = model_map.get(model, model)
- display_model = actual_model if provider_name != "anthropic" else model
+ provider_config = (providers_config or {}).get(provider_name) or {}
- log(f"START: {task} | {display_model} | {prompt_style} | run{run_num}")
+ log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
run_dir.mkdir(parents=True, exist_ok=True)
@@ -691,7 +688,7 @@ def run_single(
"short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8],
"short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8],
"run_number": run_num,
- "actual_model": actual_model,
+ "actual_model": model,
"claude_version": claude_version,
"started_at": datetime.now(timezone.utc).isoformat(),
}
@@ -739,7 +736,7 @@ def run_single(
archive_workspace(workspace, run_dir)
result = "completed" if (run_dir / "eval_results.json").exists() else "failed"
- log(f" DONE: {task} | {display_model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
+ log(f" DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
return result
@@ -833,15 +830,9 @@ def main():
# and reverse-map to the Claude arg (e.g., haiku)
baseline = None
if baseline_model:
- provider_cfg = (providers_config.get(provider_filter) or {})
- model_map = provider_cfg.get("model_map", {})
- reverse_map = {v: k for k, v in model_map.items()}
- resolved_model = reverse_map.get(baseline_model, baseline_model)
- if resolved_model != baseline_model:
- print(f"Model: {baseline_model} (mapped to {resolved_model} for Claude CLI)")
axes = {name: spec["values"] for name, spec in grid["axes"].items()}
baseline = {name: values[0] for name, values in axes.items()}
- baseline["model"] = resolved_model
+ baseline["model"] = baseline_model
# Determine cell generation strategy
if profile == "main_effects":