commit e1fb0176902a996fded91653db4ed069d06b96a0
parent 724f4222b877c7ff2bd457c2bac60ec69eb48ab9
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 18:54:20 +0200
Add provider axis for Z.AI (GLM) model support
New provider axis: anthropic (default) or zai. Z.AI gateway maps
Claude model names to GLM models (haiku->glm-4.5-air, sonnet->glm-4.7).
- Provider config in grid.yaml with base_url, api_key_env, model_map
- invoke_claude sets ANTHROPIC_BASE_URL (and ANTHROPIC_AUTH_TOKEN, read from api_key_env) per-subprocess (isolated, no leakage)
- Explicitly clears ANTHROPIC_BASE_URL for anthropic runs (safety)
- Records actual_model in meta.json (what really ran vs what was requested)
- Dashboard shows provider axis, skips actual_model in analysis
- Exclusion: opus+zai (opus maps to the same glm-4.7 as sonnet, so the cell is redundant)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
14 files changed, 68 insertions(+), 7 deletions(-)
diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx
@@ -48,6 +48,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
error_checking: "Error Checking",
context_noise: "Context Noise",
renderer: "Renderer",
+ provider: "Provider",
};
// All axes except "model" since we rank by model
diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx
@@ -27,6 +27,7 @@ const CONFIG_AXES = [
{ key: "error_checking", label: "Error Checking" },
{ key: "context_noise", label: "Context Noise" },
{ key: "renderer", label: "Renderer" },
+ { key: "provider", label: "Provider" },
] as const;
type MetricExtractor = (run: Run) => number | null;
diff --git a/dashboard/src/components/Filters.tsx b/dashboard/src/components/Filters.tsx
@@ -30,6 +30,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
error_checking: "Error Checking",
context_noise: "Context Noise",
renderer: "Renderer",
+ provider: "Provider",
};
export default function Filters({
diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx
@@ -30,6 +30,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
error_checking: "Error Checking",
context_noise: "Context Noise",
renderer: "Renderer",
+ provider: "Provider",
};
interface CellData {
diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx
@@ -28,6 +28,7 @@ const AXIS_LABELS: Record<string, string> = {
error_checking: "Error Checking",
context_noise: "Context Noise",
renderer: "Renderer",
+ provider: "Provider",
};
export default function TornadoChart({ effects, metric }: TornadoChartProps) {
diff --git a/dashboard/src/components/Variability.tsx b/dashboard/src/components/Variability.tsx
@@ -31,6 +31,7 @@ const AXIS_LABELS: Record<string, string> = {
error_checking: "Error Checking",
context_noise: "Context Noise",
renderer: "Renderer",
+ provider: "Provider",
};
/* ---------- helpers ---------- */
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -62,6 +62,7 @@ const SKIP_KEYS = new Set([
"short_cell_id",
"claude_version",
"sub_agents",
+ "actual_model",
]);
type MetricExtractor = (run: Run) => number | null;
diff --git a/dashboard/src/lib/data.ts b/dashboard/src/lib/data.ts
@@ -51,6 +51,8 @@ export function loadAllRuns(): Run[] {
meta.error_checking = meta.error_checking || "none";
meta.context_noise = meta.context_noise || "clean";
meta.renderer = meta.renderer || "none";
+ meta.provider = meta.provider || "anthropic";
+ meta.actual_model = meta.actual_model || meta.model;
// Compute short IDs if not in meta (backwards compat)
if (!meta.short_id && meta.run_id) {
diff --git a/dashboard/src/lib/types.ts b/dashboard/src/lib/types.ts
@@ -26,6 +26,8 @@ export interface RunMeta {
error_checking: string;
context_noise: string;
renderer: string;
+ provider: string;
+ actual_model: string;
short_id?: string;
short_cell_id?: string;
max_budget: string;
@@ -101,6 +103,7 @@ export type AxisName = keyof Pick<
| "error_checking"
| "context_noise"
| "renderer"
+ | "provider"
>;
export const AXIS_NAMES: AxisName[] = [
@@ -126,4 +129,5 @@ export const AXIS_NAMES: AxisName[] = [
"error_checking",
"context_noise",
"renderer",
+ "provider",
];
diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro
@@ -100,6 +100,7 @@ const AXIS_LABELS: Record<AxisName, string> = {
error_checking: "Error Checking",
context_noise: "Context Noise",
renderer: "Renderer",
+ provider: "Provider",
};
// Pre-compute all cell stats once
diff --git a/grid.yaml b/grid.yaml
@@ -52,6 +52,19 @@ axes:
values: ["clean", "wikipedia_25", "wikipedia_50", "wikipedia_75", "lorem_25", "lorem_50", "lorem_75"]
renderer:
values: ["none", "canvas", "svg", "dom", "webgl"]
+ provider:
+ values: ["anthropic", "zai"]
+
+providers:
+ anthropic:
+ # Default provider -- no overrides (this key's value parses as null); uses standard Claude Code auth
+ zai:
+ base_url: "https://api.z.ai/api/anthropic"
+ api_key_env: "ZAI_API_KEY"
+ model_map:
+ haiku: "glm-4.5-air"
+ sonnet: "glm-4.7"
+ opus: "glm-4.7"
exclusions:
# Haiku does not support extended thinking
@@ -66,6 +79,10 @@ exclusions:
playwright: "off"
- when:
strategy: compete
+ # opus on zai maps to glm-4.7 (the same model as sonnet), so running it is wasteful
+ - when:
+ provider: zai
+ model: opus
tasks:
- tetris
@@ -96,6 +113,7 @@ profiles:
error_checking: ["none"]
context_noise: ["clean"]
renderer: ["none"]
+ provider: ["anthropic"]
runs_per_cell: 1
core:
@@ -123,6 +141,7 @@ profiles:
error_checking: ["none"]
context_noise: ["clean"]
renderer: ["none"]
+ provider: ["anthropic"]
runs_per_cell: 3
all-on:
@@ -150,6 +169,7 @@ profiles:
error_checking: ["self_verify"]
context_noise: ["clean"]
renderer: ["canvas"]
+ provider: ["anthropic"]
runs_per_cell: 3
all-off:
@@ -177,6 +197,7 @@ profiles:
error_checking: ["none"]
context_noise: ["clean"]
renderer: ["none"]
+ provider: ["anthropic"]
runs_per_cell: 3
full:
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -42,6 +42,7 @@ AXIS_ABBREV = {
"error_checking": "echk",
"context_noise": "noise",
"renderer": "rndr",
+ "provider": "prov",
}
# Short value names for cell_id to keep paths under 255 chars
diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py
@@ -310,7 +310,7 @@ def analyze_main_effects(results_dir, metric="score"):
"task", "cell_id", "run_id", "run_number", "runs_per_cell",
"max_budget_usd", "timeout_seconds", "base_tools",
"started_at", "completed_at", "wall_time_seconds", "exit_code",
- "short_id", "short_cell_id", "claude_version",
+ "short_id", "short_cell_id", "claude_version", "actual_model",
}
axis_names = sorted(meta_keys - skip_keys)
diff --git a/harness/run.py b/harness/run.py
@@ -208,7 +208,7 @@ def build_prompt(project_dir: Path, cell: dict) -> str:
return prompt
-def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path) -> int:
+def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path, provider_config: dict = None) -> int:
"""Invoke claude CLI and capture output."""
prompt = build_prompt(project_dir, cell)
model = cell["model"]
@@ -299,6 +299,20 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path)
}
transcript_f.write(json.dumps(ctx_event) + "\n")
+ # Provider-specific env overrides
+ run_env = os.environ.copy()
+ if provider_config:
+ if provider_config.get("base_url"):
+ run_env["ANTHROPIC_BASE_URL"] = provider_config["base_url"]
+ else:
+ run_env.pop("ANTHROPIC_BASE_URL", None)
+ if provider_config.get("api_key_env"):
+ key = os.environ.get(provider_config["api_key_env"])
+ if key:
+ run_env["ANTHROPIC_AUTH_TOKEN"] = key
+ else:
+ run_env.pop("ANTHROPIC_BASE_URL", None)
+
with open(transcript_path, "a") as transcript_f, open(stderr_path, "w") as stderr_f:
try:
result = subprocess.run(
@@ -307,6 +321,7 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path)
stdout=transcript_f,
stderr=stderr_f,
timeout=timeout,
+ env=run_env,
)
exit_code = result.returncode
except subprocess.TimeoutExpired:
@@ -638,6 +653,7 @@ def run_single(
results_dir: Path,
project_dir: Path,
claude_version: str,
+ providers_config: dict = None,
) -> str:
"""Execute a single experiment run. Returns 'completed', 'skipped', or 'failed'."""
cell_id = cell["cell_id"]
@@ -656,7 +672,14 @@ def run_single(
log(f"INVALID: {run_id} - deleting and re-running")
shutil.rmtree(run_dir)
- log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
+ # Resolve provider and actual model
+ provider_name = cell.get("provider", "anthropic")
+ provider_config = (providers_config or {}).get(provider_name, {})
+ model_map = provider_config.get("model_map", {})
+ actual_model = model_map.get(model, model)
+ display_model = actual_model if provider_name != "anthropic" else model
+
+ log(f"START: {task} | {display_model} | {prompt_style} | run{run_num}")
run_dir.mkdir(parents=True, exist_ok=True)
@@ -667,6 +690,7 @@ def run_single(
"short_id": hashlib.sha256(run_id.encode()).hexdigest()[:8],
"short_cell_id": hashlib.sha256(cell_id.encode()).hexdigest()[:8],
"run_number": run_num,
+ "actual_model": actual_model,
"claude_version": claude_version,
"started_at": datetime.now(timezone.utc).isoformat(),
}
@@ -681,7 +705,7 @@ def run_single(
# Invoke claude
start_time = time.time()
- exit_code = invoke_claude(cell, workspace, run_dir, project_dir)
+ exit_code = invoke_claude(cell, workspace, run_dir, project_dir, provider_config)
wall_time = int(time.time() - start_time)
status = "ok" if exit_code == 0 else f"exit {exit_code}"
@@ -714,7 +738,7 @@ def run_single(
archive_workspace(workspace, run_dir)
result = "completed" if (run_dir / "eval_results.json").exists() else "failed"
- log(f" DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
+ log(f" DONE: {task} | {display_model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
return result
@@ -786,6 +810,7 @@ def main():
print("=" * 40)
grid = load_grid(grid_file)
+ providers_config = grid.get("providers", {})
# Build baseline override from --model flag
baseline = None
@@ -834,7 +859,7 @@ def main():
if parallel <= 1:
# Sequential
for cell, run_num in jobs:
- result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version)
+ result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config)
if result == "completed":
completed += 1
elif result == "skipped":
@@ -846,7 +871,7 @@ def main():
with ThreadPoolExecutor(max_workers=parallel) as executor:
futures = {
executor.submit(
- run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version
+ run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version, providers_config
): (cell, run_num)
for cell, run_num in jobs
}