loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 6b806ab1e52180e1c3fdf0cefba9d1c9e4abd7a9
parent 1cfbc6fbd0a1ee476d350fd22f9cb179ad50df71
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 19:08:24 +0200

Use actual_model in cell_ids and dashboard display

Cell IDs now show the real model name (e.g. glm-4.5-air instead of
haiku) when using non-Anthropic providers. The dashboard displays
actual_model in the grid table, charts, and sort order. Migrated 2
existing zai runs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx | 2 +-
M dashboard/src/components/Grid.tsx | 8 ++++----
M harness/lib/compute_grid.py | 19 +++++++++++++++++--
M harness/lib/experiment_design.py | 15 ++++++++++++++-
M harness/migrate-run-ids.py | 8 ++++++--
5 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx @@ -94,7 +94,7 @@ function aggregateCells(runs: Run[]): CellAggregate[] { const cellId = run.meta.cell_id; if (!byCell[cellId]) { byCell[cellId] = { - model: run.meta.model, + model: run.meta.actual_model || run.meta.model, task: run.meta.task, scores: [], costs: [], diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx @@ -31,7 +31,7 @@ function formatRunId(run: Run): React.ReactNode { <span style={{ display: "inline-flex", gap: "4px", alignItems: "center", flexWrap: "wrap" }}> <span className="badge badge-neutral" style={{ fontSize: "0.7rem" }}>{m.task}</span> <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}> - {m.model} {m.prompt_style} {m.language} + {m.actual_model || m.model} {m.prompt_style} {m.language} </span> </span> ); @@ -48,7 +48,7 @@ type SortKey = "task" | "model" | "effort" | "prompt" | "lang" | "score" | "cost function getSortValue(run: Run, key: SortKey): string | number { switch (key) { case "task": return run.meta.task; - case "model": return run.meta.model; + case "model": return run.meta.actual_model || run.meta.model; case "effort": return run.meta.effort; case "prompt": return run.meta.prompt_style; case "lang": return run.meta.language; @@ -246,7 +246,7 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { return (budget > 0 && cost >= budget * 0.95) || r.meta.exit_code === 124; }) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>} </td> - <td><span className="badge badge-neutral">{g.meta.model}</span></td> + <td><span className="badge badge-neutral">{g.meta.actual_model || g.meta.model}</span></td> <td>{g.meta.effort}</td> <td>{g.meta.prompt_style}</td> <td>{g.meta.language}</td> @@ -287,7 +287,7 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { </a> </td> <td>{run.meta.task}</td> - 
<td><span className="badge badge-neutral">{run.meta.model}</span></td> + <td><span className="badge badge-neutral">{run.meta.actual_model || run.meta.model}</span></td> <td>{run.meta.effort}</td> <td>{run.meta.prompt_style}</td> <td>{run.meta.language}</td> diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py @@ -66,6 +66,9 @@ VALUE_ABBREV = { "lorem_25": "lor25", "lorem_50": "lor50", "lorem_75": "lor75", + "glm-4.5-air": "glm45air", + "glm-4.7": "glm47", + "anthropic": "anth", } @@ -146,8 +149,20 @@ def compute_cells(grid, profile_name): if excluded: continue - # Build cell ID from task + abbreviated axis values (deterministic, filename-safe) - cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names] + # Resolve actual_model from provider config + provider_name = cell.get("provider", "anthropic") + providers_config = grid.get("providers", {}) + model_map = (providers_config.get(provider_name) or {}).get("model_map", {}) + actual_model = model_map.get(cell.get("model", ""), cell.get("model", "")) + cell["actual_model"] = actual_model + + # Build cell ID using actual_model instead of model for clarity + cell_id_parts = [task] + for k in axis_names: + val = cell[k] + if k == "model": + val = actual_model # use resolved model name in cell_id + cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}") cell_id = "_".join(cell_id_parts) # Resolve budget value diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py @@ -447,10 +447,23 @@ def _is_excluded(cell, grid): def _build_cell(task, cell, defaults, grid): from compute_grid import AXIS_ABBREV, VALUE_ABBREV axis_names = sorted(cell.keys()) - cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names] + + # Resolve actual_model from provider config + provider_name = cell.get("provider", "anthropic") + providers_config = 
grid.get("providers", {}) + model_map = (providers_config.get(provider_name) or {}).get("model_map", {}) + actual_model = model_map.get(cell.get("model", ""), cell.get("model", "")) + + cell_id_parts = [task] + for k in axis_names: + val = cell[k] + if k == "model": + val = actual_model + cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}") result = dict(cell) result["task"] = task + result["actual_model"] = actual_model result["cell_id"] = "_".join(cell_id_parts) result["runs_per_cell"] = defaults.get("runs_per_cell", 3) result["timeout_seconds"] = defaults.get("timeout_seconds", 600) diff --git a/harness/migrate-run-ids.py b/harness/migrate-run-ids.py @@ -28,13 +28,17 @@ def compute_new_cell_id(meta: dict) -> str: "task", "cell_id", "run_id", "run_number", "runs_per_cell", "max_budget_usd", "timeout_seconds", "base_tools", "started_at", "completed_at", "wall_time_seconds", "exit_code", - "claude_version", "short_id", "short_cell_id", + "claude_version", "short_id", "short_cell_id", "actual_model", } axis_names = sorted(k for k in meta.keys() if k not in skip_keys) + actual_model = meta.get("actual_model", meta.get("model", "")) parts = [task] for k in axis_names: + val = meta[k] + if k == "model": + val = actual_model # use resolved model name abbrev_key = AXIS_ABBREV.get(k, k) - abbrev_val = VALUE_ABBREV.get(str(meta[k]), str(meta[k])) + abbrev_val = VALUE_ABBREV.get(str(val), str(val)) parts.append(f"{abbrev_key}={abbrev_val}") return "_".join(parts)

Impressum · Datenschutz