Use actual_model in cell_ids and dashboard display - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 6b806ab1e52180e1c3fdf0cefba9d1c9e4abd7a9
parent 1cfbc6fbd0a1ee476d350fd22f9cb179ad50df71
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 19:08:24 +0200

Use actual_model in cell_ids and dashboard display

Cell IDs now show the real model name (e.g. glm-4.5-air instead of haiku)
when using non-Anthropic providers. The dashboard displays actual_model in
the grid table, charts, and sort order, falling back to model when
actual_model is not set. Migrated 2 existing zai runs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx  | 2 +-
M dashboard/src/components/Grid.tsx  | 8 ++++----
M harness/lib/compute_grid.py  | 19 +++++++++++++++++--
M harness/lib/experiment_design.py  | 15 ++++++++++++++-
M harness/migrate-run-ids.py  | 8 ++++++--

5 files changed, 42 insertions(+), 10 deletions(-)
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -94,7 +94,7 @@ function aggregateCells(runs: Run[]): CellAggregate[] {
     const cellId = run.meta.cell_id;
     if (!byCell[cellId]) {
       byCell[cellId] = {
-        model: run.meta.model,
+        model: run.meta.actual_model || run.meta.model,
         task: run.meta.task,
         scores: [],
         costs: [],
diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx
@@ -31,7 +31,7 @@ function formatRunId(run: Run): React.ReactNode {
     <span style={{ display: "inline-flex", gap: "4px", alignItems: "center", flexWrap: "wrap" }}>
       <span className="badge badge-neutral" style={{ fontSize: "0.7rem" }}>{m.task}</span>
       <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}>
-        {m.model} {m.prompt_style} {m.language}
+        {m.actual_model || m.model} {m.prompt_style} {m.language}
       </span>
     </span>
   );
@@ -48,7 +48,7 @@ type SortKey = "task" | "model" | "effort" | "prompt" | "lang" | "score" | "cost
 function getSortValue(run: Run, key: SortKey): string | number {
   switch (key) {
     case "task": return run.meta.task;
-    case "model": return run.meta.model;
+    case "model": return run.meta.actual_model || run.meta.model;
     case "effort": return run.meta.effort;
     case "prompt": return run.meta.prompt_style;
     case "lang": return run.meta.language;
@@ -246,7 +246,7 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) {
                         return (budget > 0 && cost >= budget * 0.95) || r.meta.exit_code === 124;
                       }) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>}
                     </td>
-                    <td><span className="badge badge-neutral">{g.meta.model}</span></td>
+                    <td><span className="badge badge-neutral">{g.meta.actual_model || g.meta.model}</span></td>
                     <td>{g.meta.effort}</td>
                     <td>{g.meta.prompt_style}</td>
                     <td>{g.meta.language}</td>
@@ -287,7 +287,7 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) {
                       </a>
                     </td>
                     <td>{run.meta.task}</td>
-                    <td><span className="badge badge-neutral">{run.meta.model}</span></td>
+                    <td><span className="badge badge-neutral">{run.meta.actual_model || run.meta.model}</span></td>
                     <td>{run.meta.effort}</td>
                     <td>{run.meta.prompt_style}</td>
                     <td>{run.meta.language}</td>
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -66,6 +66,9 @@ VALUE_ABBREV = {
     "lorem_25": "lor25",
     "lorem_50": "lor50",
     "lorem_75": "lor75",
+    "glm-4.5-air": "glm45air",
+    "glm-4.7": "glm47",
+    "anthropic": "anth",
 }
 
 
@@ -146,8 +149,20 @@ def compute_cells(grid, profile_name):
             if excluded:
                 continue
 
-            # Build cell ID from task + abbreviated axis values (deterministic, filename-safe)
-            cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
+            # Resolve actual_model from provider config
+            provider_name = cell.get("provider", "anthropic")
+            providers_config = grid.get("providers", {})
+            model_map = (providers_config.get(provider_name) or {}).get("model_map", {})
+            actual_model = model_map.get(cell.get("model", ""), cell.get("model", ""))
+            cell["actual_model"] = actual_model
+
+            # Build cell ID using actual_model instead of model for clarity
+            cell_id_parts = [task]
+            for k in axis_names:
+                val = cell[k]
+                if k == "model":
+                    val = actual_model  # use resolved model name in cell_id
+                cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}")
             cell_id = "_".join(cell_id_parts)
 
             # Resolve budget value
diff --git a/harness/lib/experiment_design.py b/harness/lib/experiment_design.py
@@ -447,10 +447,23 @@ def _is_excluded(cell, grid):
 def _build_cell(task, cell, defaults, grid):
     from compute_grid import AXIS_ABBREV, VALUE_ABBREV
     axis_names = sorted(cell.keys())
-    cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
+
+    # Resolve actual_model from provider config
+    provider_name = cell.get("provider", "anthropic")
+    providers_config = grid.get("providers", {})
+    model_map = (providers_config.get(provider_name) or {}).get("model_map", {})
+    actual_model = model_map.get(cell.get("model", ""), cell.get("model", ""))
+
+    cell_id_parts = [task]
+    for k in axis_names:
+        val = cell[k]
+        if k == "model":
+            val = actual_model
+        cell_id_parts.append(f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(val), val)}")
 
     result = dict(cell)
     result["task"] = task
+    result["actual_model"] = actual_model
     result["cell_id"] = "_".join(cell_id_parts)
     result["runs_per_cell"] = defaults.get("runs_per_cell", 3)
     result["timeout_seconds"] = defaults.get("timeout_seconds", 600)
diff --git a/harness/migrate-run-ids.py b/harness/migrate-run-ids.py
@@ -28,13 +28,17 @@ def compute_new_cell_id(meta: dict) -> str:
         "task", "cell_id", "run_id", "run_number", "runs_per_cell",
         "max_budget_usd", "timeout_seconds", "base_tools",
         "started_at", "completed_at", "wall_time_seconds", "exit_code",
-        "claude_version", "short_id", "short_cell_id",
+        "claude_version", "short_id", "short_cell_id", "actual_model",
     }
     axis_names = sorted(k for k in meta.keys() if k not in skip_keys)
+    actual_model = meta.get("actual_model", meta.get("model", ""))
     parts = [task]
     for k in axis_names:
+        val = meta[k]
+        if k == "model":
+            val = actual_model  # use resolved model name
         abbrev_key = AXIS_ABBREV.get(k, k)
-        abbrev_val = VALUE_ABBREV.get(str(meta[k]), str(meta[k]))
+        abbrev_val = VALUE_ABBREV.get(str(val), str(val))
         parts.append(f"{abbrev_key}={abbrev_val}")
     return "_".join(parts)

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/components/Charts.tsx	\|	2	+-
M	dashboard/src/components/Grid.tsx	\|	8	++++----
M	harness/lib/compute_grid.py	\|	19	+++++++++++++++++--
M	harness/lib/experiment_design.py	\|	15	++++++++++++++-
M	harness/migrate-run-ids.py	\|	8	++++++--