commit e82be6aca0708fad30ff11975bd16e5be13f53ff
parent 6678831b7fac8cd35467d4539afc9ce70d68d388
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Wed, 15 Apr 2026 13:47:11 +0200
Fix compute_grid OOM: fail on unknown profile, stream via generator, dispatch DOE designs
Three fixes in one pass:
1. get_axes() used to silently fall back to the full top-level grid when
given an unknown profile name. With 23 axes this expands to ~40B
cartesian combinations, and the process OOMed the host (7.6GB+ before it
swap-stormed into D-state). Now it raises ValueError listing the
known profiles.
2. compute_cells() accumulated every cell in a list before returning.
Even with lazy itertools.product, building the intermediate list
defeats the laziness. Converted to a generator yielding one cell at a
time. Streaming the 'full' profile now peaks at ~12MB RSS instead of
growing without bound. The only in-repo consumer (harness/run.py) already
materializes via a list comprehension, so the change is transparent
there.
3. compute_grid.py now recognizes the DOE design names (main_effects,
plackett_burman, interaction_hunt) and dispatches to
experiment_design.py. Previously 'compute_grid.py grid.yaml
main_effects' triggered the silent fallback (bug #1) because
main_effects is a design, not a profile. Now it produces the
expected one-at-a-time sweep.
Unknown names now exit with an error listing the valid profiles and
designs instead of silently misbehaving.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 51 insertions(+), 28 deletions(-)
diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py
@@ -95,14 +95,18 @@ def load_grid(path):
def get_axes(grid, profile_name):
- """Get axis values for a given profile, falling back to top-level axes."""
+ """Get axis values for a given profile. Raises if the profile is unknown."""
top_axes = {name: spec["values"] for name, spec in grid["axes"].items()}
+ profiles = grid.get("profiles", {})
- if profile_name not in grid.get("profiles", {}):
- return top_axes
+ if profile_name not in profiles:
+ raise ValueError(
+ f"unknown profile '{profile_name}'. Known profiles: {sorted(profiles.keys())}"
+ )
- profile = grid["profiles"][profile_name]
+ profile = profiles[profile_name]
if "axes" not in profile:
+ # Profile intentionally omits axes (e.g. 'full') to use the full top-level grid.
return top_axes
# Profile axes override top-level axes
@@ -141,14 +145,17 @@ def apply_task_overrides(axes, task, grid):
def compute_cells(grid, profile_name):
+ """Yield one cell dict at a time.
+
+ Streams the cartesian product so peak memory stays at O(1 cell) regardless
+ of profile size. Callers that need a list should wrap with list(...).
+ """
base_axes = get_axes(grid, profile_name)
runs_per_cell = get_runs_per_cell(grid, profile_name)
exclusions = grid.get("exclusions", [])
tasks = grid["tasks"]
defaults = grid["defaults"]
- cells = []
-
for task in tasks:
axes = apply_task_overrides(base_axes, task, grid)
axis_names = sorted(axes.keys())
@@ -157,13 +164,7 @@ def compute_cells(grid, profile_name):
for combo in product(*axis_values):
cell = dict(zip(axis_names, combo))
- # Check exclusions
- excluded = False
- for exclusion in exclusions:
- if matches_exclusion(cell, exclusion):
- excluded = True
- break
- if excluded:
+ if any(matches_exclusion(cell, e) for e in exclusions):
continue
# actual_model = model (no mapping needed, models are their real names)
@@ -171,36 +172,58 @@ def compute_cells(grid, profile_name):
# Build cell ID from task + abbreviated axis values (deterministic, filename-safe)
cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names]
- cell_id = "_".join(cell_id_parts)
-
- # Resolve budget value
- budget_key = cell.get("max_budget", "low")
- budget_usd = defaults["budget"].get(budget_key, 0.50)
+ cell["cell_id"] = "_".join(cell_id_parts)
cell["task"] = task
- cell["cell_id"] = cell_id
cell["runs_per_cell"] = runs_per_cell
- cell["max_budget_usd"] = budget_usd
+ cell["max_budget_usd"] = defaults["budget"].get(cell.get("max_budget", "low"), 0.50)
cell["timeout_seconds"] = defaults["timeout_seconds"]
- cells.append(cell)
+ yield cell
+
- return cells
+DESIGNS = ("main_effects", "plackett_burman", "interaction_hunt")
def main():
if len(sys.argv) < 2:
- print("Usage: compute_grid.py <grid_file> [profile]", file=sys.stderr)
+ print("Usage: compute_grid.py <grid_file> [profile|design] [design_args]", file=sys.stderr)
+ print(" interaction_hunt takes a 3rd arg: comma-separated axis names", file=sys.stderr)
sys.exit(1)
grid_file = sys.argv[1]
- profile = sys.argv[2] if len(sys.argv) > 2 else "smoke"
+ name = sys.argv[2] if len(sys.argv) > 2 else "smoke"
grid = load_grid(grid_file)
- cells = compute_cells(grid, profile)
-
- for cell in cells:
- print(json.dumps(cell))
+ profiles = grid.get("profiles", {})
+
+ if name in profiles:
+ for cell in compute_cells(grid, name):
+ print(json.dumps(cell))
+ return
+
+ if name in DESIGNS:
+ from experiment_design import (
+ main_effects_plan,
+ plackett_burman_plan,
+ interaction_hunt_plan,
+ )
+ if name == "main_effects":
+ cells = main_effects_plan(grid)
+ elif name == "plackett_burman":
+ cells = plackett_burman_plan(grid)
+ else: # interaction_hunt
+ if len(sys.argv) < 4:
+ print("ERROR: interaction_hunt requires comma-separated axis names as 3rd arg", file=sys.stderr)
+ sys.exit(1)
+ cells = interaction_hunt_plan(grid, sys.argv[3].split(","))
+ for cell in cells:
+ print(json.dumps(cell))
+ return
+
+ known = sorted(profiles.keys()) + list(DESIGNS)
+ print(f"ERROR: unknown profile or design '{name}'. Known: {known}", file=sys.stderr)
+ sys.exit(1)
if __name__ == "__main__":