loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit e82be6aca0708fad30ff11975bd16e5be13f53ff
parent 6678831b7fac8cd35467d4539afc9ce70d68d388
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Wed, 15 Apr 2026 13:47:11 +0200

Fix compute_grid OOM: fail on unknown profile, stream via generator, dispatch DOE designs

Three fixes in one pass:

1. get_axes() used to silently fall back to the full top-level grid when
   given an unknown profile name. With 23 axes this expands to ~40B
   cartesian combinations, and the process OOMed the host (7.6GB+ before
   it swap-stormed into D-state). Now it raises ValueError listing the
   known profiles.

2. compute_cells() accumulated every cell in a list before returning.
   Even with lazy itertools.product, accumulating every cell in a list
   defeats the laziness. It is now a generator yielding one cell at a time.
   Streaming the 'full' profile now peaks at ~12MB RSS instead of
   unbounded growth. The only in-repo consumer (harness/run.py) already
   materializes via a list comprehension, so the change is transparent
   there.

3. compute_grid.py now recognizes the DOE design names (main_effects,
   plackett_burman, interaction_hunt) and dispatches to
   experiment_design.py. Previously 'compute_grid.py grid.yaml
   main_effects' triggered the silent fallback (bug #1) because
   main_effects is a design, not a profile. Now it produces the
   expected one-at-a-time sweep.

Unknown names now print the full list of valid profiles and designs
instead of silently misbehaving.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/lib/compute_grid.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 51 insertions(+), 28 deletions(-)

diff --git a/harness/lib/compute_grid.py b/harness/lib/compute_grid.py @@ -95,14 +95,18 @@ def load_grid(path): def get_axes(grid, profile_name): - """Get axis values for a given profile, falling back to top-level axes.""" + """Get axis values for a given profile. Raises if the profile is unknown.""" top_axes = {name: spec["values"] for name, spec in grid["axes"].items()} + profiles = grid.get("profiles", {}) - if profile_name not in grid.get("profiles", {}): - return top_axes + if profile_name not in profiles: + raise ValueError( + f"unknown profile '{profile_name}'. Known profiles: {sorted(profiles.keys())}" + ) - profile = grid["profiles"][profile_name] + profile = profiles[profile_name] if "axes" not in profile: + # Profile intentionally omits axes (e.g. 'full') to use the full top-level grid. return top_axes # Profile axes override top-level axes @@ -141,14 +145,17 @@ def apply_task_overrides(axes, task, grid): def compute_cells(grid, profile_name): + """Yield one cell dict at a time. + + Streams the cartesian product so peak memory stays at O(1 cell) regardless + of profile size. Callers that need a list should wrap with list(...). 
+ """ base_axes = get_axes(grid, profile_name) runs_per_cell = get_runs_per_cell(grid, profile_name) exclusions = grid.get("exclusions", []) tasks = grid["tasks"] defaults = grid["defaults"] - cells = [] - for task in tasks: axes = apply_task_overrides(base_axes, task, grid) axis_names = sorted(axes.keys()) @@ -157,13 +164,7 @@ def compute_cells(grid, profile_name): for combo in product(*axis_values): cell = dict(zip(axis_names, combo)) - # Check exclusions - excluded = False - for exclusion in exclusions: - if matches_exclusion(cell, exclusion): - excluded = True - break - if excluded: + if any(matches_exclusion(cell, e) for e in exclusions): continue # actual_model = model (no mapping needed, models are their real names) @@ -171,36 +172,58 @@ def compute_cells(grid, profile_name): # Build cell ID from task + abbreviated axis values (deterministic, filename-safe) cell_id_parts = [task] + [f"{AXIS_ABBREV.get(k, k)}={VALUE_ABBREV.get(str(cell[k]), cell[k])}" for k in axis_names] - cell_id = "_".join(cell_id_parts) - - # Resolve budget value - budget_key = cell.get("max_budget", "low") - budget_usd = defaults["budget"].get(budget_key, 0.50) + cell["cell_id"] = "_".join(cell_id_parts) cell["task"] = task - cell["cell_id"] = cell_id cell["runs_per_cell"] = runs_per_cell - cell["max_budget_usd"] = budget_usd + cell["max_budget_usd"] = defaults["budget"].get(cell.get("max_budget", "low"), 0.50) cell["timeout_seconds"] = defaults["timeout_seconds"] - cells.append(cell) + yield cell + - return cells +DESIGNS = ("main_effects", "plackett_burman", "interaction_hunt") def main(): if len(sys.argv) < 2: - print("Usage: compute_grid.py <grid_file> [profile]", file=sys.stderr) + print("Usage: compute_grid.py <grid_file> [profile|design] [design_args]", file=sys.stderr) + print(" interaction_hunt takes a 3rd arg: comma-separated axis names", file=sys.stderr) sys.exit(1) grid_file = sys.argv[1] - profile = sys.argv[2] if len(sys.argv) > 2 else "smoke" + name = sys.argv[2] if 
len(sys.argv) > 2 else "smoke" grid = load_grid(grid_file) - cells = compute_cells(grid, profile) - - for cell in cells: - print(json.dumps(cell)) + profiles = grid.get("profiles", {}) + + if name in profiles: + for cell in compute_cells(grid, name): + print(json.dumps(cell)) + return + + if name in DESIGNS: + from experiment_design import ( + main_effects_plan, + plackett_burman_plan, + interaction_hunt_plan, + ) + if name == "main_effects": + cells = main_effects_plan(grid) + elif name == "plackett_burman": + cells = plackett_burman_plan(grid) + else: # interaction_hunt + if len(sys.argv) < 4: + print("ERROR: interaction_hunt requires comma-separated axis names as 3rd arg", file=sys.stderr) + sys.exit(1) + cells = interaction_hunt_plan(grid, sys.argv[3].split(",")) + for cell in cells: + print(json.dumps(cell)) + return + + known = sorted(profiles.keys()) + list(DESIGNS) + print(f"ERROR: unknown profile or design '{name}'. Known: {known}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__":

Impressum · Datenschutz