Add parallel execution to harness (-j flag) - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 7dd5ec90a306d461240d147b5312e9fddc153ba1
parent 402389afc4dfd249919a7689faa430b7ce0bb9ac
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri,  3 Apr 2026 20:30:24 +0200

Add parallel execution to harness (-j flag)

Usage: python3 harness/run.py grid.yaml main_effects -j 2

Runs N experiments concurrently with rolling parallelism (as soon as
one finishes, the next starts). Thread-safe logging, index writes,
and progress tracking.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------

1 file changed, 155 insertions(+), 87 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -5,12 +5,14 @@ Computes the experiment grid, creates isolated workspaces, invokes claude,
 runs evaluation, and stores results.
 
 Usage:
-    python3 run.py [grid_file] [profile_or_design]
+    python3 run.py [grid_file] [profile_or_design] [-j N]
 
     profile_or_design can be:
       - A profile name from grid.yaml (e.g., smoke, core, full)
       - A DOE design: main_effects, plackett_burman
       - interaction_hunt:axis1,axis2,axis3
+
+    -j N: run N experiments in parallel (default 1)
 """
 
 import json
@@ -20,7 +22,9 @@ import subprocess
 import sys
 import tarfile
 import tempfile
+import threading
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
 from pathlib import Path
 
@@ -317,9 +321,116 @@ def archive_workspace(workspace: Path, run_dir: Path):
         pass
 
 
+_print_lock = threading.Lock()
+_index_lock = threading.Lock()
+_counter_lock = threading.Lock()
+
+
+def log(msg: str):
+    with _print_lock:
+        print(msg, flush=True)
+
+
+def run_single(
+    cell: dict,
+    run_num: int,
+    results_dir: Path,
+    project_dir: Path,
+    claude_version: str,
+) -> str:
+    """Execute a single experiment run. Returns 'completed', 'skipped', or 'failed'."""
+    cell_id = cell["cell_id"]
+    task = cell["task"]
+    model = cell["model"]
+    prompt_style = cell["prompt_style"]
+    run_id = f"{cell_id}_run{run_num}"
+    run_dir = results_dir / "runs" / run_id
+
+    # Resume support
+    if (run_dir / "eval_results.json").exists():
+        log(f"SKIP: {run_id}")
+        return "skipped"
+
+    log(f"START: {task} | {model} | {prompt_style} | run{run_num}")
+
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save meta
+    meta = {
+        **cell,
+        "run_id": run_id,
+        "run_number": run_num,
+        "claude_version": claude_version,
+        "started_at": datetime.now(timezone.utc).isoformat(),
+    }
+    (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))
+
+    # Create workspace
+    try:
+        workspace = create_workspace(project_dir, task, cell)
+    except Exception as e:
+        log(f"  ERROR creating workspace for {run_id}: {e}")
+        return "failed"
+
+    # Invoke claude
+    start_time = time.time()
+    exit_code = invoke_claude(cell, workspace, run_dir, project_dir)
+    wall_time = int(time.time() - start_time)
+
+    status = "ok" if exit_code == 0 else f"exit {exit_code}"
+
+    # Update meta with timing
+    meta["wall_time_seconds"] = wall_time
+    meta["exit_code"] = exit_code
+    meta["completed_at"] = datetime.now(timezone.utc).isoformat()
+    (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))
+
+    # Evaluate
+    task_dir = project_dir / "tasks" / task
+    evaluate(task_dir, workspace, cell, run_dir)
+
+    # Append to index (thread-safe)
+    index_entry = {
+        "run_id": run_id,
+        "task": task,
+        "model": model,
+        "cell_id": cell_id,
+        "completed_at": meta["completed_at"],
+    }
+    with _index_lock:
+        with open(results_dir / "index.jsonl", "a") as f:
+            f.write(json.dumps(index_entry) + "\n")
+
+    # Archive and cleanup
+    archive_workspace(workspace, run_dir)
+
+    result = "completed" if (run_dir / "eval_results.json").exists() else "failed"
+    log(f"  DONE: {task} | {model} | {prompt_style} | run{run_num} | {status} | {wall_time}s | {result}")
+    return result
+
+
 def main():
-    grid_file = sys.argv[1] if len(sys.argv) > 1 else str(PROJECT_DIR / "grid.yaml")
-    profile = sys.argv[2] if len(sys.argv) > 2 else "smoke"
+    # Parse args
+    args = sys.argv[1:]
+    parallel = 1
+    grid_file = str(PROJECT_DIR / "grid.yaml")
+    profile = "smoke"
+
+    i = 0
+    positional = []
+    while i < len(args):
+        if args[i] == "-j" and i + 1 < len(args):
+            parallel = int(args[i + 1])
+            i += 2
+        else:
+            positional.append(args[i])
+            i += 1
+
+    if len(positional) >= 1:
+        grid_file = positional[0]
+    if len(positional) >= 2:
+        profile = positional[1]
+
     results_dir = PROJECT_DIR / "results"
     results_dir.mkdir(exist_ok=True)
     (results_dir / "runs").mkdir(exist_ok=True)
@@ -344,6 +455,7 @@ def main():
     print("=" * 40)
     print(f"Grid file:  {grid_file}")
     print(f"Profile:    {profile}")
+    print(f"Parallel:   {parallel}")
     print(f"Results:    {results_dir}")
     print("=" * 40)
 
@@ -364,103 +476,59 @@ def main():
         cells = compute_cells(grid, profile)
         print(f"Profile:    {profile}")
 
-    print(f"Grid cells: {len(cells)}")
+    # Build the full list of (cell, run_num) jobs
+    jobs = []
+    for cell in cells:
+        runs_per_cell = cell.get("runs_per_cell", 3)
+        for run_num in range(1, runs_per_cell + 1):
+            jobs.append((cell, run_num))
+
+    print(f"Total jobs:  {len(jobs)}")
     print()
 
     completed = 0
     skipped = 0
     failed = 0
 
-    for cell in cells:
-        task = cell["task"]
-        cell_id = cell["cell_id"]
-        runs_per_cell = cell.get("runs_per_cell", 3)
-        model = cell["model"]
-        prompt_style = cell["prompt_style"]
-
-        for run_num in range(1, runs_per_cell + 1):
-            run_id = f"{cell_id}_run{run_num}"
-            run_dir = results_dir / "runs" / run_id
-
-            # Resume support
-            if (run_dir / "eval_results.json").exists():
-                print(f"SKIP: {run_id}")
+    if parallel <= 1:
+        # Sequential
+        for cell, run_num in jobs:
+            result = run_single(cell, run_num, results_dir, PROJECT_DIR, claude_version)
+            if result == "completed":
+                completed += 1
+            elif result == "skipped":
                 skipped += 1
-                continue
-
-            print("-" * 40)
-            print(f"RUN:  {run_id}")
-            print(f"Task: {task} | Model: {model} | Prompt: {prompt_style}")
-            print("-" * 40)
-
-            run_dir.mkdir(parents=True, exist_ok=True)
-
-            # Save meta
-            meta = {
-                **cell,
-                "run_id": run_id,
-                "run_number": run_num,
-                "claude_version": claude_version,
-                "started_at": datetime.now(timezone.utc).isoformat(),
-            }
-            (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))
-
-            # Create workspace
-            print("  Creating workspace...")
-            try:
-                workspace = create_workspace(PROJECT_DIR, task, cell)
-                print(f"  Workspace: {workspace}")
-            except Exception as e:
-                print(f"  ERROR creating workspace: {e}")
-                failed += 1
-                continue
-
-            # Invoke claude
-            print(f"  Invoking claude (model={model})...")
-            start_time = time.time()
-            exit_code = invoke_claude(cell, workspace, run_dir, PROJECT_DIR)
-            wall_time = int(time.time() - start_time)
-
-            if exit_code == 0:
-                print("  Claude completed successfully")
             else:
-                print(f"  Claude exited with error (exit code: {exit_code})")
-
-            # Update meta with timing
-            meta["wall_time_seconds"] = wall_time
-            meta["exit_code"] = exit_code
-            meta["completed_at"] = datetime.now(timezone.utc).isoformat()
-            (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))
-
-            # Evaluate
-            print("  Running evaluation...")
-            task_dir = PROJECT_DIR / "tasks" / task
-            evaluate(task_dir, workspace, cell, run_dir)
-            print("  Evaluation complete")
-
-            # Append to index
-            index_entry = {
-                "run_id": run_id,
-                "task": task,
-                "model": model,
-                "cell_id": cell_id,
-                "completed_at": meta["completed_at"],
+                failed += 1
+    else:
+        # Parallel with rolling concurrency
+        with ThreadPoolExecutor(max_workers=parallel) as executor:
+            futures = {
+                executor.submit(
+                    run_single, cell, run_num, results_dir, PROJECT_DIR, claude_version
+                ): (cell, run_num)
+                for cell, run_num in jobs
             }
-            with open(results_dir / "index.jsonl", "a") as f:
-                f.write(json.dumps(index_entry) + "\n")
 
-            # Archive and cleanup
-            print("  Archiving workspace...")
-            archive_workspace(workspace, run_dir)
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                except Exception as e:
+                    log(f"  ERROR: {e}")
+                    result = "failed"
 
-            if (run_dir / "eval_results.json").exists():
-                completed += 1
-            else:
-                failed += 1
+                with _counter_lock:
+                    if result == "completed":
+                        completed += 1
+                    elif result == "skipped":
+                        skipped += 1
+                    else:
+                        failed += 1
 
-            print(f"  Done. ({completed} completed, {skipped} skipped, {failed} failed)")
-            print()
+                    total_done = completed + skipped + failed
+                    log(f"  Progress: {total_done}/{len(jobs)} ({completed} completed, {skipped} skipped, {failed} failed)")
 
+    print()
     print("=" * 40)
     print("All runs complete.")
     print(f"Completed: {completed} | Skipped: {skipped} | Failed: {failed}")

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README