Fix argument list too long for noise cells - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 625d14b3b226e882d25a00909c6d47ab82d0080b
parent e59ff443edb659c9d21d3fef8d708bd29176f827
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Wed,  8 Apr 2026 07:17:24 +0200

Fix argument list too long for noise cells

Large prompts (>100KB from context noise) exceeded the OS argument-length limit.
For large prompts, the prompt is now written to a temp file and the command is run via bash -c, which reads the prompt back with cat.
Also deleted 20 gemma runs that had 403 auth errors.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 24 ++++++++++++++++++++----

1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -23,6 +23,7 @@ Usage:
 import hashlib
 import json
 import os
+import shlex
 import signal
 import shutil
 import subprocess
@@ -241,10 +242,16 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path,
     # Auth helper for --bare mode
     auth_helper = str(SCRIPT_DIR / "lib" / "get-oauth-token.sh")
 
-    cmd = [
+    # For large prompts (noise cells), write to temp file and read via shell
+    prompt_file = None
+    if len(prompt) > 100000:
+        prompt_file = Path(tempfile.mktemp(suffix=".txt", prefix="prompt-"))
+        prompt_file.write_text(prompt)
+
+    # Build base command (prompt added separately for large prompts)
+    cmd_base = [
         "claude",
         "--bare",
-        "-p", prompt,
         "--model", cli_model,
         "--output-format", "stream-json",
         "--verbose",
@@ -255,13 +262,22 @@ def invoke_claude(cell: dict, workspace: Path, run_dir: Path, project_dir: Path,
     ]
 
     if effort:
-        cmd.extend(["--effort", effort])
+        cmd_base.extend(["--effort", effort])
 
     # Context file
     if cell.get("context_file") == "provided":
         ctx_file = project_dir / "tasks" / cell["task"] / "context.md"
         if ctx_file.exists():
-            cmd.extend(["--append-system-prompt", ctx_file.read_text()])
+            cmd_base.extend(["--append-system-prompt", ctx_file.read_text()])
+
+    # Build final command: for large prompts, use shell to read from file
+    if prompt_file:
+        # Use shell to cat the prompt file into -p to avoid arg list limit
+        cmd_str = " ".join(shlex.quote(c) for c in cmd_base)
+        cmd = ["bash", "-c", f'{cmd_str} -p "$(cat {shlex.quote(str(prompt_file))})"']
+        use_shell = False  # already wrapped in bash -c
+    else:
+        cmd = [*cmd_base, "-p", prompt]
 
     # Run claude
     transcript_path = run_dir / "transcript.jsonl"

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README