Stop deleting turns=1 and timeout runs as invalid - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit d42b0c8c388e6c37c066c61580ddc83ca243222d
parent 9a92d93df24f576c70bd662ec11a72aa97951b93
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 12:34:56 +0200

Stop deleting turns=1 and timeout runs as invalid

GLM models legitimately complete in 1 turn. Timeouts can produce
valid work. Only reject: turns=0 (no work), missing output, invalid API key.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/clean-and-reeval.py  | 10 ++++------
M harness/run.py  | 70 ++++++++++++++++++++++++++++------------------------------------------

2 files changed, 32 insertions(+), 48 deletions(-)
diff --git a/harness/clean-and-reeval.py b/harness/clean-and-reeval.py
@@ -48,12 +48,10 @@ def clean_bad_runs():
                 exit_code = meta.get("exit_code")
                 result_text = output.get("result", "")
 
-                if cost in (None, 0):
-                    bad, reason = True, f"cost={cost}"
-                elif turns in (None, 1):
-                    bad, reason = True, f"turns={turns}"
-                elif exit_code == 124:
-                    bad, reason = True, "timeout"
+                if cost is None and turns in (None, 0):
+                    bad, reason = True, "no cost and no turns"
+                elif turns in (None, 0) and (cost is None or cost == 0):
+                    bad, reason = True, f"turns={turns}, cost={cost}"
                 elif "Invalid API key" in str(result_text):
                     bad, reason = True, "invalid API key"
 
diff --git a/harness/run.py b/harness/run.py
@@ -591,58 +591,44 @@ def log(msg: str):
 def is_valid_run(run_dir: Path) -> bool:
     """Check whether a completed run directory contains valid results.
 
-    Returns False (invalid) if any of these are true:
-    - claude_output.json has total_cost_usd of 0, null, or missing
-    - claude_output.json has num_turns of 1, null, or missing (real runs always have >1)
-    - meta.json has exit_code of 124 (timeout)
-    - claude_output.json contains "Invalid API key" in the result field
-    - transcript.jsonl has fewer than 5 lines (too short to be a real session)
+    Returns False (invalid) only for unambiguous failures:
+    - claude_output.json missing entirely
+    - num_turns is 0 or null (no work done at all)
+    - "Invalid API key" in the result field
+    - transcript.jsonl missing, unreadable, or shorter than 3 lines
+
+    Does NOT reject: turns=1 (GLM models complete in 1 turn),
+    timeouts (may have produced valid work), cost=0 with turns>0.
     """
-    # Check meta.json for timeout
-    meta_path = run_dir / "meta.json"
-    if meta_path.exists():
-        try:
-            meta = json.loads(meta_path.read_text())
-            if meta.get("exit_code") == 124:
-                return False
-        except (json.JSONDecodeError, OSError):
-            return False
-
-    # Check transcript.jsonl line count
+    # Check transcript exists
     transcript_path = run_dir / "transcript.jsonl"
-    if transcript_path.exists():
-        try:
-            lines = transcript_path.read_text().strip().split("\n")
-            if len(lines) < 5:
-                return False
-        except OSError:
+    if not transcript_path.exists():
+        return False
+    try:
+        lines = transcript_path.read_text().strip().split("\n")
+        if len(lines) < 3:
             return False
-    else:
+    except OSError:
         return False
 
     # Check claude_output.json
     output_path = run_dir / "claude_output.json"
-    if output_path.exists():
-        try:
-            output = json.loads(output_path.read_text())
-        except (json.JSONDecodeError, OSError):
-            return False
+    if not output_path.exists():
+        return False
 
-        # total_cost_usd: 0, null, or missing
-        cost = output.get("total_cost_usd")
-        if not cost:  # catches None, 0, 0.0, and missing (None from .get)
-            return False
+    try:
+        output = json.loads(output_path.read_text())
+    except (json.JSONDecodeError, OSError):
+        return False
 
-        # num_turns: 1, null, or missing
-        num_turns = output.get("num_turns")
-        if num_turns is None or num_turns <= 1:
-            return False
+    # num_turns: 0 or null = no work done
+    num_turns = output.get("num_turns")
+    if num_turns is None or num_turns == 0:
+        return False
 
-        # "Invalid API key" in result field
-        result_text = output.get("result", "")
-        if isinstance(result_text, str) and "Invalid API key" in result_text:
-            return False
-    else:
+    # "Invalid API key" in result field
+    result_text = output.get("result", "")
+    if isinstance(result_text, str) and "Invalid API key" in result_text:
         return False
 
     return True

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README

M	harness/clean-and-reeval.py	|	10	++++------
M	harness/run.py	|	70	++++++++++++++++++++++++++++------------------------------------------