loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit d42b0c8c388e6c37c066c61580ddc83ca243222d
parent 9a92d93df24f576c70bd662ec11a72aa97951b93
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 12:34:56 +0200

Stop deleting turns=1 and timeout runs as invalid

GLM models legitimately complete in 1 turn. Timeouts can produce
valid work. Only reject: turns=0 (no work), missing output, invalid API key.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/clean-and-reeval.py | 10 ++++------
M harness/run.py              | 70 ++++++++++++++++++++++++++++------------------------------------------
2 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/harness/clean-and-reeval.py b/harness/clean-and-reeval.py
@@ -48,12 +48,10 @@ def clean_bad_runs():
         exit_code = meta.get("exit_code")
         result_text = output.get("result", "")
 
-        if cost in (None, 0):
-            bad, reason = True, f"cost={cost}"
-        elif turns in (None, 1):
-            bad, reason = True, f"turns={turns}"
-        elif exit_code == 124:
-            bad, reason = True, "timeout"
+        if cost is None and turns in (None, 0):
+            bad, reason = True, "no cost and no turns"
+        elif turns in (None, 0) and (cost is None or cost == 0):
+            bad, reason = True, f"turns={turns}, cost={cost}"
         elif "Invalid API key" in str(result_text):
             bad, reason = True, "invalid API key"
 
diff --git a/harness/run.py b/harness/run.py
@@ -591,58 +591,44 @@ def log(msg: str):
 def is_valid_run(run_dir: Path) -> bool:
     """Check whether a completed run directory contains valid results.
 
-    Returns False (invalid) if any of these are true:
-    - claude_output.json has total_cost_usd of 0, null, or missing
-    - claude_output.json has num_turns of 1, null, or missing (real runs always have >1)
-    - meta.json has exit_code of 124 (timeout)
-    - claude_output.json contains "Invalid API key" in the result field
-    - transcript.jsonl has fewer than 5 lines (too short to be a real session)
+    Returns False (invalid) only for unambiguous failures:
+    - claude_output.json missing entirely
+    - num_turns is 0 or null (no work done at all)
+    - "Invalid API key" in the result field
+    - transcript.jsonl missing or empty
+
+    Does NOT reject: turns=1 (GLM models complete in 1 turn),
+    timeouts (may have produced valid work), cost=0 with turns>0.
     """
-    # Check meta.json for timeout
-    meta_path = run_dir / "meta.json"
-    if meta_path.exists():
-        try:
-            meta = json.loads(meta_path.read_text())
-            if meta.get("exit_code") == 124:
-                return False
-        except (json.JSONDecodeError, OSError):
-            return False
-
-    # Check transcript.jsonl line count
+    # Check transcript exists
     transcript_path = run_dir / "transcript.jsonl"
-    if transcript_path.exists():
-        try:
-            lines = transcript_path.read_text().strip().split("\n")
-            if len(lines) < 5:
-                return False
-        except OSError:
+    if not transcript_path.exists():
+        return False
+    try:
+        lines = transcript_path.read_text().strip().split("\n")
+        if len(lines) < 3:
             return False
-    else:
+    except OSError:
         return False
 
     # Check claude_output.json
     output_path = run_dir / "claude_output.json"
-    if output_path.exists():
-        try:
-            output = json.loads(output_path.read_text())
-        except (json.JSONDecodeError, OSError):
-            return False
+    if not output_path.exists():
+        return False
 
-        # total_cost_usd: 0, null, or missing
-        cost = output.get("total_cost_usd")
-        if not cost: # catches None, 0, 0.0, and missing (None from .get)
-            return False
+    try:
+        output = json.loads(output_path.read_text())
+    except (json.JSONDecodeError, OSError):
+        return False
 
-        # num_turns: 1, null, or missing
-        num_turns = output.get("num_turns")
-        if num_turns is None or num_turns <= 1:
-            return False
+    # num_turns: 0 or null = no work done
+    num_turns = output.get("num_turns")
+    if num_turns is None or num_turns == 0:
+        return False
 
-        # "Invalid API key" in result field
-        result_text = output.get("result", "")
-        if isinstance(result_text, str) and "Invalid API key" in result_text:
-            return False
-    else:
+    # "Invalid API key" in result field
+    result_text = output.get("result", "")
+    if isinstance(result_text, str) and "Invalid API key" in result_text:
         return False
 
     return True

Impressum · Datenschutz