loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit fec57ee83892b809897124d42887597de29fa9b8
parent fe686981c1fb42be99b4b1a078c818496970dad6
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 15:54:34 +0200

Fix serve process leak in gameplay bot eval

The gameplay bot starts an HTTP server (npx serve) per eval run. If Playwright
times out or crashes, the afterAll cleanup never runs and the serve processes
accumulate. Found 684 orphaned serve processes consuming ~24 GB of memory.

Fix: use Popen with start_new_session=True so playwright + child serve
processes share a process group. Kill the entire group via os.killpg() in
a finally block, ensuring cleanup on success, failure, and timeout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py | 64 +++++++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/harness/run.py b/harness/run.py @@ -21,6 +21,7 @@ Usage: import json import os +import signal import shutil import subprocess import sys @@ -324,44 +325,61 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path): bot_env = os.environ.copy() bot_env["WORKSPACE_PATH"] = str(workspace) bot_env["REPORT_OUTPUT_PATH"] = str(report_path) - bot_result = subprocess.run( + bot_proc = subprocess.Popen( ["npx", "playwright", "test", "--config", str(playwright_config)], cwd=str(PROJECT_DIR), - capture_output=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, - timeout=180, env=bot_env, + start_new_session=True, ) - if report_path.exists(): - report_data = json.loads(report_path.read_text()) - summary = report_data.get("summary", {}) - results["gameplay_bot"] = { - "pass": summary.get("failed", 1) == 0, - "score": summary.get("score", 0), - "total": summary.get("total", 0), - "passed": summary.get("passed", 0), - "failed": summary.get("failed", 0), - "report": report_data, - } - else: + try: + stdout, stderr = bot_proc.communicate(timeout=180) + except subprocess.TimeoutExpired: + # Kill entire process group (playwright + child serve processes) + try: + os.killpg(os.getpgid(bot_proc.pid), signal.SIGTERM) + except Exception: + pass + bot_proc.kill() + bot_proc.wait() results["gameplay_bot"] = { "pass": False, "score": 0, - "error": f"Report file not created. Exit code: {bot_result.returncode}. 
" - f"stderr: {bot_result.stderr[:1000]}", + "error": "Gameplay bot timed out after 180 seconds", } + else: + if report_path.exists(): + report_data = json.loads(report_path.read_text()) + summary = report_data.get("summary", {}) + results["gameplay_bot"] = { + "pass": summary.get("failed", 1) == 0, + "score": summary.get("score", 0), + "total": summary.get("total", 0), + "passed": summary.get("passed", 0), + "failed": summary.get("failed", 0), + "report": report_data, + } + else: + results["gameplay_bot"] = { + "pass": False, + "score": 0, + "error": f"Report file not created. Exit code: {bot_proc.returncode}. " + f"stderr: {stderr[:1000] if stderr else ''}", + } + finally: + # Always clean up the process group to prevent orphaned serve processes + try: + os.killpg(os.getpgid(bot_proc.pid), signal.SIGTERM) + except Exception: + pass except FileNotFoundError: results["gameplay_bot"] = { "pass": False, "score": 0, "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test", } - except subprocess.TimeoutExpired: - results["gameplay_bot"] = { - "pass": False, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds", - } except Exception as e: results["gameplay_bot"] = { "pass": False,

Impressum · Datenschutz