Fix serve process leak in gameplay bot eval - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit fec57ee83892b809897124d42887597de29fa9b8
parent fe686981c1fb42be99b4b1a078c818496970dad6
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 15:54:34 +0200

Fix serve process leak in gameplay bot eval

The gameplay bot starts an HTTP server (npx serve) for each eval run. If
Playwright times out or crashes, its afterAll cleanup never runs, so the
serve processes accumulate. We found 684 orphaned serve processes
consuming ~24 GB of memory.

Fix: launch Playwright via Popen with start_new_session=True so that
Playwright and its child serve processes share one process group, separate
from the harness. Kill the entire group via os.killpg() in a finally block,
ensuring cleanup on success, failure, and timeout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 64 +++++++++++++++++++++++++++++++++++++++++-----------------------

1 file changed, 41 insertions(+), 23 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -21,6 +21,7 @@ Usage:
 
 import json
 import os
+import signal
 import shutil
 import subprocess
 import sys
@@ -324,44 +325,61 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
                 bot_env = os.environ.copy()
                 bot_env["WORKSPACE_PATH"] = str(workspace)
                 bot_env["REPORT_OUTPUT_PATH"] = str(report_path)
-                bot_result = subprocess.run(
+                bot_proc = subprocess.Popen(
                     ["npx", "playwright", "test", "--config", str(playwright_config)],
                     cwd=str(PROJECT_DIR),
-                    capture_output=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
                     text=True,
-                    timeout=180,
                     env=bot_env,
+                    start_new_session=True,
                 )
-                if report_path.exists():
-                    report_data = json.loads(report_path.read_text())
-                    summary = report_data.get("summary", {})
-                    results["gameplay_bot"] = {
-                        "pass": summary.get("failed", 1) == 0,
-                        "score": summary.get("score", 0),
-                        "total": summary.get("total", 0),
-                        "passed": summary.get("passed", 0),
-                        "failed": summary.get("failed", 0),
-                        "report": report_data,
-                    }
-                else:
+                try:
+                    stdout, stderr = bot_proc.communicate(timeout=180)
+                except subprocess.TimeoutExpired:
+                    # Kill entire process group (playwright + child serve processes)
+                    try:
+                        os.killpg(os.getpgid(bot_proc.pid), signal.SIGTERM)
+                    except Exception:
+                        pass
+                    bot_proc.kill()
+                    bot_proc.wait()
                     results["gameplay_bot"] = {
                         "pass": False,
                         "score": 0,
-                        "error": f"Report file not created. Exit code: {bot_result.returncode}. "
-                                 f"stderr: {bot_result.stderr[:1000]}",
+                        "error": "Gameplay bot timed out after 180 seconds",
                     }
+                else:
+                    if report_path.exists():
+                        report_data = json.loads(report_path.read_text())
+                        summary = report_data.get("summary", {})
+                        results["gameplay_bot"] = {
+                            "pass": summary.get("failed", 1) == 0,
+                            "score": summary.get("score", 0),
+                            "total": summary.get("total", 0),
+                            "passed": summary.get("passed", 0),
+                            "failed": summary.get("failed", 0),
+                            "report": report_data,
+                        }
+                    else:
+                        results["gameplay_bot"] = {
+                            "pass": False,
+                            "score": 0,
+                            "error": f"Report file not created. Exit code: {bot_proc.returncode}. "
+                                     f"stderr: {stderr[:1000] if stderr else ''}",
+                        }
+                finally:
+                    # Always clean up the process group to prevent orphaned serve processes
+                    try:
+                        os.killpg(os.getpgid(bot_proc.pid), signal.SIGTERM)
+                    except Exception:
+                        pass
             except FileNotFoundError:
                 results["gameplay_bot"] = {
                     "pass": False,
                     "score": 0,
                     "error": "Playwright (npx) not found. Install with: npm install -D @playwright/test",
                 }
-            except subprocess.TimeoutExpired:
-                results["gameplay_bot"] = {
-                    "pass": False,
-                    "score": 0,
-                    "error": "Gameplay bot timed out after 180 seconds",
-                }
             except Exception as e:
                 results["gameplay_bot"] = {
                     "pass": False,

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README