loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit b087659035807b7db06e48ad8dd9b9bd6d911aaa
parent 1862a787fcf22188e9681812d77b7276db281f7b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 09:25:16 +0200

Add SonarQube integration for code quality analysis

sonarqube-scan.py runs sonar-scanner against game workspaces and pulls
metrics via API. Requires SonarQube running at localhost:9000.

Metrics captured:
- Bugs, vulnerabilities, code smells (count)
- Cognitive complexity (better than cyclomatic)
- Duplication percentage
- Technical debt (minutes)
- Maintainability/Reliability/Security ratings (A-E)
- Composite 0-1 score

Tested: haiku JS game scored 0.77 (1 bug, complexity 90, A maintainability)

Not yet wired into the harness - needs SonarQube running during eval.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A tasks/tetris/eval/sonarqube-scan.py | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 185 insertions(+), 0 deletions(-)

#!/usr/bin/env python3
# tasks/tetris/eval/sonarqube-scan.py
"""SonarQube code analysis for generated Tetris implementations.

Runs sonar-scanner against the workspace and pulls metrics via API.
Requires SonarQube running at localhost:9000.

Usage: python3 sonarqube-scan.py <workspace_path> <project_key>
Output: JSON to stdout

Every failure path prints a JSON object of the form
``{"error": <message>, "score": 0}`` so that a caller parsing stdout always
receives a ``score`` key.
"""

import base64
import http.client
import json
import subprocess
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path


SONAR_URL = "http://localhost:9000"
SONAR_TOKEN_FILE = Path.home() / ".sonarqube-token"

# Metric keys requested from /api/measures/component.
METRIC_KEYS = (
    "bugs", "vulnerabilities", "code_smells",
    "cognitive_complexity", "duplicated_lines_density",
    "ncloc", "sqale_rating", "reliability_rating",
    "security_rating", "sqale_index",
)

# Rating labels (SonarQube uses 1-5 where 1=A, 5=E). Keys are floats because
# get_metrics() converts every measure value with float().
RATING_LABELS = {1.0: "A", 2.0: "B", 3.0: "C", 4.0: "D", 5.0: "E"}

# Failures expected from an HTTP round trip plus JSON decoding:
# URLError/HTTPError and socket timeouts are OSError subclasses,
# json.JSONDecodeError is a ValueError subclass.
_REQUEST_ERRORS = (OSError, ValueError, http.client.HTTPException)


def _auth_headers(token: str) -> dict:
    """Return HTTP Basic auth headers using *token* as user, empty password."""
    encoded = base64.b64encode(f"{token}:".encode()).decode()
    return {"Authorization": f"Basic {encoded}"}


def _api_url(path: str, **params: str) -> str:
    """Build a SonarQube API URL with properly percent-encoded query values."""
    # safe="," keeps the comma-separated metricKeys list readable on the wire.
    return f"{SONAR_URL}{path}?{urllib.parse.urlencode(params, safe=',')}"


def _get_json(url: str, headers: dict, timeout: float):
    """GET *url* and decode the JSON body; the response is always closed."""
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.load(response)


def get_token() -> str:
    """Return the token stored in SONAR_TOKEN_FILE, or "" if it is unreadable."""
    try:
        return SONAR_TOKEN_FILE.read_text().strip()
    except OSError:
        # Missing file, permission problem, path is a directory, ...
        return ""


def scan(
    workspace: Path,
    project_key: str,
    token: str,
    timeout: int = 60,
    *,
    executable: str = "sonar-scanner",
) -> bool:
    """Run sonar-scanner against workspace. Returns True on success.

    Args:
        workspace: Directory used as the scanner's working directory.
        project_key: SonarQube project key to analyse under.
        token: SonarQube authentication token.
        timeout: Seconds to wait for the scanner process before giving up.
        executable: Name or path of the scanner binary.

    Returns:
        True when the scanner's stdout contains "EXECUTION SUCCESS".
        False when it does not, when the scanner cannot be started
        (not installed, bad workspace) or when it exceeds *timeout*.
    """
    cmd = [
        executable,
        f"-Dsonar.projectKey={project_key}",
        "-Dsonar.sources=.",
        f"-Dsonar.host.url={SONAR_URL}",
        f"-Dsonar.token={token}",
        "-Dsonar.exclusions=**/node_modules/**,**/package-lock.json,**/report/**",
        "-Dsonar.scm.disabled=true",
    ]
    try:
        result = subprocess.run(
            cmd, cwd=workspace, capture_output=True, text=True, timeout=timeout
        )
    except (OSError, subprocess.TimeoutExpired):
        # Report failure through the return value so main() can emit its
        # JSON error instead of dying with a traceback.
        return False
    return "EXECUTION SUCCESS" in result.stdout


def wait_for_analysis(project_key: str, token: str, timeout: int = 30) -> bool:
    """Wait for SonarQube to finish processing.

    Polls /api/ce/component about once per second until no queued or current
    task is PENDING or IN_PROGRESS.

    Args:
        project_key: SonarQube project key to poll.
        token: SonarQube authentication token.
        timeout: Upper bound, in seconds of wall-clock time, for the wait.

    Returns:
        True once nothing is pending, False if *timeout* elapses first.
        Failed requests are treated as "not ready yet" and retried.
    """
    url = _api_url("/api/ce/component", component=project_key)
    headers = _auth_headers(token)
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False

        try:
            # Never let a single request outlive the overall deadline.
            data = _get_json(url, headers, timeout=min(5.0, remaining))
        except _REQUEST_ERRORS:
            data = None  # server busy or unreachable: retry below

        if isinstance(data, dict):
            tasks = list(data.get("queue") or []) + [data.get("current")]
            pending = any(
                task.get("status") in ("PENDING", "IN_PROGRESS")
                for task in tasks
                if isinstance(task, dict)
            )
            if not pending:
                return True

        time.sleep(min(1.0, max(0.0, deadline - time.monotonic())))


def get_metrics(project_key: str, token: str) -> dict:
    """Pull metrics from SonarQube API.

    Returns:
        A dict mapping metric key to its value as a float, or
        ``{"error": <message>}`` when the request fails, the response cannot
        be interpreted, or no measures are returned.
    """
    url = _api_url(
        "/api/measures/component",
        component=project_key,
        metricKeys=",".join(METRIC_KEYS),
    )
    try:
        data = _get_json(url, _auth_headers(token), timeout=10)
        measures = data.get("component", {}).get("measures", [])
        metrics = {m["metric"]: float(m["value"]) for m in measures}
    except Exception as exc:
        # Deliberately broad: any failure here becomes a reported error value
        # rather than a crash, and the message is surfaced to the caller.
        return {"error": str(exc)}

    if not metrics:
        # Without this, an empty result would score a perfect 1.0.
        return {"error": f"SonarQube returned no measures for {project_key}"}
    return metrics


def compute_score(metrics: dict) -> float:
    """Compute a 0-1 score from SonarQube metrics.

    Starts from 100 points, subtracts the penalties below, clamps to
    [0, 100] and divides by 100. Missing metrics count as 0. A metrics dict
    containing an "error" key scores 0.0.
    """
    if "error" in metrics:
        return 0.0

    score = 100.0

    # Bugs: -15 each, max -30
    score -= min(metrics.get("bugs", 0) * 15, 30)

    # Code smells: -3 each, max -20
    score -= min(metrics.get("code_smells", 0) * 3, 20)

    # Vulnerabilities: -20 each, max -40
    score -= min(metrics.get("vulnerabilities", 0) * 20, 40)

    # Cognitive complexity: -5 above 50, -15 above 100
    complexity = metrics.get("cognitive_complexity", 0)
    if complexity > 100:
        score -= 15
    elif complexity > 50:
        score -= 5

    # Duplication: -5 above 5%, -10 above 10%
    duplication = metrics.get("duplicated_lines_density", 0)
    if duplication > 10:
        score -= 10
    elif duplication > 5:
        score -= 5

    return max(0.0, min(100.0, score)) / 100.0


def build_result(metrics: dict) -> dict:
    """Turn the dict from get_metrics() into the JSON-ready report.

    An "error" entry in *metrics* is propagated as
    ``{"error": <message>, "score": 0}`` instead of being hidden behind an
    all-zero report.
    """
    if "error" in metrics:
        return {"error": metrics["error"], "score": 0}

    def rating(key: str) -> str:
        return RATING_LABELS.get(metrics.get(key, 0), "?")

    return {
        "bugs": int(metrics.get("bugs", 0)),
        "vulnerabilities": int(metrics.get("vulnerabilities", 0)),
        "code_smells": int(metrics.get("code_smells", 0)),
        "cognitive_complexity": int(metrics.get("cognitive_complexity", 0)),
        "lines_of_code": int(metrics.get("ncloc", 0)),
        "duplication_pct": metrics.get("duplicated_lines_density", 0),
        "tech_debt_minutes": int(metrics.get("sqale_index", 0)),
        "maintainability": rating("sqale_rating"),
        "reliability": rating("reliability_rating"),
        "security": rating("security_rating"),
        "score": round(compute_score(metrics), 2),
    }


def _print_error(message: str) -> None:
    """Print the JSON error object shared by all failure paths."""
    print(json.dumps({"error": message, "score": 0}))


def main():
    """Scan the workspace given on the command line and print a JSON report.

    argv[1] is the workspace path (default "."), argv[2] the project key
    (default "tetris-eval").
    """
    workspace = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    project_key = sys.argv[2] if len(sys.argv) > 2 else "tetris-eval"

    token = get_token()
    if not token:
        _print_error("no SonarQube token found")
        return

    # Check if SonarQube is running
    try:
        with urllib.request.urlopen(f"{SONAR_URL}/api/system/status", timeout=3):
            pass
    except _REQUEST_ERRORS:
        _print_error("SonarQube not running at localhost:9000")
        return

    # Run scan
    if not scan(workspace, project_key, token):
        _print_error("sonar-scanner failed")
        return

    # Wait for processing; still fetch on timeout, but flag it below.
    analysis_done = wait_for_analysis(project_key, token)

    result = build_result(get_metrics(project_key, token))
    if "error" in result:
        print(json.dumps(result))
        return

    if not analysis_done:
        # The measures may predate this scan; tell the consumer.
        result["warning"] = (
            "timed out waiting for SonarQube analysis; metrics may be stale"
        )

    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()

Impressum · Datenschutz