Add SonarQube integration for code quality analysis - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit b087659035807b7db06e48ad8dd9b9bd6d911aaa
parent 1862a787fcf22188e9681812d77b7276db281f7b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 09:25:16 +0200

Add SonarQube integration for code quality analysis

sonarqube-scan.py runs sonar-scanner against game workspaces and pulls
metrics via API. Requires SonarQube running at localhost:9000.

Metrics captured:
- Bugs, vulnerabilities, code smells (count)
- Cognitive complexity (a better proxy for how hard the code is to read than cyclomatic complexity)
- Duplication percentage
- Technical debt (minutes)
- Maintainability/Reliability/Security ratings (A-E)
- Composite 0-1 score

Tested: haiku JS game scored 0.77 (1 bug, complexity 90, A maintainability)

Not yet wired into the harness: that requires a SonarQube server to be running during eval.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A tasks/tetris/eval/sonarqube-scan.py  | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 185 insertions(+), 0 deletions(-)
diff --git a/tasks/tetris/eval/sonarqube-scan.py b/tasks/tetris/eval/sonarqube-scan.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""SonarQube code analysis for generated Tetris implementations.
+
+Runs sonar-scanner against the workspace and pulls metrics via API.
+Requires SonarQube running at localhost:9000.
+
+Usage: python3 sonarqube-scan.py <workspace_path> <project_key>
+Output: JSON to stdout
+"""
+
+import json
+import subprocess
+import sys
+import time
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+
+# Base URL of the SonarQube server used for both scanning and API queries.
+SONAR_URL = "http://localhost:9000"
+# File in the user's home directory that holds the SonarQube access token.
+SONAR_TOKEN_FILE = Path.home().joinpath(".sonarqube-token")
+
+
+def get_token() -> str:
+    """Return the SonarQube token stored in SONAR_TOKEN_FILE.
+
+    Surrounding whitespace is stripped from the file contents. An empty
+    string is returned when the token file does not exist.
+    """
+    if not SONAR_TOKEN_FILE.exists():
+        return ""
+    raw = SONAR_TOKEN_FILE.read_text()
+    return raw.strip()
+
+
+def scan(workspace: Path, project_key: str, token: str) -> bool:
+    """Run sonar-scanner against workspace. Returns True on success.
+
+    Args:
+        workspace: Directory to analyse; used as the scanner's working
+            directory, with ``sonar.sources`` set to ``.``.
+        project_key: Passed to the scanner as ``sonar.projectKey``.
+        token: Passed to the scanner as ``sonar.token``.
+
+    Returns:
+        True when the scanner printed "EXECUTION SUCCESS" on stdout.
+        False otherwise, including when the scanner could not be started
+        or did not finish within 60 seconds.
+    """
+    cmd = [
+        "sonar-scanner",
+        f"-Dsonar.projectKey={project_key}",
+        "-Dsonar.sources=.",
+        f"-Dsonar.host.url={SONAR_URL}",
+        f"-Dsonar.token={token}",
+        "-Dsonar.exclusions=**/node_modules/**,**/package-lock.json,**/report/**",
+        "-Dsonar.scm.disabled=true",
+    ]
+    try:
+        result = subprocess.run(
+            cmd, cwd=workspace, capture_output=True, text=True, timeout=60
+        )
+    except (OSError, subprocess.TimeoutExpired):
+        # OSError: sonar-scanner is missing/not executable or the workspace
+        # directory is unusable. TimeoutExpired: the scanner exceeded the
+        # 60 s limit. Both are scan failures, not reasons to crash.
+        return False
+    return "EXECUTION SUCCESS" in result.stdout
+
+
+def wait_for_analysis(project_key: str, token: str, timeout: int = 30) -> bool:
+    """Wait for SonarQube to finish processing.
+
+    Polls ``/api/ce/component`` roughly once per second until no task for
+    ``project_key`` is reported with status PENDING or IN_PROGRESS.
+
+    Args:
+        project_key: Project key sent as the ``component`` query parameter.
+        token: SonarQube token, sent as the user name of HTTP Basic auth
+            with an empty password.
+        timeout: Maximum time to keep polling, in seconds.
+
+    Returns:
+        True as soon as a poll shows no pending or in-progress task, False
+        if ``timeout`` seconds pass without such a poll.
+    """
+    import base64
+    from http.client import HTTPException
+    from urllib.parse import quote
+
+    auth = base64.b64encode(f"{token}:".encode()).decode()
+    headers = {"Authorization": f"Basic {auth}"}
+    # Percent-encode the key so characters such as '&', '#' or spaces cannot
+    # alter the query string.
+    url = f"{SONAR_URL}/api/ce/component?component={quote(project_key, safe='')}"
+
+    # A wall-clock deadline keeps the total wait bounded by `timeout` even
+    # when individual requests are slow.
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        data = None
+        try:
+            req = urllib.request.Request(url, headers=headers)
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                data = json.loads(resp.read())
+        except (OSError, ValueError, HTTPException):
+            # Network/HTTP failure or malformed JSON: treat as "not known to
+            # be finished yet" and poll again until the deadline.
+            pass
+
+        if isinstance(data, dict):
+            # "queue" and "current" may be missing or null in the response.
+            tasks = list(data.get("queue") or []) + [data.get("current")]
+            pending = any(
+                isinstance(t, dict)
+                and t.get("status") in ("PENDING", "IN_PROGRESS")
+                for t in tasks
+            )
+            if not pending:
+                return True
+        time.sleep(1)
+    return False
+
+
+def get_metrics(project_key: str, token: str) -> dict:
+    """Pull metrics from SonarQube API.
+
+    Requests a fixed set of metric keys from ``/api/measures/component``.
+
+    Args:
+        project_key: Project key sent as the ``component`` query parameter.
+        token: SonarQube token, sent as the user name of HTTP Basic auth
+            with an empty password.
+
+    Returns:
+        A dict mapping metric key to its value converted to ``float``.
+        Measures that carry no ``"value"`` field are omitted. On any
+        failure the dict is ``{"error": <message>}`` instead; this function
+        does not raise.
+    """
+    import base64
+    from urllib.parse import quote
+
+    auth = base64.b64encode(f"{token}:".encode()).decode()
+    headers = {"Authorization": f"Basic {auth}"}
+
+    metrics = [
+        "bugs", "vulnerabilities", "code_smells",
+        "cognitive_complexity", "duplicated_lines_density",
+        "ncloc", "sqale_rating", "reliability_rating",
+        "security_rating", "sqale_index",
+    ]
+    # Percent-encode the key so it cannot alter the query string.
+    url = (
+        f"{SONAR_URL}/api/measures/component"
+        f"?component={quote(project_key, safe='')}"
+        f"&metricKeys={','.join(metrics)}"
+    )
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            data = json.loads(resp.read())
+        measures = data.get("component", {}).get("measures", [])
+        # Skip measures without a value instead of failing the whole fetch.
+        return {
+            m["metric"]: float(m["value"]) for m in measures if "value" in m
+        }
+    except Exception as e:
+        # Deliberately broad: every failure is handed back to the caller as
+        # an error result rather than raised.
+        return {"error": str(e)}
+
+
+def compute_score(metrics: dict) -> float:
+    """Compute a 0-1 score from SonarQube metrics.
+
+    Starts from 100 points, subtracts the penalties below, clamps the
+    result to 0-100 and divides by 100:
+
+    - bugs: 15 points each, at most 30
+    - code smells: 3 points each, at most 20
+    - vulnerabilities: 20 points each, at most 40
+    - cognitive complexity: 5 points above 50, 15 points above 100
+    - duplicated lines density: 5 points above 5, 10 points above 10
+
+    Args:
+        metrics: Mapping of SonarQube metric key to numeric value. Metrics
+            missing from a non-empty mapping count as 0.
+
+    Returns:
+        The score in the range 0.0-1.0. 0.0 is returned when ``metrics``
+        is empty or contains an ``"error"`` key.
+    """
+    # An empty mapping means nothing was measured; that must not be scored
+    # as a flawless project.
+    if not metrics or "error" in metrics:
+        return 0.0
+
+    score = 100.0
+
+    # Bugs: -15 each, max -30
+    bugs = metrics.get("bugs", 0)
+    score -= min(bugs * 15, 30)
+
+    # Code smells: -3 each, max -20
+    smells = metrics.get("code_smells", 0)
+    score -= min(smells * 3, 20)
+
+    # Vulnerabilities: -20 each, max -40
+    vulns = metrics.get("vulnerabilities", 0)
+    score -= min(vulns * 20, 40)
+
+    # Cognitive complexity: penalty above 50
+    complexity = metrics.get("cognitive_complexity", 0)
+    if complexity > 100:
+        score -= 15
+    elif complexity > 50:
+        score -= 5
+
+    # Duplication: penalty above 5%
+    duplication = metrics.get("duplicated_lines_density", 0)
+    if duplication > 10:
+        score -= 10
+    elif duplication > 5:
+        score -= 5
+
+    return max(0.0, min(100.0, score)) / 100.0
+
+
+def _report_error(message: str) -> None:
+    """Print a failure as the JSON error document this script emits."""
+    print(json.dumps({"error": message, "score": 0}))
+
+
+def main():
+    """Scan a workspace with SonarQube and print the metrics as JSON.
+
+    Command line: ``sonarqube-scan.py [workspace_path] [project_key]``. The
+    workspace defaults to the current directory and the project key to
+    ``"tetris-eval"``.
+
+    On success a JSON object with the individual metrics, the A-E ratings
+    and the composite ``score`` is printed to stdout. Failures detected
+    here (no token, server unreachable, scan failed, analysis not finished,
+    metrics unavailable) are printed as ``{"error": ..., "score": 0}``.
+    """
+    workspace = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
+    project_key = sys.argv[2] if len(sys.argv) > 2 else "tetris-eval"
+
+    token = get_token()
+    if not token:
+        _report_error("no SonarQube token found")
+        return
+
+    # Check if SonarQube is running; the response body is not needed, the
+    # request only has to succeed.
+    try:
+        with urllib.request.urlopen(f"{SONAR_URL}/api/system/status", timeout=3):
+            pass
+    except Exception:
+        _report_error("SonarQube not running at localhost:9000")
+        return
+
+    # Run scan
+    if not scan(workspace, project_key, token):
+        _report_error("sonar-scanner failed")
+        return
+
+    # Wait for processing. Without a finished analysis the measures would be
+    # missing or left over from an earlier run, so do not score them.
+    if not wait_for_analysis(project_key, token):
+        _report_error("SonarQube analysis did not finish in time")
+        return
+
+    # Get metrics; surface a fetch failure instead of printing zeros.
+    metrics = get_metrics(project_key, token)
+    if "error" in metrics:
+        _report_error(f"failed to fetch metrics: {metrics['error']}")
+        return
+    if not metrics:
+        _report_error("SonarQube returned no measures")
+        return
+    score = compute_score(metrics)
+
+    # Rating labels (SonarQube uses 1-5 where 1=A, 5=E)
+    rating_labels = {1.0: "A", 2.0: "B", 3.0: "C", 4.0: "D", 5.0: "E"}
+
+    result = {
+        "bugs": int(metrics.get("bugs", 0)),
+        "vulnerabilities": int(metrics.get("vulnerabilities", 0)),
+        "code_smells": int(metrics.get("code_smells", 0)),
+        "cognitive_complexity": int(metrics.get("cognitive_complexity", 0)),
+        "lines_of_code": int(metrics.get("ncloc", 0)),
+        "duplication_pct": metrics.get("duplicated_lines_density", 0),
+        "tech_debt_minutes": int(metrics.get("sqale_index", 0)),
+        "maintainability": rating_labels.get(metrics.get("sqale_rating", 0), "?"),
+        "reliability": rating_labels.get(metrics.get("reliability_rating", 0), "?"),
+        "security": rating_labels.get(metrics.get("security_rating", 0), "?"),
+        "score": round(score, 2),
+    }
+
+    print(json.dumps(result, indent=2))
+
+
+if __name__ == "__main__":
+    main()

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README