Comprehensive code quality analysis (Python rewrite) - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit ee245799e717394d2faaa0060b8b1b0a24fed503
parent 7f450138909c4de86b94981aca1ebbb1a9defa82
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:21:32 +0200

Comprehensive code quality analysis (Python rewrite)

New code-analysis.py replaces the bash version with deeper checks:
- Magic numbers detection (multi-digit numeric literals outside a small allowlist of common values)
- Function length analysis (average, max, count of long functions)
- Nesting depth (max indentation level)
- Global scope pollution (top-level declarations)
- Naming consistency (camelCase vs snake_case ratio)
- Error handling presence (try/catch blocks)
- Comments ratio (comment lines vs source lines)
- Separation of concerns (rendering vs logic in same files)
- HTML validation (html-validate)
- Code duplication (jscpd)

All deterministic, no LLM grading. Most metrics factor into
a 0-1 score with penalties/bonuses; lines of code, comments ratio,
error handling and global declarations are reported but not scored.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py  | 21 ++++++++++++++++-----
A tasks/tetris/eval/code-analysis.py  | 363 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 379 insertions(+), 5 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -283,11 +283,22 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
         output = run_eval_script(quality_sh, workspace, language)
         results["quality"] = safe_parse_json(output)
 
-    # Code analysis (file count, LOC, unnecessary files, dependencies)
-    code_analysis_sh = task_dir / "eval" / "code-analysis.sh"
-    if code_analysis_sh.exists():
-        output = run_eval_script(code_analysis_sh, workspace, language)
-        results["code_analysis"] = safe_parse_json(output)
+    # Code analysis (file count, LOC, unnecessary files, dependencies, quality metrics)
+    code_analysis_py = task_dir / "eval" / "code-analysis.py"
+    if code_analysis_py.exists():
+        try:
+            result = subprocess.run(
+                ["python3", str(code_analysis_py), str(workspace), language],
+                capture_output=True, text=True, timeout=120,
+            )
+            results["code_analysis"] = safe_parse_json(result.stdout.strip())
+        except Exception as e:
+            results["code_analysis"] = {"error": str(e), "score": 0}
+    else:
+        code_analysis_sh = task_dir / "eval" / "code-analysis.sh"
+        if code_analysis_sh.exists():
+            output = run_eval_script(code_analysis_sh, workspace, language)
+            results["code_analysis"] = safe_parse_json(output)
 
     # Transcript analysis (agent efficiency, wasted turns, self-testing)
     transcript_py = task_dir / "eval" / "transcript-analysis.py"
diff --git a/tasks/tetris/eval/code-analysis.py b/tasks/tetris/eval/code-analysis.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python3
+"""Code quality analysis for generated Tetris implementations.
+
+Measures code quality attributes that don't require running the game.
+All checks are deterministic -- no LLM grading.
+
+Usage: python3 code-analysis.py <workspace_path> <language>
+Output: JSON to stdout
+"""
+
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run_cmd(cmd: list[str], cwd: str, timeout: int = 30) -> str:
+    """Run *cmd* inside *cwd* and hand back its stdout with outer whitespace removed.
+
+    This is a best-effort helper: the exit status is never inspected, and any
+    exception raised while launching or waiting for the process (missing
+    binary, timeout, ...) is swallowed and reported as an empty string.
+    """
+    try:
+        completed = subprocess.run(
+            cmd,
+            cwd=cwd,
+            timeout=timeout,
+            text=True,
+            capture_output=True,
+        )
+    except Exception:
+        return ""
+    return completed.stdout.strip()
+
+
+# Suffixes treated as JavaScript/TypeScript source for the text-based heuristics.
+_SCRIPT_SUFFIXES = frozenset({".ts", ".tsx", ".js", ".jsx"})
+# Directory names whose contents are never part of the analysed workspace.
+_SKIPPED_DIRS = frozenset({"node_modules", ".git"})
+
+
+def _read_source(path: Path):
+    """Return the text of *path*, or None when the file cannot be read.
+
+    Decoding is pinned to UTF-8 with replacement characters so the result does
+    not depend on the machine's locale and a stray byte does not drop the file.
+    """
+    try:
+        return path.read_text(encoding="utf-8", errors="replace")
+    except OSError:
+        return None
+
+
+def _function_lengths(lines: list[str]) -> list[int]:
+    """Estimate function lengths (in lines) by tracking braces after a declaration.
+
+    Heuristic only: a declaration line must contain "{"; a new declaration seen
+    while another is still open restarts the measurement at the new line.
+    """
+    func_re = re.compile(r"(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)|[a-zA-Z_]\w*)\s*=>|\w+\s*\([^)]*\)\s*\{)")
+    lengths: list[int] = []
+    start = -1
+    depth = 0
+    for i, line in enumerate(lines):
+        if func_re.search(line) and "{" in line:
+            start = i
+            depth = 0
+        if start >= 0:
+            depth += line.count("{") - line.count("}")
+            if depth <= 0 and i > start:
+                lengths.append(i - start + 1)
+                start = -1
+    return lengths
+
+
+def _comment_stats(lines: list[str]) -> tuple[int, int]:
+    """Return (comment_lines, source_lines); blank lines are counted as neither.
+
+    Only lines that *start* with "//" or "/*" (after stripping) or that lie
+    inside an open block comment count as comments.
+    """
+    comment_lines = 0
+    source_lines = 0
+    in_block_comment = False
+    for line in lines:
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if in_block_comment:
+            comment_lines += 1
+            if "*/" in stripped:
+                in_block_comment = False
+        elif stripped.startswith("/*"):
+            comment_lines += 1
+            if "*/" not in stripped:
+                in_block_comment = True
+        elif stripped.startswith("//"):
+            comment_lines += 1
+        else:
+            source_lines += 1
+    return comment_lines, source_lines
+
+
+def _duplication_percentage(workspace: Path) -> float:
+    """Run jscpd over *workspace* and return the reported duplication percentage.
+
+    jscpd's json reporter writes ``jscpd-report.json`` into its output
+    directory, so the report is read from a temporary directory outside the
+    workspace. Stdout is still tried as a fallback. Returns 0.0 when jscpd is
+    unavailable or its output cannot be parsed.
+    """
+    import tempfile
+
+    run_cmd(["npm", "install", "--save-dev", "jscpd"], str(workspace))
+    with tempfile.TemporaryDirectory() as report_dir:
+        stdout = run_cmd(
+            ["npx", "jscpd", "--min-lines", "5", "--min-tokens", "50",
+             "--reporters", "json", "--output", report_dir,
+             "--ignore", "node_modules,package-lock.json", "."],
+            str(workspace), timeout=60,
+        )
+        report_text = _read_source(Path(report_dir) / "jscpd-report.json")
+
+    for payload in (report_text, stdout):
+        if not payload:
+            continue
+        try:
+            data = json.loads(payload)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(data, dict):
+            continue
+        try:
+            pct = data.get("statistics", {}).get("total", {}).get("percentage", 0)
+        except AttributeError:
+            # Unexpected report shape: try the next payload
+            continue
+        if isinstance(pct, (int, float)):
+            return float(pct)
+    return 0.0
+
+
+def main():
+    """Analyse a generated Tetris workspace and print a JSON quality report.
+
+    The workspace path is read from ``sys.argv[1]`` (default: current
+    directory). ``sys.argv[2]`` (language) is accepted for command-line
+    compatibility but is not used by any check.
+
+    Side effects: changes the working directory to the workspace, may run
+    ``npm install`` / ``npx`` there (html-validate when an HTML file exists,
+    jscpd when any code file exists), and prints the report to stdout.
+
+    The report's ``score`` is in [0, 1]: it starts at 1.0 and has the penalties
+    and the single bonus listed in the scoring section applied.
+    """
+    # Resolve before chdir: a relative path would otherwise stop pointing at
+    # the workspace once the working directory has changed.
+    workspace = (Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")).resolve()
+
+    os.chdir(workspace)
+    results: dict = {}
+
+    # ---- File inventory ----
+    # Exclusion is by path component relative to the workspace, so files such
+    # as .gitignore or .github/* and parent directory names are not affected.
+    # Sorted so that "first HTML file" and list outputs are reproducible.
+    all_files = sorted(
+        f for f in workspace.rglob("*")
+        if f.is_file()
+        and _SKIPPED_DIRS.isdisjoint(f.relative_to(workspace).parts)
+    )
+    code_extensions = {".ts", ".tsx", ".js", ".jsx", ".html", ".css"}
+    doc_extensions = {".md", ".txt"}
+
+    code_files = [f for f in all_files if f.suffix in code_extensions]
+    doc_files = [f for f in all_files if f.suffix in doc_extensions]
+
+    unnecessary_patterns = {
+        "README.md", "IMPLEMENTATION.md", "FEATURES.md", "QUICK_START.txt",
+        "CHANGELOG.md", "TODO.md", "server.js",
+    }
+    unnecessary = [f.name for f in all_files if f.name in unnecessary_patterns]
+
+    results["files"] = {
+        "total": len(all_files),
+        "code": len(code_files),
+        "docs": len(doc_files),
+        "unnecessary": len(unnecessary),
+        "unnecessary_list": unnecessary,
+    }
+
+    # ---- Read every code file once; unreadable files are skipped ----
+    texts = {}
+    for f in code_files:
+        text = _read_source(f)
+        if text is not None:
+            texts[f] = text
+
+    # ---- Lines of code ----
+    results["lines_of_code"] = sum(len(t.splitlines()) for t in texts.values())
+
+    # ---- Dependencies ----
+    pkg_json = workspace / "package.json"
+    deps = 0
+    dev_deps = 0
+    if pkg_json.exists():
+        try:
+            pkg = json.loads(pkg_json.read_text(encoding="utf-8", errors="replace"))
+            deps = len(pkg.get("dependencies", {}))
+            dev_deps = len(pkg.get("devDependencies", {}))
+        except (OSError, ValueError, AttributeError, TypeError):
+            # Unreadable or malformed package.json: keep whatever was counted so far
+            pass
+    results["dependencies"] = {"production": deps, "dev": dev_deps, "total": deps + dev_deps}
+
+    # ---- Complexity classification (by number of code files) ----
+    if len(code_files) <= 2:
+        complexity = "minimal"
+    elif len(code_files) <= 5:
+        complexity = "moderate"
+    else:
+        complexity = "over-engineered"
+    results["complexity"] = complexity
+
+    # ---- JS/TS sources used by the text heuristics below ----
+    script_texts = [
+        texts[f] for f in code_files
+        if f.suffix in _SCRIPT_SUFFIXES and f in texts
+    ]
+    all_source = "".join(text + "\n" for text in script_texts)
+    lines = all_source.split("\n")
+
+    # ---- Console.log count ----
+    console_logs = sum(text.count("console.log") for text in script_texts)
+    results["console_logs"] = console_logs
+
+    # ---- Magic numbers ----
+    # Standalone literals of two or more digits that are not in the allowlist.
+    magic_re = re.compile(r"(?<!\w)(\d{2,})(?!\w)")
+    ok_numbers = {"10", "20", "100", "200", "255", "300", "600", "1000", "60", "30"}
+    magic_numbers = [num for num in magic_re.findall(all_source) if num not in ok_numbers]
+    # Only flag if excessive
+    results["magic_numbers"] = {
+        "count": len(magic_numbers),
+        "excessive": len(magic_numbers) > 20,
+    }
+
+    # ---- Function length ----
+    func_lengths = _function_lengths(lines)
+    avg_func_length = round(sum(func_lengths) / len(func_lengths), 1) if func_lengths else 0
+    max_func_length = max(func_lengths) if func_lengths else 0
+    results["function_length"] = {
+        "count": len(func_lengths),
+        "average": avg_func_length,
+        "max": max_func_length,
+        "long_functions": sum(1 for length in func_lengths if length > 50),
+    }
+
+    # ---- Nesting depth ----
+    # Approximated from indentation, assuming 2 columns per level; a tab counts
+    # as one level. Whitespace-only lines carry no nesting information.
+    max_nesting = 0
+    for line in lines:
+        if not line.strip():
+            continue
+        expanded = line.expandtabs(2)
+        indent = len(expanded) - len(expanded.lstrip())
+        max_nesting = max(max_nesting, indent // 2)
+    results["max_nesting_depth"] = max_nesting
+
+    # ---- Global scope pollution ----
+    # Count var/let/const declarations that start at column 0
+    global_re = re.compile(r"^(?:var|let|const)\s+\w+")
+    globals_count = sum(1 for line in lines if global_re.match(line))
+    results["global_declarations"] = globals_count
+
+    # ---- Naming consistency ----
+    camel_count = len(re.findall(r"\b[a-z]+[A-Z]\w*\b", all_source))
+    snake_count = len(re.findall(r"\b[a-z]+_[a-z]+\w*\b", all_source))
+    if camel_count > 0 and snake_count > 0:
+        total_names = camel_count + snake_count
+        dominant = "camelCase" if camel_count >= snake_count else "snake_case"
+        consistency = round(max(camel_count, snake_count) / total_names * 100, 1)
+    else:
+        dominant = "camelCase" if camel_count > 0 else "snake_case" if snake_count > 0 else "unknown"
+        consistency = 100.0
+    results["naming"] = {
+        "dominant_style": dominant,
+        "consistency_pct": consistency,
+        "camel_case": camel_count,
+        "snake_case": snake_count,
+    }
+
+    # ---- Error handling ----
+    try_catch_count = all_source.count("try {") + all_source.count("try{")
+    results["error_handling"] = {
+        "try_catch_blocks": try_catch_count,
+        "has_error_handling": try_catch_count > 0,
+    }
+
+    # ---- Comments ratio ----
+    comment_lines, source_lines = _comment_stats(lines)
+    comment_ratio = round(comment_lines / max(source_lines, 1) * 100, 1)
+    results["comments"] = {
+        "comment_lines": comment_lines,
+        "source_lines": source_lines,
+        "ratio_pct": comment_ratio,
+    }
+
+    # ---- Separation of concerns ----
+    # Per-file check: does a script mention rendering keywords, logic keywords, or both?
+    render_keywords = re.compile(
+        r"\b(canvas|getContext|fillRect|clearRect|strokeRect|drawImage|render|draw|paint|ctx\.|requestAnimationFrame)\b"
+    )
+    logic_keywords = re.compile(
+        r"\b(rotate|collide|collision|clearLine|clearRow|gameOver|game_over|score|level|dropPiece|lockPiece|spawnPiece|checkLine)\b",
+        re.IGNORECASE,
+    )
+
+    files_with_render = 0
+    files_with_logic = 0
+    files_with_both = 0
+    for content in script_texts:
+        has_render = bool(render_keywords.search(content))
+        has_logic = bool(logic_keywords.search(content))
+        files_with_render += has_render
+        files_with_logic += has_logic
+        files_with_both += has_render and has_logic
+
+    if len(code_files) <= 1:
+        separation = "single-file"
+    elif files_with_both == 0 and files_with_render > 0 and files_with_logic > 0:
+        separation = "separated"
+    elif files_with_both > 0:
+        separation = "mixed"
+    else:
+        separation = "unclear"
+
+    results["separation_of_concerns"] = {
+        "verdict": separation,
+        "files_with_rendering": files_with_render,
+        "files_with_logic": files_with_logic,
+        "files_with_both": files_with_both,
+    }
+
+    # ---- HTML validation ----
+    html_files = [f for f in code_files if f.suffix == ".html"]
+    html_valid = "no_html"
+    html_errors = 0
+    if html_files:
+        run_cmd(["npm", "install", "--save-dev", "html-validate"], str(workspace))
+        # Only the first HTML file (in sorted path order) is validated
+        output = run_cmd(
+            ["npx", "html-validate", "--formatter", "json", str(html_files[0])],
+            str(workspace),
+        )
+        if output:
+            try:
+                data = json.loads(output)
+                html_errors = sum(r.get("errorCount", 0) for r in data)
+                html_valid = "true" if html_errors == 0 else "false"
+            except (json.JSONDecodeError, AttributeError, TypeError):
+                # Unparseable validator output: treat the HTML as not validated
+                pass
+    results["html_validation"] = {"valid": html_valid == "true", "errors": html_errors}
+
+    # ---- Code duplication ----
+    # Nothing to scan (and no reason to install jscpd) without code files
+    duplication_pct = _duplication_percentage(workspace) if code_files else 0.0
+    results["duplication_percentage"] = round(duplication_pct, 2)
+
+    # ---- Compute score ----
+    score = 100
+
+    # Unnecessary files (-10 each, max -30)
+    score -= min(len(unnecessary) * 10, 30)
+
+    # Too many dependencies (-5 each over 2, max -20)
+    if deps > 2:
+        score -= min((deps - 2) * 5, 20)
+
+    # Console.logs (-2 each, max -10)
+    score -= min(console_logs * 2, 10)
+
+    # Over-engineering (-10)
+    if complexity == "over-engineered":
+        score -= 10
+
+    # Invalid HTML (-5)
+    if html_valid == "false":
+        score -= 5
+
+    # High duplication (-10 over 10%, -5 over 5%)
+    if duplication_pct > 10:
+        score -= 10
+    elif duplication_pct > 5:
+        score -= 5
+
+    # Long functions (-5 if any > 100 lines)
+    if max_func_length > 100:
+        score -= 5
+
+    # Deep nesting (-5 if > 8 levels)
+    if max_nesting > 8:
+        score -= 5
+
+    # Inconsistent naming (-5 if < 80% consistency)
+    if consistency < 80:
+        score -= 5
+
+    # Excessive magic numbers (-5)
+    if len(magic_numbers) > 20:
+        score -= 5
+
+    # Bonus for separated concerns (+5)
+    if separation == "separated":
+        score += 5
+
+    score = max(0, min(100, score))
+    results["score"] = round(score / 100, 2)
+
+    print(json.dumps(results, indent=2))
+
+# Run the analysis only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	harness/run.py	\|	21	++++++++++++++++-----
A	tasks/tetris/eval/code-analysis.py	\|	363	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++