loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit ee245799e717394d2faaa0060b8b1b0a24fed503
parent 7f450138909c4de86b94981aca1ebbb1a9defa82
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:21:32 +0200

Comprehensive code quality analysis (Python rewrite)

New code-analysis.py replaces the bash version with deeper checks:
- Magic number detection (multi-digit numeric literals outside a small allow-list)
- Function length analysis (average, max, count of long functions)
- Nesting depth (max indentation level)
- Global scope pollution (top-level declarations)
- Naming consistency (camelCase vs snake_case ratio)
- Error handling presence (try/catch blocks)
- Comments ratio (comment lines vs source lines)
- Separation of concerns (rendering vs logic in same files)
- HTML validation (html-validate)
- Code duplication (jscpd)

All deterministic, no LLM grading. Most metrics factor into
a 0-1 score with penalties/bonuses; global declarations, error
handling, and comment ratio are reported but not scored.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M harness/run.py | 21 ++++++++++++++++-----
A tasks/tetris/eval/code-analysis.py | 363 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 379 insertions(+), 5 deletions(-)

diff --git a/harness/run.py b/harness/run.py @@ -283,11 +283,22 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path): output = run_eval_script(quality_sh, workspace, language) results["quality"] = safe_parse_json(output) - # Code analysis (file count, LOC, unnecessary files, dependencies) - code_analysis_sh = task_dir / "eval" / "code-analysis.sh" - if code_analysis_sh.exists(): - output = run_eval_script(code_analysis_sh, workspace, language) - results["code_analysis"] = safe_parse_json(output) + # Code analysis (file count, LOC, unnecessary files, dependencies, quality metrics) + code_analysis_py = task_dir / "eval" / "code-analysis.py" + if code_analysis_py.exists(): + try: + result = subprocess.run( + ["python3", str(code_analysis_py), str(workspace), language], + capture_output=True, text=True, timeout=120, + ) + results["code_analysis"] = safe_parse_json(result.stdout.strip()) + except Exception as e: + results["code_analysis"] = {"error": str(e), "score": 0} + else: + code_analysis_sh = task_dir / "eval" / "code-analysis.sh" + if code_analysis_sh.exists(): + output = run_eval_script(code_analysis_sh, workspace, language) + results["code_analysis"] = safe_parse_json(output) # Transcript analysis (agent efficiency, wasted turns, self-testing) transcript_py = task_dir / "eval" / "transcript-analysis.py" diff --git a/tasks/tetris/eval/code-analysis.py b/tasks/tetris/eval/code-analysis.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +"""Code quality analysis for generated Tetris implementations. + +Measures code quality attributes that don't require running the game. +All checks are deterministic -- no LLM grading. 
+ +Usage: python3 code-analysis.py <workspace_path> <language> +Output: JSON to stdout +""" + +import json +import os +import re +import subprocess +import sys +from pathlib import Path + + +def run_cmd(cmd: list[str], cwd: str, timeout: int = 30) -> str: + try: + result = subprocess.run( + cmd, capture_output=True, text=True, cwd=cwd, timeout=timeout + ) + return result.stdout.strip() + except Exception: + return "" + + +def main(): + workspace = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".") + language = sys.argv[2] if len(sys.argv) > 2 else "typescript" + + os.chdir(workspace) + results: dict = {} + + # ---- File inventory ---- + all_files = list( + f for f in workspace.rglob("*") + if f.is_file() + and "node_modules" not in str(f) + and ".git" not in str(f) + ) + code_extensions = {".ts", ".tsx", ".js", ".jsx", ".html", ".css"} + doc_extensions = {".md", ".txt"} + + code_files = [f for f in all_files if f.suffix in code_extensions] + doc_files = [f for f in all_files if f.suffix in doc_extensions] + + unnecessary_patterns = [ + "README.md", "IMPLEMENTATION.md", "FEATURES.md", "QUICK_START.txt", + "CHANGELOG.md", "TODO.md", "server.js", + ] + unnecessary = [f.name for f in all_files if f.name in unnecessary_patterns] + + results["files"] = { + "total": len(all_files), + "code": len(code_files), + "docs": len(doc_files), + "unnecessary": len(unnecessary), + "unnecessary_list": unnecessary, + } + + # ---- Lines of code ---- + total_loc = 0 + for f in code_files: + try: + total_loc += len(f.read_text().splitlines()) + except Exception: + pass + results["lines_of_code"] = total_loc + + # ---- Dependencies ---- + pkg_json = workspace / "package.json" + deps = 0 + dev_deps = 0 + if pkg_json.exists(): + try: + pkg = json.loads(pkg_json.read_text()) + deps = len(pkg.get("dependencies", {})) + dev_deps = len(pkg.get("devDependencies", {})) + except Exception: + pass + results["dependencies"] = {"production": deps, "dev": dev_deps, "total": deps + dev_deps} + + # 
---- Complexity classification ---- + if len(code_files) <= 2: + complexity = "minimal" + elif len(code_files) <= 5: + complexity = "moderate" + else: + complexity = "over-engineered" + results["complexity"] = complexity + + # ---- Console.log count ---- + console_logs = 0 + for f in code_files: + if f.suffix in {".ts", ".tsx", ".js", ".jsx"}: + try: + console_logs += f.read_text().count("console.log") + except Exception: + pass + results["console_logs"] = console_logs + + # ---- Read all JS/TS source for further analysis ---- + all_source = "" + for f in code_files: + if f.suffix in {".ts", ".tsx", ".js", ".jsx"}: + try: + all_source += f.read_text() + "\n" + except Exception: + pass + + # ---- Magic numbers ---- + # Find numeric literals > 1 that aren't in common patterns + magic_re = re.compile(r"(?<!\w)(\d{2,})(?!\w)") + ok_numbers = {"10", "20", "100", "200", "255", "300", "600", "1000", "60", "30"} + magic_numbers = [] + for match in magic_re.finditer(all_source): + num = match.group(1) + if num not in ok_numbers and not num.startswith("0x"): + magic_numbers.append(num) + # Only flag if excessive + results["magic_numbers"] = { + "count": len(magic_numbers), + "excessive": len(magic_numbers) > 20, + } + + # ---- Function length ---- + # Simple heuristic: count lines between function/method declarations + func_re = re.compile(r"(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)|[a-zA-Z_]\w*)\s*=>|\w+\s*\([^)]*\)\s*\{)") + func_lengths = [] + lines = all_source.split("\n") + in_func_start = -1 + brace_depth = 0 + for i, line in enumerate(lines): + if func_re.search(line) and "{" in line: + in_func_start = i + brace_depth = 0 + if in_func_start >= 0: + brace_depth += line.count("{") - line.count("}") + if brace_depth <= 0 and i > in_func_start: + func_lengths.append(i - in_func_start + 1) + in_func_start = -1 + + avg_func_length = round(sum(func_lengths) / len(func_lengths), 1) if func_lengths else 0 + max_func_length = max(func_lengths) if 
func_lengths else 0 + results["function_length"] = { + "count": len(func_lengths), + "average": avg_func_length, + "max": max_func_length, + "long_functions": sum(1 for f in func_lengths if f > 50), + } + + # ---- Nesting depth ---- + max_nesting = 0 + for line in lines: + stripped = line.lstrip() + indent = len(line) - len(stripped) + # Approximate nesting from indentation (assuming 2-space indent) + depth = indent // 2 + max_nesting = max(max_nesting, depth) + results["max_nesting_depth"] = max_nesting + + # ---- Global scope pollution ---- + # Count var declarations at top level (indent 0) + globals_count = 0 + global_re = re.compile(r"^(?:var|let|const)\s+\w+") + for line in lines: + if not line.startswith(" ") and not line.startswith("\t") and global_re.match(line): + globals_count += 1 + results["global_declarations"] = globals_count + + # ---- Naming consistency ---- + camel_count = len(re.findall(r"\b[a-z]+[A-Z]\w*\b", all_source)) + snake_count = len(re.findall(r"\b[a-z]+_[a-z]+\w*\b", all_source)) + if camel_count > 0 and snake_count > 0: + total_names = camel_count + snake_count + dominant = "camelCase" if camel_count >= snake_count else "snake_case" + consistency = round(max(camel_count, snake_count) / total_names * 100, 1) + else: + dominant = "camelCase" if camel_count > 0 else "snake_case" if snake_count > 0 else "unknown" + consistency = 100.0 + results["naming"] = { + "dominant_style": dominant, + "consistency_pct": consistency, + "camel_case": camel_count, + "snake_case": snake_count, + } + + # ---- Error handling ---- + try_catch_count = all_source.count("try {") + all_source.count("try{") + results["error_handling"] = { + "try_catch_blocks": try_catch_count, + "has_error_handling": try_catch_count > 0, + } + + # ---- Comments ratio ---- + comment_lines = 0 + source_lines = 0 + in_block_comment = False + for line in lines: + stripped = line.strip() + if not stripped: + continue + if in_block_comment: + comment_lines += 1 + if "*/" in stripped: + 
in_block_comment = False + continue + if stripped.startswith("/*"): + comment_lines += 1 + if "*/" not in stripped: + in_block_comment = True + continue + if stripped.startswith("//"): + comment_lines += 1 + continue + source_lines += 1 + + comment_ratio = round(comment_lines / max(source_lines, 1) * 100, 1) + results["comments"] = { + "comment_lines": comment_lines, + "source_lines": source_lines, + "ratio_pct": comment_ratio, + } + + # ---- Separation of concerns ---- + # Check if rendering code and game logic are in the same functions + render_keywords = re.compile( + r"\b(canvas|getContext|fillRect|clearRect|strokeRect|drawImage|render|draw|paint|ctx\.|requestAnimationFrame)\b" + ) + logic_keywords = re.compile( + r"\b(rotate|collide|collision|clearLine|clearRow|gameOver|game_over|score|level|dropPiece|lockPiece|spawnPiece|checkLine)\b", + re.IGNORECASE, + ) + + # Check per-file separation + files_with_render = 0 + files_with_logic = 0 + files_with_both = 0 + for f in code_files: + if f.suffix not in {".ts", ".tsx", ".js", ".jsx"}: + continue + try: + content = f.read_text() + except Exception: + continue + has_render = bool(render_keywords.search(content)) + has_logic = bool(logic_keywords.search(content)) + if has_render: + files_with_render += 1 + if has_logic: + files_with_logic += 1 + if has_render and has_logic: + files_with_both += 1 + + if len(code_files) <= 1: + separation = "single-file" + elif files_with_both == 0 and files_with_render > 0 and files_with_logic > 0: + separation = "separated" + elif files_with_both > 0: + separation = "mixed" + else: + separation = "unclear" + + results["separation_of_concerns"] = { + "verdict": separation, + "files_with_rendering": files_with_render, + "files_with_logic": files_with_logic, + "files_with_both": files_with_both, + } + + # ---- HTML validation ---- + html_files = [f for f in code_files if f.suffix == ".html"] + html_valid = "no_html" + html_errors = 0 + if html_files: + run_cmd(["npm", "install", 
"--save-dev", "html-validate"], str(workspace)) + for hf in html_files[:1]: # Just check the main HTML + output = run_cmd( + ["npx", "html-validate", "--formatter", "json", str(hf)], + str(workspace), + ) + if output: + try: + data = json.loads(output) + html_errors = sum(r.get("errorCount", 0) for r in data) + html_valid = "true" if html_errors == 0 else "false" + except json.JSONDecodeError: + pass + results["html_validation"] = {"valid": html_valid == "true", "errors": html_errors} + + # ---- Code duplication ---- + duplication_pct = 0.0 + run_cmd(["npm", "install", "--save-dev", "jscpd"], str(workspace)) + output = run_cmd( + ["npx", "jscpd", "--min-lines", "5", "--min-tokens", "50", + "--reporters", "json", "--ignore", "node_modules,package-lock.json", "."], + str(workspace), timeout=60, + ) + if output: + try: + data = json.loads(output) + duplication_pct = data.get("statistics", {}).get("total", {}).get("percentage", 0) + except json.JSONDecodeError: + pass + results["duplication_percentage"] = round(duplication_pct, 2) + + # ---- Compute score ---- + score = 100 + + # Unnecessary files (-10 each, max -30) + score -= min(len(unnecessary) * 10, 30) + + # Too many dependencies (-5 each over 2, max -20) + if deps > 2: + score -= min((deps - 2) * 5, 20) + + # Console.logs (-2 each, max -10) + score -= min(console_logs * 2, 10) + + # Over-engineering (-10) + if complexity == "over-engineered": + score -= 10 + + # Invalid HTML (-5) + if html_valid == "false": + score -= 5 + + # High duplication (-10 over 10%, -5 over 5%) + if duplication_pct > 10: + score -= 10 + elif duplication_pct > 5: + score -= 5 + + # Long functions (-5 if any > 100 lines) + if max_func_length > 100: + score -= 5 + + # Deep nesting (-5 if > 8 levels) + if max_nesting > 8: + score -= 5 + + # Inconsistent naming (-5 if < 80% consistency) + if consistency < 80: + score -= 5 + + # Excessive magic numbers (-5) + if len(magic_numbers) > 20: + score -= 5 + + # Bonus for separated concerns (+5) + if 
separation == "separated": + score += 5 + + score = max(0, min(100, score)) + results["score"] = round(score / 100, 2) + + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + main()

Impressum · Datenschutz