commit ee245799e717394d2faaa0060b8b1b0a24fed503
parent 7f450138909c4de86b94981aca1ebbb1a9defa82
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 08:21:32 +0200
Comprehensive code quality analysis (Python rewrite)
New code-analysis.py replaces the bash version with deeper checks:
- Magic number detection (multi-digit numeric literals outside a small allow-list of common values)
- Function length analysis (average, max, count of long functions)
- Nesting depth (max indentation level)
- Global scope pollution (top-level declarations)
- Naming consistency (camelCase vs snake_case ratio)
- Error handling presence (try/catch blocks)
- Comments ratio (comment lines vs source lines)
- Separation of concerns (rendering vs logic in same files)
- HTML validation (html-validate)
- Code duplication (jscpd)
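All checks are deterministic, with no LLM grading. Most metrics feed a
0-1 score through penalties and a bonus; comment ratio, error handling,
and global declarations are reported but not scored.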
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 379 insertions(+), 5 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -283,11 +283,22 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
output = run_eval_script(quality_sh, workspace, language)
results["quality"] = safe_parse_json(output)
- # Code analysis (file count, LOC, unnecessary files, dependencies)
- code_analysis_sh = task_dir / "eval" / "code-analysis.sh"
- if code_analysis_sh.exists():
- output = run_eval_script(code_analysis_sh, workspace, language)
- results["code_analysis"] = safe_parse_json(output)
+ # Code analysis (file count, LOC, unnecessary files, dependencies, quality metrics)
+ code_analysis_py = task_dir / "eval" / "code-analysis.py"
+ if code_analysis_py.exists():
+ try:
+ result = subprocess.run(
+ ["python3", str(code_analysis_py), str(workspace), language],
+ capture_output=True, text=True, timeout=120,
+ )
+ results["code_analysis"] = safe_parse_json(result.stdout.strip())
+ except Exception as e:
+ results["code_analysis"] = {"error": str(e), "score": 0}
+ else:
+ code_analysis_sh = task_dir / "eval" / "code-analysis.sh"
+ if code_analysis_sh.exists():
+ output = run_eval_script(code_analysis_sh, workspace, language)
+ results["code_analysis"] = safe_parse_json(output)
# Transcript analysis (agent efficiency, wasted turns, self-testing)
transcript_py = task_dir / "eval" / "transcript-analysis.py"
diff --git a/tasks/tetris/eval/code-analysis.py b/tasks/tetris/eval/code-analysis.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python3
+"""Code quality analysis for generated Tetris implementations.
+
+Measures code quality attributes that don't require running the game.
+All checks are deterministic -- no LLM grading.
+
+Usage: python3 code-analysis.py <workspace_path> <language>
+Output: JSON to stdout
+"""
+
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run_cmd(cmd: list[str], cwd: str, timeout: int = 30) -> str:
+    """Run *cmd* inside *cwd* and return its stdout with surrounding whitespace removed.
+
+    This is a best-effort helper: any failure (missing executable, timeout,
+    undecodable output, ...) yields an empty string instead of an exception.
+    The exit status of the command is not inspected.
+    """
+    try:
+        completed = subprocess.run(
+            cmd, cwd=cwd, timeout=timeout, text=True, capture_output=True
+        )
+    except Exception:
+        return ""
+    return completed.stdout.strip()
+
+
+# Directories whose contents are tooling/VCS state, not part of the submission.
+_IGNORED_DIRS = {"node_modules", ".git"}
+_SCRIPT_SUFFIXES = {".ts", ".tsx", ".js", ".jsx"}
+
+# Heuristic matcher for a function/method declaration on a single line.
+# The last alternative (`name(...) {`) also fits control statements such as
+# `if (x) {`, so those keywords are excluded explicitly.
+_FUNC_RE = re.compile(
+    r"function\s+\w+"
+    r"|(?:const|let|var)\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)|[a-zA-Z_]\w*)\s*=>"
+    r"|\b(?!(?:if|for|while|switch|catch|with)\b)\w+\s*\([^)]*\)\s*\{"
+)
+
+
+def _read_sources(files: list[Path]) -> dict[Path, str]:
+    """Read every file once and map path -> text.
+
+    Files are decoded as UTF-8 with undecodable bytes replaced, so the result
+    does not depend on the locale. Files that cannot be opened are left out.
+    """
+    texts = {}
+    for path in files:
+        try:
+            texts[path] = path.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            continue
+    return texts
+
+
+def _function_lengths(lines: list[str]) -> list[int]:
+    """Return the length in lines of every function found in *lines*.
+
+    Heuristic: a declaration line containing ``{`` opens a function, which
+    ends on the line where the running brace depth falls back to the depth
+    at which it was opened. A stack keeps enclosing functions open while
+    nested ones are measured. Braces inside strings or comments are not
+    recognised and can skew the result; functions that never close are not
+    reported.
+    """
+    lengths = []
+    open_funcs = []  # (declaration line index, brace depth before that line)
+    depth = 0
+    for index, line in enumerate(lines):
+        if "{" in line and _FUNC_RE.search(line):
+            open_funcs.append((index, depth))
+        # Clamp at zero so a stray "}" cannot push the depth negative.
+        depth = max(depth + line.count("{") - line.count("}"), 0)
+        while open_funcs and depth <= open_funcs[-1][1]:
+            start, _ = open_funcs.pop()
+            lengths.append(index - start + 1)
+    return lengths
+
+
+def _comment_stats(lines: list[str]) -> tuple[int, int]:
+    """Return ``(comment_lines, source_lines)``, ignoring blank lines.
+
+    A line counts as a comment when it starts with ``//`` or ``/*`` or lies
+    inside an open ``/* ... */`` block; trailing comments on code lines
+    count as source.
+    """
+    comment_lines = 0
+    source_lines = 0
+    in_block = False
+    for raw in lines:
+        text = raw.strip()
+        if not text:
+            continue
+        if in_block:
+            comment_lines += 1
+            in_block = "*/" not in text
+        elif text.startswith("/*"):
+            comment_lines += 1
+            in_block = "*/" not in text
+        elif text.startswith("//"):
+            comment_lines += 1
+        else:
+            source_lines += 1
+    return comment_lines, source_lines
+
+
+def _validate_html(html_files: list[Path], workspace: Path) -> tuple[str, int]:
+    """Validate the first HTML file with html-validate.
+
+    Returns ``(status, error_count)`` where status is ``"true"``, ``"false"``
+    or ``"no_html"``. The latter is also returned when the validator could
+    not be run or produced unparsable output.
+    """
+    if not html_files:
+        return "no_html", 0
+    # NOTE: this installs into the analysed workspace and edits its package.json;
+    # the file inventory and dependency counts are taken before this point.
+    run_cmd(["npm", "install", "--save-dev", "html-validate"], str(workspace))
+    output = run_cmd(
+        ["npx", "html-validate", "--formatter", "json", str(html_files[0])],
+        str(workspace),
+    )
+    try:
+        report = json.loads(output)
+        errors = sum(entry.get("errorCount", 0) for entry in report)
+    except (ValueError, TypeError, AttributeError):
+        return "no_html", 0
+    return ("true" if errors == 0 else "false"), errors
+
+
+def _duplication_percentage(workspace: Path) -> float:
+    """Return jscpd's overall duplication percentage, or 0.0 when unavailable.
+
+    jscpd's ``json`` reporter writes ``jscpd-report.json`` into its output
+    directory rather than printing it, so the report is directed to a
+    temporary directory and read back from there.
+    """
+    import tempfile  # only needed by this helper
+
+    run_cmd(["npm", "install", "--save-dev", "jscpd"], str(workspace))
+    with tempfile.TemporaryDirectory() as report_dir:
+        stdout = run_cmd(
+            ["npx", "jscpd", "--min-lines", "5", "--min-tokens", "50",
+             "--reporters", "json", "--output", report_dir,
+             "--ignore", "node_modules,package-lock.json", "."],
+            str(workspace), timeout=60,
+        )
+        try:
+            raw = (Path(report_dir) / "jscpd-report.json").read_text(encoding="utf-8")
+        except OSError:
+            # Fall back to stdout in case a jscpd version prints the report there.
+            raw = stdout
+    try:
+        percentage = json.loads(raw)["statistics"]["total"]["percentage"]
+    except (ValueError, KeyError, TypeError):
+        return 0.0
+    return float(percentage) if isinstance(percentage, (int, float)) else 0.0
+
+
+def main():
+    """Analyse the workspace named on the command line and print JSON metrics.
+
+    ``sys.argv[1]`` is the workspace directory (default: current directory).
+    ``sys.argv[2]`` (language) is accepted for CLI compatibility but is not
+    used by any check. The collected metrics, including a final ``score``
+    between 0 and 1, are printed to stdout as indented JSON.
+    """
+    # Resolve before chdir: a relative path would otherwise be interpreted a
+    # second time, relative to the new working directory.
+    workspace = Path(sys.argv[1] if len(sys.argv) > 1 else ".").resolve()
+    os.chdir(workspace)
+    results: dict = {}
+
+    # ---- File inventory ----
+    # Exclusion is by path component, so ".gitignore" or ".github/" are kept;
+    # sorting makes the inventory independent of filesystem iteration order.
+    all_files = sorted(
+        f for f in workspace.rglob("*")
+        if f.is_file()
+        and not _IGNORED_DIRS.intersection(f.relative_to(workspace).parts)
+    )
+    code_extensions = {".ts", ".tsx", ".js", ".jsx", ".html", ".css"}
+    doc_extensions = {".md", ".txt"}
+
+    code_files = [f for f in all_files if f.suffix in code_extensions]
+    doc_files = [f for f in all_files if f.suffix in doc_extensions]
+
+    unnecessary_names = {
+        "README.md", "IMPLEMENTATION.md", "FEATURES.md", "QUICK_START.txt",
+        "CHANGELOG.md", "TODO.md", "server.js",
+    }
+    unnecessary = [f.name for f in all_files if f.name in unnecessary_names]
+
+    results["files"] = {
+        "total": len(all_files),
+        "code": len(code_files),
+        "docs": len(doc_files),
+        "unnecessary": len(unnecessary),
+        "unnecessary_list": unnecessary,
+    }
+
+    # Every code file is read exactly once and reused by the checks below.
+    texts = _read_sources(code_files)
+    script_texts = {
+        path: text for path, text in texts.items()
+        if path.suffix in _SCRIPT_SUFFIXES
+    }
+
+    # ---- Lines of code ----
+    results["lines_of_code"] = sum(len(text.splitlines()) for text in texts.values())
+
+    # ---- Dependencies ----
+    deps = 0
+    dev_deps = 0
+    try:
+        pkg = json.loads((workspace / "package.json").read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        pkg = None  # missing, unreadable or malformed package.json
+    if isinstance(pkg, dict):
+        prod_section = pkg.get("dependencies")
+        dev_section = pkg.get("devDependencies")
+        deps = len(prod_section) if isinstance(prod_section, dict) else 0
+        dev_deps = len(dev_section) if isinstance(dev_section, dict) else 0
+    results["dependencies"] = {"production": deps, "dev": dev_deps, "total": deps + dev_deps}
+
+    # ---- Complexity classification ----
+    if len(code_files) <= 2:
+        complexity = "minimal"
+    elif len(code_files) <= 5:
+        complexity = "moderate"
+    else:
+        complexity = "over-engineered"
+    results["complexity"] = complexity
+
+    # ---- Console.log count ----
+    console_logs = sum(text.count("console.log") for text in script_texts.values())
+    results["console_logs"] = console_logs
+
+    # ---- Combined JS/TS source for the line-based heuristics ----
+    all_source = "".join(text + "\n" for text in script_texts.values())
+    lines = all_source.split("\n")
+
+    # ---- Magic numbers ----
+    # Multi-digit literals that are not on the allow-list of common values.
+    magic_re = re.compile(r"(?<!\w)(\d{2,})(?!\w)")
+    ok_numbers = {"10", "20", "100", "200", "255", "300", "600", "1000", "60", "30"}
+    magic_numbers = [num for num in magic_re.findall(all_source) if num not in ok_numbers]
+    excessive_magic = len(magic_numbers) > 20
+    results["magic_numbers"] = {
+        "count": len(magic_numbers),
+        "excessive": excessive_magic,
+    }
+
+    # ---- Function length ----
+    func_lengths = _function_lengths(lines)
+    avg_func_length = round(sum(func_lengths) / len(func_lengths), 1) if func_lengths else 0
+    max_func_length = max(func_lengths, default=0)
+    results["function_length"] = {
+        "count": len(func_lengths),
+        "average": avg_func_length,
+        "max": max_func_length,
+        "long_functions": sum(1 for length in func_lengths if length > 50),
+    }
+
+    # ---- Nesting depth ----
+    # Approximated from indentation, assuming two columns per level; a tab
+    # counts as one level and whitespace-only lines are ignored.
+    max_nesting = max(
+        (
+            (len(expanded) - len(expanded.lstrip())) // 2
+            for expanded in (line.expandtabs(2) for line in lines if line.strip())
+        ),
+        default=0,
+    )
+    results["max_nesting_depth"] = max_nesting
+
+    # ---- Global scope pollution ----
+    # Declarations that start in column 0 are treated as top-level.
+    global_re = re.compile(r"^(?:var|let|const)\s+\w+")
+    results["global_declarations"] = sum(1 for line in lines if global_re.match(line))
+
+    # ---- Naming consistency ----
+    camel_count = len(re.findall(r"\b[a-z]+[A-Z]\w*\b", all_source))
+    snake_count = len(re.findall(r"\b[a-z]+_[a-z]+\w*\b", all_source))
+    if camel_count > 0 and snake_count > 0:
+        total_names = camel_count + snake_count
+        dominant = "camelCase" if camel_count >= snake_count else "snake_case"
+        consistency = round(max(camel_count, snake_count) / total_names * 100, 1)
+    else:
+        dominant = "camelCase" if camel_count > 0 else "snake_case" if snake_count > 0 else "unknown"
+        consistency = 100.0
+    results["naming"] = {
+        "dominant_style": dominant,
+        "consistency_pct": consistency,
+        "camel_case": camel_count,
+        "snake_case": snake_count,
+    }
+
+    # ---- Error handling ----
+    try_catch_count = all_source.count("try {") + all_source.count("try{")
+    results["error_handling"] = {
+        "try_catch_blocks": try_catch_count,
+        "has_error_handling": try_catch_count > 0,
+    }
+
+    # ---- Comments ratio ----
+    comment_lines, source_lines = _comment_stats(lines)
+    results["comments"] = {
+        "comment_lines": comment_lines,
+        "source_lines": source_lines,
+        "ratio_pct": round(comment_lines / max(source_lines, 1) * 100, 1),
+    }
+
+    # ---- Separation of concerns ----
+    # Per file: does it mention rendering APIs, game-logic terms, or both?
+    render_keywords = re.compile(
+        r"\b(canvas|getContext|fillRect|clearRect|strokeRect|drawImage|render|draw|paint|ctx\.|requestAnimationFrame)\b"
+    )
+    logic_keywords = re.compile(
+        r"\b(rotate|collide|collision|clearLine|clearRow|gameOver|game_over|score|level|dropPiece|lockPiece|spawnPiece|checkLine)\b",
+        re.IGNORECASE,
+    )
+    files_with_render = 0
+    files_with_logic = 0
+    files_with_both = 0
+    for content in script_texts.values():
+        has_render = bool(render_keywords.search(content))
+        has_logic = bool(logic_keywords.search(content))
+        files_with_render += has_render
+        files_with_logic += has_logic
+        files_with_both += has_render and has_logic
+
+    if len(code_files) <= 1:
+        separation = "single-file"
+    elif files_with_both == 0 and files_with_render > 0 and files_with_logic > 0:
+        separation = "separated"
+    elif files_with_both > 0:
+        separation = "mixed"
+    else:
+        separation = "unclear"
+
+    results["separation_of_concerns"] = {
+        "verdict": separation,
+        "files_with_rendering": files_with_render,
+        "files_with_logic": files_with_logic,
+        "files_with_both": files_with_both,
+    }
+
+    # ---- HTML validation ----
+    html_files = [f for f in code_files if f.suffix == ".html"]
+    html_valid, html_errors = _validate_html(html_files, workspace)
+    results["html_validation"] = {"valid": html_valid == "true", "errors": html_errors}
+
+    # ---- Code duplication ----
+    duplication_pct = _duplication_percentage(workspace)
+    results["duplication_percentage"] = round(duplication_pct, 2)
+
+    # ---- Compute score ----
+    score = 100
+
+    # Unnecessary files (-10 each, max -30)
+    score -= min(len(unnecessary) * 10, 30)
+
+    # Too many dependencies (-5 each over 2, max -20)
+    if deps > 2:
+        score -= min((deps - 2) * 5, 20)
+
+    # Console.logs (-2 each, max -10)
+    score -= min(console_logs * 2, 10)
+
+    # Over-engineering (-10)
+    if complexity == "over-engineered":
+        score -= 10
+
+    # Invalid HTML (-5)
+    if html_valid == "false":
+        score -= 5
+
+    # High duplication (-10 over 10%, -5 over 5%)
+    if duplication_pct > 10:
+        score -= 10
+    elif duplication_pct > 5:
+        score -= 5
+
+    # Long functions (-5 if any > 100 lines)
+    if max_func_length > 100:
+        score -= 5
+
+    # Deep nesting (-5 if > 8 levels)
+    if max_nesting > 8:
+        score -= 5
+
+    # Inconsistent naming (-5 if < 80% consistency)
+    if consistency < 80:
+        score -= 5
+
+    # Excessive magic numbers (-5)
+    if excessive_magic:
+        score -= 5
+
+    # Bonus for separated concerns (+5)
+    if separation == "separated":
+        score += 5
+
+    score = max(0, min(100, score))
+    results["score"] = round(score / 100, 2)
+
+    print(json.dumps(results, indent=2))
+
+
+# Script entry point: run the analysis only when executed directly, not on import.
+if __name__ == "__main__":
+ main()