code-analysis.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

code-analysis.py (12216B)
      1 #!/usr/bin/env python3
      2 """Code quality analysis for generated Tetris implementations.
      3 
      4 Measures code quality attributes that don't require running the game.
      5 All checks are deterministic -- no LLM grading.
      6 
      7 Usage: python3 code-analysis.py <workspace_path> <language>
      8 Output: JSON to stdout
      9 """
     10 
     11 import json
     12 import os
     13 import re
     14 import subprocess
     15 import sys
     16 from pathlib import Path
     17 
     18 
     19 def run_cmd(cmd: list[str], cwd: str, timeout: int = 30) -> str:
     20     try:
     21         result = subprocess.run(
     22             cmd, capture_output=True, text=True, cwd=cwd, timeout=timeout
     23         )
     24         return result.stdout.strip()
     25     except Exception:
     26         return ""
     27 
     28 
     29 def main():
     30     workspace = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
     31     language = sys.argv[2] if len(sys.argv) > 2 else "typescript"
     32 
     33     os.chdir(workspace)
     34     results: dict = {}
     35 
     36     # ---- File inventory ----
     37     all_files = list(
     38         f for f in workspace.rglob("*")
     39         if f.is_file()
     40         and "node_modules" not in str(f)
     41         and ".git" not in str(f)
     42     )
     43     code_extensions = {".ts", ".tsx", ".js", ".jsx", ".html", ".css"}
     44     doc_extensions = {".md", ".txt"}
     45 
     46     code_files = [f for f in all_files if f.suffix in code_extensions]
     47     doc_files = [f for f in all_files if f.suffix in doc_extensions]
     48 
     49     unnecessary_patterns = [
     50         "README.md", "IMPLEMENTATION.md", "FEATURES.md", "QUICK_START.txt",
     51         "CHANGELOG.md", "TODO.md", "server.js",
     52     ]
     53     unnecessary = [f.name for f in all_files if f.name in unnecessary_patterns]
     54 
     55     results["files"] = {
     56         "total": len(all_files),
     57         "code": len(code_files),
     58         "docs": len(doc_files),
     59         "unnecessary": len(unnecessary),
     60         "unnecessary_list": unnecessary,
     61     }
     62 
     63     # ---- Lines of code ----
     64     total_loc = 0
     65     for f in code_files:
     66         try:
     67             total_loc += len(f.read_text().splitlines())
     68         except Exception:
     69             pass
     70     results["lines_of_code"] = total_loc
     71 
     72     # ---- Dependencies ----
     73     pkg_json = workspace / "package.json"
     74     deps = 0
     75     dev_deps = 0
     76     if pkg_json.exists():
     77         try:
     78             pkg = json.loads(pkg_json.read_text())
     79             deps = len(pkg.get("dependencies", {}))
     80             dev_deps = len(pkg.get("devDependencies", {}))
     81         except Exception:
     82             pass
     83     results["dependencies"] = {"production": deps, "dev": dev_deps, "total": deps + dev_deps}
     84 
     85     # ---- Complexity classification ----
     86     if len(code_files) <= 2:
     87         complexity = "minimal"
     88     elif len(code_files) <= 5:
     89         complexity = "moderate"
     90     else:
     91         complexity = "over-engineered"
     92     results["complexity"] = complexity
     93 
     94     # ---- Console.log count ----
     95     console_logs = 0
     96     for f in code_files:
     97         if f.suffix in {".ts", ".tsx", ".js", ".jsx"}:
     98             try:
     99                 console_logs += f.read_text().count("console.log")
    100             except Exception:
    101                 pass
    102     results["console_logs"] = console_logs
    103 
    104     # ---- Read all JS/TS source for further analysis ----
    105     all_source = ""
    106     for f in code_files:
    107         if f.suffix in {".ts", ".tsx", ".js", ".jsx"}:
    108             try:
    109                 all_source += f.read_text() + "\n"
    110             except Exception:
    111                 pass
    112 
    113     # ---- Magic numbers ----
    114     # Find numeric literals > 1 that aren't in common patterns
    115     magic_re = re.compile(r"(?<!\w)(\d{2,})(?!\w)")
    116     ok_numbers = {"10", "20", "100", "200", "255", "300", "600", "1000", "60", "30"}
    117     magic_numbers = []
    118     for match in magic_re.finditer(all_source):
    119         num = match.group(1)
    120         if num not in ok_numbers and not num.startswith("0x"):
    121             magic_numbers.append(num)
    122     # Only flag if excessive
    123     results["magic_numbers"] = {
    124         "count": len(magic_numbers),
    125         "excessive": len(magic_numbers) > 20,
    126     }
    127 
    128     # ---- Function length ----
    129     # Simple heuristic: count lines between function/method declarations
    130     func_re = re.compile(r"(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)|[a-zA-Z_]\w*)\s*=>|\w+\s*\([^)]*\)\s*\{)")
    131     func_lengths = []
    132     lines = all_source.split("\n")
    133     in_func_start = -1
    134     brace_depth = 0
    135     for i, line in enumerate(lines):
    136         if func_re.search(line) and "{" in line:
    137             in_func_start = i
    138             brace_depth = 0
    139         if in_func_start >= 0:
    140             brace_depth += line.count("{") - line.count("}")
    141             if brace_depth <= 0 and i > in_func_start:
    142                 func_lengths.append(i - in_func_start + 1)
    143                 in_func_start = -1
    144 
    145     avg_func_length = round(sum(func_lengths) / len(func_lengths), 1) if func_lengths else 0
    146     max_func_length = max(func_lengths) if func_lengths else 0
    147     results["function_length"] = {
    148         "count": len(func_lengths),
    149         "average": avg_func_length,
    150         "max": max_func_length,
    151         "long_functions": sum(1 for f in func_lengths if f > 50),
    152     }
    153 
    154     # ---- Nesting depth ----
    155     max_nesting = 0
    156     for line in lines:
    157         stripped = line.lstrip()
    158         indent = len(line) - len(stripped)
    159         # Approximate nesting from indentation (assuming 2-space indent)
    160         depth = indent // 2
    161         max_nesting = max(max_nesting, depth)
    162     results["max_nesting_depth"] = max_nesting
    163 
    164     # ---- Global scope pollution ----
    165     # Count var declarations at top level (indent 0)
    166     globals_count = 0
    167     global_re = re.compile(r"^(?:var|let|const)\s+\w+")
    168     for line in lines:
    169         if not line.startswith(" ") and not line.startswith("\t") and global_re.match(line):
    170             globals_count += 1
    171     results["global_declarations"] = globals_count
    172 
    173     # ---- Naming consistency ----
    174     camel_count = len(re.findall(r"\b[a-z]+[A-Z]\w*\b", all_source))
    175     snake_count = len(re.findall(r"\b[a-z]+_[a-z]+\w*\b", all_source))
    176     if camel_count > 0 and snake_count > 0:
    177         total_names = camel_count + snake_count
    178         dominant = "camelCase" if camel_count >= snake_count else "snake_case"
    179         consistency = round(max(camel_count, snake_count) / total_names * 100, 1)
    180     else:
    181         dominant = "camelCase" if camel_count > 0 else "snake_case" if snake_count > 0 else "unknown"
    182         consistency = 100.0
    183     results["naming"] = {
    184         "dominant_style": dominant,
    185         "consistency_pct": consistency,
    186         "camel_case": camel_count,
    187         "snake_case": snake_count,
    188     }
    189 
    190     # ---- Error handling ----
    191     try_catch_count = all_source.count("try {") + all_source.count("try{")
    192     results["error_handling"] = {
    193         "try_catch_blocks": try_catch_count,
    194         "has_error_handling": try_catch_count > 0,
    195     }
    196 
    197     # ---- Comments ratio ----
    198     comment_lines = 0
    199     source_lines = 0
    200     in_block_comment = False
    201     for line in lines:
    202         stripped = line.strip()
    203         if not stripped:
    204             continue
    205         if in_block_comment:
    206             comment_lines += 1
    207             if "*/" in stripped:
    208                 in_block_comment = False
    209             continue
    210         if stripped.startswith("/*"):
    211             comment_lines += 1
    212             if "*/" not in stripped:
    213                 in_block_comment = True
    214             continue
    215         if stripped.startswith("//"):
    216             comment_lines += 1
    217             continue
    218         source_lines += 1
    219 
    220     comment_ratio = round(comment_lines / max(source_lines, 1) * 100, 1)
    221     results["comments"] = {
    222         "comment_lines": comment_lines,
    223         "source_lines": source_lines,
    224         "ratio_pct": comment_ratio,
    225     }
    226 
    227     # ---- Separation of concerns ----
    228     # Check if rendering code and game logic are in the same functions
    229     render_keywords = re.compile(
    230         r"\b(canvas|getContext|fillRect|clearRect|strokeRect|drawImage|render|draw|paint|ctx\.|requestAnimationFrame)\b"
    231     )
    232     logic_keywords = re.compile(
    233         r"\b(rotate|collide|collision|clearLine|clearRow|gameOver|game_over|score|level|dropPiece|lockPiece|spawnPiece|checkLine)\b",
    234         re.IGNORECASE,
    235     )
    236 
    237     # Check per-file separation
    238     files_with_render = 0
    239     files_with_logic = 0
    240     files_with_both = 0
    241     for f in code_files:
    242         if f.suffix not in {".ts", ".tsx", ".js", ".jsx"}:
    243             continue
    244         try:
    245             content = f.read_text()
    246         except Exception:
    247             continue
    248         has_render = bool(render_keywords.search(content))
    249         has_logic = bool(logic_keywords.search(content))
    250         if has_render:
    251             files_with_render += 1
    252         if has_logic:
    253             files_with_logic += 1
    254         if has_render and has_logic:
    255             files_with_both += 1
    256 
    257     if len(code_files) <= 1:
    258         separation = "single-file"
    259     elif files_with_both == 0 and files_with_render > 0 and files_with_logic > 0:
    260         separation = "separated"
    261     elif files_with_both > 0:
    262         separation = "mixed"
    263     else:
    264         separation = "unclear"
    265 
    266     results["separation_of_concerns"] = {
    267         "verdict": separation,
    268         "files_with_rendering": files_with_render,
    269         "files_with_logic": files_with_logic,
    270         "files_with_both": files_with_both,
    271     }
    272 
    273     # ---- HTML validation ----
    274     html_files = [f for f in code_files if f.suffix == ".html"]
    275     html_valid = "no_html"
    276     html_errors = 0
    277     if html_files:
    278         run_cmd(["npm", "install", "--save-dev", "html-validate"], str(workspace))
    279         for hf in html_files[:1]:  # Just check the main HTML
    280             output = run_cmd(
    281                 ["npx", "html-validate", "--formatter", "json", str(hf)],
    282                 str(workspace),
    283             )
    284             if output:
    285                 try:
    286                     data = json.loads(output)
    287                     html_errors = sum(r.get("errorCount", 0) for r in data)
    288                     html_valid = "true" if html_errors == 0 else "false"
    289                 except json.JSONDecodeError:
    290                     pass
    291     results["html_validation"] = {"valid": html_valid == "true", "errors": html_errors}
    292 
    293     # ---- Code duplication ----
    294     duplication_pct = 0.0
    295     run_cmd(["npm", "install", "--save-dev", "jscpd"], str(workspace))
    296     output = run_cmd(
    297         ["npx", "jscpd", "--min-lines", "5", "--min-tokens", "50",
    298          "--reporters", "json", "--ignore", "node_modules,package-lock.json", "."],
    299         str(workspace), timeout=60,
    300     )
    301     if output:
    302         try:
    303             data = json.loads(output)
    304             duplication_pct = data.get("statistics", {}).get("total", {}).get("percentage", 0)
    305         except json.JSONDecodeError:
    306             pass
    307     results["duplication_percentage"] = round(duplication_pct, 2)
    308 
    309     # ---- Compute score ----
    310     score = 100
    311 
    312     # No code at all = 0 score (empty workspace or build failed)
    313     if total_loc < 50:
    314         results["score"] = 0.0
    315         results["score_reason"] = f"insufficient code ({total_loc} LOC, minimum 50)"
    316         print(json.dumps(results, indent=2))
    317         return
    318 
    319     # No HTML entry point = major penalty (game can't be played)
    320     if html_valid == "no_html":
    321         score -= 40
    322 
    323     # Unnecessary files (-10 each, max -30)
    324     score -= min(len(unnecessary) * 10, 30)
    325 
    326     # Too many dependencies (-5 each over 2, max -20)
    327     if deps > 2:
    328         score -= min((deps - 2) * 5, 20)
    329 
    330     # Console.logs (-2 each, max -10)
    331     score -= min(console_logs * 2, 10)
    332 
    333     # Over-engineering (-10)
    334     if complexity == "over-engineered":
    335         score -= 10
    336 
    337     # Invalid HTML (-5)
    338     if html_valid == "false":
    339         score -= 5
    340 
    341     # High duplication (-10 over 10%, -5 over 5%)
    342     if duplication_pct > 10:
    343         score -= 10
    344     elif duplication_pct > 5:
    345         score -= 5
    346 
    347     # Long functions (-5 if any > 100 lines)
    348     if max_func_length > 100:
    349         score -= 5
    350 
    351     # Deep nesting (-5 if > 8 levels)
    352     if max_nesting > 8:
    353         score -= 5
    354 
    355     # Inconsistent naming (-5 if < 80% consistency)
    356     if consistency < 80:
    357         score -= 5
    358 
    359     # Excessive magic numbers (-5)
    360     if len(magic_numbers) > 20:
    361         score -= 5
    362 
    363     # Bonus for separated concerns (+5)
    364     if separation == "separated":
    365         score += 5
    366 
    367     score = max(0, min(100, score))
    368     results["score"] = round(score / 100, 2)
    369 
    370     print(json.dumps(results, indent=2))
    371 
    372 
# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README