scan.json (18954B)
1 { 2 "paper": { 3 "title": "Large Language Models for Code Generation: A Comprehensive Survey of Challenges, Techniques, Evaluation, and Applications", 4 "authors": ["Nam Huynh", "Beiyu Lin"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2503.01245" 8 }, 9 "scan_version": 2, 10 "active_modules": ["survey_methodology"], 11 "methodology_tags": ["meta-analysis"], 12 "key_findings": "This survey organizes LLM code generation literature into four areas: limitations/challenges (resource constraints, errors, biases, security), fine-tuning techniques (domain-specific datasets, feedback, prompt engineering), evaluation (CodeBLEU, pass@k, HumanEval, SWE-bench, BigCodeBench), and applications (Copilot, Code Llama, AlphaCode). The survey covers 38 references total, which is very thin for a 'comprehensive survey.' No original analysis, quality assessment, or structured review methodology is applied.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No code or analysis scripts are released. No repository link is provided." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No dataset, search corpus, or extracted data is released." 24 }, 25 "environment_specified": { 26 "applies": false, 27 "answer": false, 28 "justification": "This is a survey paper with no computational experiments requiring an environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No instructions are provided for reproducing the literature search or paper selection process." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": false, 39 "answer": false, 40 "justification": "Survey paper with no original experiments or statistical analysis." 41 }, 42 "significance_tests": { 43 "applies": false, 44 "answer": false, 45 "justification": "Survey paper with no original experiments." 
46 }, 47 "effect_sizes_reported": { 48 "applies": false, 49 "answer": false, 50 "justification": "Survey paper with no original experiments." 51 }, 52 "sample_size_justified": { 53 "applies": false, 54 "answer": false, 55 "justification": "Survey paper with no original experiments." 56 }, 57 "variance_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "Survey paper with no original experiments." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": false, 67 "justification": "The survey does not compare itself against prior surveys of LLM code generation. No positioning against existing reviews." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": false, 72 "justification": "No prior surveys are compared against." 73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "Survey paper — no system components to ablate." 78 }, 79 "multiple_metrics": { 80 "applies": false, 81 "answer": false, 82 "justification": "Survey paper — no original evaluation is conducted." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Survey paper — no system outputs to evaluate." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "Survey paper — no experiments." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "The survey organizes its reviewed papers into clear categories (Table 1): limits and challenges, fine-tuning techniques, evaluations, and applications, with subtopics and per-paper breakdowns." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section III discusses limitations and challenges of LLMs including syntactic/semantic errors, biases, and security risks, which function as failure cases of the surveyed systems." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The survey reports negative findings from reviewed papers: e.g., Copilot generates insecure code in ~40% of cases, ChatGPT code has 47% maintainability issues, bias increases with model size." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims to provide 'a comprehensive overview of LLMs for code generation' covering challenges, techniques, evaluations, and applications. The paper delivers on this structure, though the coverage is thin (38 references)." 115 }, 116 "causal_claims_justified": { 117 "applies": false, 118 "answer": false, 119 "justification": "The paper is a survey that reports findings from other papers without making original causal claims." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title bills the paper as a 'Comprehensive Survey', but the paper covers only 38 references, which is extremely thin coverage. The abstract does not bound this limitation. Many major works in the space are missing." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": false, 128 "answer": false, 129 "justification": "Pure survey with no original empirical results requiring alternative explanations." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": false, 133 "answer": false, 134 "justification": "Survey paper with no original measurements." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "Survey paper — no models are used for experiments." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "Survey paper — no prompting used." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "Survey paper — no experiments." 
152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "Survey paper — no agentic scaffolding used." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "No description of how papers were selected for this survey. No search queries, databases searched, inclusion/exclusion criteria, or filtering pipeline is documented. The paper simply presents 38 references without explaining the selection methodology." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "No limitations section is present. The conclusion summarizes findings but does not discuss limitations of the survey itself." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit scope boundaries are stated. The paper does not discuss what was excluded from the survey or what the survey does NOT cover." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No list of all papers considered, search results, or extracted data is available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": false, 190 "justification": "No description of how the 38 references were identified or collected. No search strategy, databases, or time period is described." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants are recruited, so this item does not apply. The data source is published literature, gathered as an ad-hoc collection rather than drawn from a standard benchmark." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "No documentation of how papers were found, screened, or selected for inclusion." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Authors are identified as being from the School of Computer Science, University of Oklahoma." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "Survey paper — does not evaluate any model on a benchmark." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Survey paper — does not evaluate any model on a benchmark." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Survey paper — does not evaluate any model on a benchmark." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 
257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "Survey paper — no method with inference costs." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Survey paper — no computational experiments." 289 } 290 }, 291 "survey_methodology": { 292 "prisma_or_structured_protocol": { 293 "applies": true, 294 "answer": false, 295 "justification": "No structured review protocol is followed. No PRISMA diagram, no systematic search strategy, no reproducible queries. The 38 references appear to be ad-hoc selections." 296 }, 297 "quality_assessment_of_sources": { 298 "applies": true, 299 "answer": false, 300 "justification": "No quality assessment of included papers is performed. All sources are treated equally regardless of methodology, sample size, or rigor." 301 }, 302 "publication_bias_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No discussion of publication bias or whether the surveyed literature skews toward positive results." 
306 } 307 } 308 }, 309 "claims": [ 310 { 311 "claim": "The survey provides a comprehensive overview of LLMs for code generation covering challenges, techniques, evaluations, and applications.", 312 "evidence": "The paper is organized into four sections (III-VI) covering these topics, supported by 38 references.", 313 "supported": "weak" 314 }, 315 { 316 "claim": "OpenAI o1 achieves the highest performance rate of 92.4% on HumanEval, establishing itself as the best coding model.", 317 "evidence": "Cited from Vellum leaderboard data (Section 2.2). No primary source or independent verification provided.", 318 "supported": "weak" 319 }, 320 { 321 "claim": "Smaller models (7B/8B/13B) demonstrate 5-15% performance gains over 70B models under the same resource limits.", 322 "evidence": "Attributed to Hassid et al. [28], Section 3.1. The survey reports this finding but does not independently verify it.", 323 "supported": "moderate" 324 }, 325 { 326 "claim": "38.92% of GPT-4's generated code contained gender bias.", 327 "evidence": "Cited from Huang et al. [24], Section 3. The survey reports this figure without critical assessment of the methodology.", 328 "supported": "moderate" 329 } 330 ], 331 "red_flags": [ 332 { 333 "flag": "Extremely thin coverage for a 'comprehensive survey'", 334 "detail": "Only 38 references are covered, which is far too few for a paper claiming to be a comprehensive survey of LLMs for code generation. Major works are missing. Many references are blog posts, company pages, and tutorial articles rather than peer-reviewed research." 335 }, 336 { 337 "flag": "No systematic review methodology", 338 "detail": "No search strategy, inclusion/exclusion criteria, or PRISMA-style protocol is described. The paper selection appears completely ad-hoc, making it impossible to assess coverage or reproduce the survey." 
339 }, 340 { 341 "flag": "Heavy reliance on non-peer-reviewed sources", 342 "detail": "Many references are blog posts (Medium, freeCodeCamp, PromptHub, AutoGPT), company marketing pages (IBM, AWS, Nvidia), and tutorial sites (DataCamp, edX) rather than peer-reviewed research papers." 343 }, 344 { 345 "flag": "No quality assessment of reviewed papers", 346 "detail": "The survey presents findings from all sources uncritically, treating a Medium blog post with the same authority as a peer-reviewed conference paper. This launders weak results through aggregation." 347 }, 348 { 349 "flag": "No limitations section", 350 "detail": "The paper does not discuss any limitations of its own methodology or coverage, despite claiming to be 'comprehensive' with only 38 references." 351 }, 352 { 353 "flag": "Broken cross-reference", 354 "detail": "Table reference in Section 1 appears as 'Table ??' indicating an unresolved LaTeX reference, suggesting incomplete preparation." 355 } 356 ], 357 "cited_papers": [ 358 { 359 "title": "The Landscape and Challenges of HPC Research and LLMs", 360 "authors": ["L. Chen"], 361 "year": 2024, 362 "arxiv_id": "2402.02018", 363 "relevance": "Discusses resource constraints and computational demands of training LLMs for code generation." 364 }, 365 { 366 "title": "Where Do Large Language Models Fail When Generating Code?", 367 "authors": ["Z. Wang"], 368 "year": 2024, 369 "arxiv_id": "2406.08731", 370 "relevance": "Error taxonomy for LLM-generated code across six models on HumanEval." 371 }, 372 { 373 "title": "What's Wrong with Your Code Generated by Large Language Models? An Extensive Study", 374 "authors": ["S. Dou"], 375 "year": 2024, 376 "arxiv_id": "2407.06153", 377 "relevance": "Analysis of syntactic and semantic errors across seven LLMs on HumanEval+, MBPP+, APPS+." 378 }, 379 { 380 "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code", 381 "authors": ["R. 
Pan"], 382 "year": 2023, 383 "arxiv_id": "2308.03109", 384 "relevance": "Categorizes 15 types of translation errors in LLM code translation tasks." 385 }, 386 { 387 "title": "Exploring Multi-Lingual Bias of Large Code Models in Code Generation", 388 "authors": ["C. Wang"], 389 "year": 2024, 390 "arxiv_id": "2404.19368", 391 "relevance": "Evaluates multilingual bias in LLM code generation across natural languages and programming languages." 392 }, 393 { 394 "title": "Is Your AI-Generated Code Really Secure? Evaluating Large Language Models on Secure Code Generation with CodeSecEval", 395 "authors": ["J. Wang"], 396 "year": 2024, 397 "arxiv_id": "2407.02395", 398 "relevance": "Security evaluation of LLM-generated code including training data vulnerability risks." 399 }, 400 { 401 "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification", 402 "authors": ["F. Mu"], 403 "year": 2023, 404 "arxiv_id": "2310.10996", 405 "relevance": "Framework for improving code generation through ambiguity detection and clarifying questions." 406 }, 407 { 408 "title": "RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning", 409 "authors": ["J. Gehring"], 410 "year": 2024, 411 "arxiv_id": "2410.02089", 412 "relevance": "RL from execution feedback for code synthesis, achieving 37.5% solve rate on CodeContests." 413 }, 414 { 415 "title": "Prompting Techniques for Secure Code Generation: A Systematic Investigation", 416 "authors": ["C. Tony"], 417 "year": 2024, 418 "arxiv_id": "2407.07064", 419 "relevance": "Evaluates 15 prompting techniques for security of LLM-generated code." 420 }, 421 { 422 "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation", 423 "authors": ["X. Du"], 424 "year": 2023, 425 "arxiv_id": "2308.01861", 426 "relevance": "Class-level code generation benchmark with 100 tasks evaluating 11 LLMs." 
427 }, 428 { 429 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 430 "authors": ["C. E. Jimenez"], 431 "year": 2023, 432 "arxiv_id": "2310.06770", 433 "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks." 434 }, 435 { 436 "title": "Competition-Level Code Generation with AlphaCode", 437 "authors": ["Y. Li"], 438 "year": 2022, 439 "arxiv_id": "2203.07814", 440 "relevance": "AlphaCode system for competitive programming, reaching top 54.3% on Codeforces." 441 }, 442 { 443 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 444 "authors": ["Z. Feng"], 445 "year": 2020, 446 "arxiv_id": "2002.08155", 447 "relevance": "Foundational bimodal pre-trained model for NL-PL understanding and generation." 448 } 449 ] 450 }