scan.json (25845B)
1 { 2 "paper": { 3 "title": "A case study on the transformative potential of AI in software engineering on LeetCode and ChatGPT", 4 "authors": ["Manuel Merkel", "Jens Dörpinghaus"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2501.03639", 8 "doi": "10.48550/arXiv.2501.03639" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper provides a Zenodo repository with scripts and data: https://doi.org/10.5281/zenodo.13881451. This is stated in both the 'Code availability' and 'Data availability' sections." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The dataset is released via the same Zenodo repository (https://doi.org/10.5281/zenodo.13881451). The 'Data availability' section explicitly states this." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions Python scripts, the requests library, Selenium, SonarQube, PostgreSQL, and various other tools, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "While the Zenodo repository contains code and data, the paper itself does not provide step-by-step reproduction instructions. The methodology is described in prose but there are no specific commands or README instructions for reproducing the experiments." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper reports p-values and effect sizes but does not report confidence intervals or error bars for its main results. Tables show means and medians without uncertainty bounds." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": true, 42 "justification": "The paper uses Mann-Whitney U tests (H1, H2) and Wilcoxon tests (H3, H4) for hypothesis testing, with p-values reported for each hypothesis. It also applies Bonferroni correction for multiple comparisons (adjusted significance level of 1.25 × 10^-2)." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "Cohen's d is reported for each accepted hypothesis: d=0.65 (medium) for code quality, d=0.14 (small) for understandability, d=0.44 (small) for runtime. This provides context for the magnitude of differences." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No power analysis or sample size justification is provided. The sample size is determined by what was available on LeetCode (2,321 problems, 57,238 user solutions) rather than by any a priori calculation." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Box plots showing IQR are provided for all four hypotheses (Figures 5-8). Medians and means are reported in tables. The distribution characteristics are visually documented." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The study compares GPT-4o generated code against human-written code from LeetCode users across all four metrics, which serves as the baseline." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The comparison is between GPT-4o (the latest OpenAI model at the time, May 2024) and current LeetCode user solutions. The related work section also contextualizes against recent 2024 studies." 70 }, 71 "ablation_study": { 72 "applies": false, 73 "answer": false, 74 "justification": "The study compares human vs. AI code quality; there is no multi-component system to ablate. 
The study measures four independent metrics rather than testing components of a proposed system." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Four metrics are used: code smells per kLOC (code quality), cognitive complexity per kLOC (understandability), runtime rank (time behaviour), and memory usage rank (resource utilisation)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "No human evaluation of code quality is included. All evaluation is automated through SonarQube metrics and LeetCode platform metrics. Given that claims are about code quality and understandability, human evaluation of these subjective qualities would be relevant." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is not a machine learning training/evaluation study. The study analyzes all available LeetCode problems; there is no model tuning involved that would require train/test separation." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down by difficulty level (Easy, Medium, Hard) in Tables 4-7, and also by before/after the training cutoff date. Additional correlation analyses by problem age and difficulty are reported." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper discusses GPT-4o's failure to solve 235 problems (10.12% invalid), the drop in performance for after-problems (51.94% vs 93.38%), and the failure of the memory usage hypothesis (H3). The heatmap in Figure 4 visualizes failure patterns." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "H3 (resource utilisation) is reported as not accepted -- GPT-4o does not outperform humans on memory usage. The paper also reports that for after-problems, the code understandability difference was not significant (p=0.43). 
These are genuine negative results." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims that GPT-4o produces 'significantly lower values across all three metrics' (quality, understandability, runtime) vs. humans, that memory usage showed no significant improvement, and that GPT-4o struggled with problems outside training data. All of these are supported by the hypothesis tests in Section 4." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper makes implicit causal claims about training data influencing GPT-4o's quality, and the title implies AI has 'transformative potential.' The study design is observational/correlational -- it compares two groups (human vs. AI code) without controlling for confounds such as problem selection bias, developer experience levels, or the competitive incentive structure of LeetCode." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title claims broad 'transformative potential of AI in software engineering' but results are limited to Python solutions on LeetCode coding problems using GPT-4o. The paper does not adequately bound generalizations: LeetCode problems are algorithmic puzzles, not representative of real-world software engineering. Section 6 discusses future work extending to other languages and LLMs, implicitly acknowledging limited scope, but the title and abstract overreach." 
122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 5 discusses several alternative explanations: LeetCode users may be inexperienced developers (Section 5.1), the competitive nature of LeetCode may bias memory usage results (Section 5.2), training data may prioritize runtime over memory (Section 5.2), and problem difficulty correlates with developer expertise (Section 5.1 with Spearman correlation test)." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper specifies 'gpt-4o-2024-05-13' as the exact model version used (Section 3.1.3, Appendix B), with the knowledge cutoff date of October 2023 stated explicitly." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Example prompts are shown in Figure 3a (first prompt) and Figure 3b (error-fixing prompt). The prompt structure is described in detail in Appendix B with all four components. While not every problem-specific prompt is shown, the template with its structure and an example with actual fill values is provided." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Appendix B states: temperature = 1 (default), n = 1 output, maximum tokens = 4,096. These API parameters are explicitly documented." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The multi-round fixing approach (up to 5 retry attempts) is described in detail in Section 3.1.3 and Appendix B, including how error information from LeetCode is incorporated into subsequent prompts." 
149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "The data pipeline is extensively documented: problem filtering criteria (Appendix A), code extraction from Markdown posts (Appendix C), import handling, code validation via AST parsing, and removal of non-Python/invalid solutions. Counts at each stage are provided (278,397 posts -> 70,261 Python -> 57,238 valid)." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The discussion (Section 5) touches on some limitations embedded within the discussion of results, and the conclusion mentions future work, but there is no dedicated section for limitations." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "Though not in a dedicated section, the paper discusses specific threats: LeetCode metrics for runtime may not closely align with local times (Section 3.2.3, citing Choudhuri et al.), developer experience levels on LeetCode may skew results (Section 5.1), the small after-problems sample (107 cases, Section 5.1), and the competitive incentive structure of LeetCode affecting memory optimization (Section 5.2)." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not explicitly state what the results do NOT show. It mentions future work (other languages, other LLMs) in Section 6 but does not clearly delineate that results apply only to Python, only to LeetCode algorithmic problems, only to GPT-4o, and should not be generalized to real-world software engineering." 
171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "Raw data is available via the Zenodo repository (https://doi.org/10.5281/zenodo.13881451), as stated in the Data availability section." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "The data collection procedure is described in extensive detail across Sections 3.1-3.2 and Appendices A-C, including web scraping via GraphQL queries, LeetCode API interaction, SonarQube analysis, and the time period (March-August 2024)." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited. The study mines existing LeetCode user posts and generates code via API. There are no human subjects." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The full data pipeline is documented in Figure 1 and throughout Section 3 and Appendices: 2,992 problems -> 2,321 after filtering -> code generation (2,086 valid) and user scraping (278,397 posts -> 70,261 Python -> 57,238 valid). Each stage has explicit counts and criteria." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "The Declarations section states: 'This research received no external funding. This article was funded by the Open Access Publication Fund of the Federal Institute for Vocational Education and Training (BIBB), Bonn.'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: University of Stuttgart, BIBB, University of Koblenz, and Linnaeus University. Neither author is affiliated with OpenAI or any AI company whose product is being evaluated." 
205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "The funder (BIBB) funded only open access publication, not the research itself. BIBB is a vocational education institute with no financial stake in GPT-4o performance outcomes." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": true, 214 "justification": "The Declarations section includes 'Conflict of interest/Competing interests: The authors declare no conflict of interest.' This is an explicit competing interests statement." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": true, 221 "justification": "The training data cutoff for GPT-4o is stated as October 2023 (Section 3.1.3, Appendix B), with a specific cutoff date of 1 October 2023 used for analysis (Section 4.1)." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "The paper explicitly discusses train/test overlap: problems are divided into 'before-problems' (published before the cutoff, so potentially in the training data, N=2,115) and 'after-problems' (post-cutoff, N=206). Section 4.1 and Table 7 analyze this split. The heatmap in Figure 4 visualizes the performance drop near the cutoff." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": true, 231 "justification": "Section 2.5 and Appendix A explicitly discuss contamination: 'some of the GenAIs have undergone training on code sourced from GitHub, there is a possibility of data contamination.' The study deliberately includes both before and after problems rather than excluding potentially contaminated ones, and reports differential performance (93.38% vs 51.94% solve rates)." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants. The study mines existing LeetCode posts and generates code via API." 
239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants. The Declarations section states 'Ethics approval and consent to participate: Not applicable.'" 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants were recruited. LeetCode users' posts are mined but the users are not participants in the study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants. Inclusion/exclusion criteria for LeetCode problems and posts are documented, but this is data filtering, not human subject selection." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants and no experimental conditions to randomize." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants and no experimental conditions requiring blinding." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper does not report the cost of calling the GPT-4o API for 3,676 prompts. Token counts are mentioned briefly (e.g., 451 tokens for the Two Sum example) but total API cost is not reported." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "No mention of total compute budget, API costs, or hardware used for the SonarQube analysis of 59,324 files. The study required significant compute (SonarQube with PostgreSQL, Selenium automation, OpenAI API) but this is not quantified." 
281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "GPT-4o produces code with significantly fewer code smells per kLOC than human developers on LeetCode (median 76.92 vs 104.17).", 287 "evidence": "Mann-Whitney U test, p = 2.13 × 10^-89, Cohen's d = 0.65 (medium effect). Based on 2,082 problems with 2,082 generated and 55,392 user solutions (Section 4.2, Table 8).", 288 "supported": "strong" 289 }, 290 { 291 "claim": "GPT-4o produces code with significantly lower cognitive complexity per kLOC than human developers (median 375 vs 405.29).", 292 "evidence": "Mann-Whitney U test, p = 4.01 × 10^-6, Cohen's d = 0.14 (small effect). However, for after-problems the p-value was 0.43 (not significant) (Section 4.3, Table 9).", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "GPT-4o generated code runs significantly faster than human code on LeetCode (median runtime rank 57.18).", 297 "evidence": "Wilcoxon test, p = 1.17 × 10^-22, Cohen's d = 0.44 (small effect). Based on 2,086 problems (Section 4.5, Table 12).", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "GPT-4o does not produce code that uses less memory than human developers on LeetCode.", 302 "evidence": "Wilcoxon test, p = 5.85 × 10^-2, above adjusted significance level. Memory rank median was 48.16, below 50th percentile (Section 4.4, Table 10).", 303 "supported": "strong" 304 }, 305 { 306 "claim": "GPT-4o has a limited capacity to generalise to problems not in its training data, solving only 51.94% of after-problems vs 93.38% of before-problems.", 307 "evidence": "Table 7 shows 107/206 after-problems solved vs 1,975/2,115 before-problems. 
The heatmap in Figure 4 visualizes the performance drop near the cutoff.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "The overall valid solution rate for GPT-4o across all LeetCode problems was 89.88%.", 312 "evidence": "2,086 valid solutions out of 2,321 problems within 5 attempts (Section 3.1.3).", 313 "supported": "strong" 314 } 315 ], 316 "methodology_tags": ["benchmark-eval", "observational"], 317 "key_findings": "GPT-4o generates Python code with significantly fewer code smells (medium effect, d=0.65) and lower cognitive complexity (small effect, d=0.14) than human LeetCode solutions. GPT-4o code also runs faster (small effect, d=0.44) but does not use less memory than human code. GPT-4o's problem-solving ability drops sharply for problems outside its training data (51.94% vs 93.38% success rate), demonstrating limited generalization.", 318 "red_flags": [ 319 { 320 "flag": "Title overstates scope", 321 "detail": "The title claims 'transformative potential of AI in software engineering' but the study only evaluates GPT-4o on Python LeetCode algorithmic puzzles, which are not representative of real-world software engineering tasks." 322 }, 323 { 324 "flag": "Non-representative baseline population", 325 "detail": "LeetCode users include many beginners and interview preparers, making them a potentially weak baseline. The paper acknowledges this (Section 5.1) but still draws conclusions about 'humans' vs 'AI' code quality. A comparison against professional production code would be more meaningful." 326 }, 327 { 328 "flag": "LeetCode metrics unreliability acknowledged but used anyway", 329 "detail": "The paper cites Choudhuri et al. noting that LeetCode runtime/memory metrics 'do not consistently align closely with the times recorded locally' and have 'a higher degree of variance' (Section 3.2.3), yet these metrics are central to two of the four hypotheses." 
330 }, 331 { 332 "flag": "No dedicated limitations section", 333 "detail": "Despite being a study with multiple threats to validity (non-representative sample, unreliable metrics, potential contamination), there is no dedicated limitations or threats-to-validity section." 334 }, 335 { 336 "flag": "After-problems sample too small for strong claims", 337 "detail": "Only 107 after-problems were solved by GPT-4o (out of 206), yet the paper draws conclusions about generalization capacity. The code understandability result was not significant for after-problems (p=0.43), suggesting the main findings may not hold for uncontaminated problems." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Evaluating Large Language Models Trained on Code", 343 "authors": ["Mark Chen", "Jerry Tworek"], 344 "year": 2021, 345 "arxiv_id": "2107.03374", 346 "relevance": "Introduces the HumanEval benchmark widely used for evaluating LLM code generation capabilities." 347 }, 348 { 349 "title": "No need to lift a finger anymore? Assessing the quality of code generation by ChatGPT", 350 "authors": ["Zhijie Liu", "Yutian Tang", "Xiapu Luo", "Yuming Zhou", "Liang Feng Zhang"], 351 "year": 2024, 352 "doi": "10.1109/TSE.2024.3392499", 353 "relevance": "Large-scale study of ChatGPT code quality on LeetCode, examines correctness, complexity and security with multi-round fixing." 354 }, 355 { 356 "title": "An empirical study of code smells in transformer-based code generation techniques", 357 "authors": ["Mohammed Latif Siddiq"], 358 "year": 2022, 359 "doi": "10.1109/SCAM55253.2022.00014", 360 "relevance": "Analyzes code quality issues in LLM-generated code using static analysis, finding smells propagate from training data." 
361 }, 362 { 363 "title": "An empirical evaluation of GitHub Copilot's code suggestions", 364 "authors": ["Nhan Nguyen", "Sarah Nadi"], 365 "year": 2022, 366 "doi": "10.1145/3524842.3528470", 367 "relevance": "Evaluates Copilot code correctness and understandability on LeetCode problems across four programming languages." 368 }, 369 { 370 "title": "Refining ChatGPT-generated code: Characterizing and mitigating code quality issues", 371 "authors": ["Yue Liu"], 372 "year": 2024, 373 "doi": "10.1145/3643674", 374 "relevance": "Studies ChatGPT code quality issues on 2,033 LeetCode tasks and evaluates self-repair capabilities." 375 }, 376 { 377 "title": "A performance study of LLM-generated code on LeetCode", 378 "authors": ["Tristan Coignion", "Clément Quinton", "Romain Rouvoy"], 379 "year": 2024, 380 "doi": "10.1145/3661167.3661221", 381 "relevance": "Compares runtime and memory efficiency of 18 LLMs on LeetCode, finding LLM code is on average more efficient than human code." 382 }, 383 { 384 "title": "Where Are Large Language Models for Code Generation on GitHub?", 385 "authors": ["Xiao Yu"], 386 "year": 2024, 387 "arxiv_id": "2406.19544", 388 "relevance": "Large-scale study comparing LLM-generated code found on GitHub with human-written code using SonarQube metrics." 389 }, 390 { 391 "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions", 392 "authors": ["Hammond Pearce", "Baleegh Ahmad"], 393 "year": 2022, 394 "doi": "10.1109/SP46214.2022.9833571", 395 "relevance": "Early study evaluating security vulnerabilities in AI-generated code from GitHub Copilot." 396 }, 397 { 398 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 399 "authors": ["Jiawei Liu"], 400 "year": 2023, 401 "relevance": "Evaluates 26 LLMs on HumanEval+ with GPT-4 achieving 76.2%, relevant to LLM code generation capability assessment." 
402 }, 403 { 404 "title": "Programming with ChatGPT: How far can we go?", 405 "authors": ["Alessio Bucaioni"], 406 "year": 2024, 407 "doi": "10.1016/j.mlwa.2024.100526", 408 "relevance": "Studies ChatGPT code generation for Java and C++ on LeetCode, finding opposite efficiency results to some other studies." 409 }, 410 { 411 "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks", 412 "authors": ["Alon Jacovi"], 413 "year": 2023, 414 "arxiv_id": "2305.10160", 415 "relevance": "Addresses benchmark contamination strategies, directly relevant to evaluation methodology for LLM code generation." 416 }, 417 { 418 "title": "Program code generation with generative AIs", 419 "authors": ["Bulat Idrisov", "Tim Schlippe"], 420 "year": 2024, 421 "doi": "10.3390/a17020062", 422 "relevance": "Compares 6 GenAIs on LeetCode problems created after training cutoff, finding Copilot most effective at 50%." 423 } 424 ] 425 }