scan.json (23735B)
1 { 2 "paper": { 3 "title": "Performance Review on LLM for solving leetcode problems", 4 "authors": [ 5 "Lun Wang", 6 "Chuanqi Shi", 7 "Shaoshuai Du", 8 "Yiyi Tao", 9 "Yixian Shen", 10 "Hang Zhang", 11 "Yanxin Shen", 12 "Xinyu Qiu" 13 ], 14 "year": 2024, 15 "venue": "2024 4th International Symposium on Artificial Intelligence and Intelligent Manufacturing (AIIM)", 16 "arxiv_id": "2502.15770", 17 "doi": "10.1109/AIIM64537.2024.10934280" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "The paper evaluates 22 LLMs on LeetCode problems using pass@k and runtime metrics. GPT-4-omni achieves the highest non-canonical pass@1 (43.36%) and pass@10 (61.95%), while most open-source models score below 5% pass@1. Solutions from o1-mini achieve a mean LeetCode runtime percentile rank of 63%, suggesting LLM-generated code is faster than most human submissions. However, the paper has significant internal inconsistencies (204 vs ~2,100 problems, 18 vs 22 models) and its reference list is dominated by unrelated self-citations.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "A GitHub link is provided in the results section: https://github.com/DHUer/LLMevaluationresults. The paper states 'All the experiment code and dataset is published' there." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper states the dataset is published at the GitHub link alongside the experiment code. LeetCode problems are also publicly available." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No environment specifications are provided. The paper mentions Python and pytest-benchmark but provides no requirements.txt, dependency versions, or setup instructions." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided. The paper describes the general methodology but gives no concrete commands or procedures to replicate the experiments." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Table I reports only point estimates for pass@1 and pass@10 with no confidence intervals, error bars, or uncertainty measures." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are used. Model performance differences are compared by raw number comparison only. Claims like 'top performers' are based solely on comparing point estimates." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No effect sizes are reported. The paper presents raw pass@k percentages and a runtime percentile rank (63%) but no formal effect size measures such as Cohen's d or structured percentage improvement with baseline context." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification is given for why 204 (or ~2,100) problems were used, why 10 solutions per temperature were generated, or why 5 temperature settings were chosen." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance or standard deviation is reported across experimental runs. The paper generates multiple solutions but only reports aggregated pass@k metrics without spread measures." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table I includes 'Canonical Solutions' (human-written, 97.94% pass@1) as a baseline, and 22 models are compared against each other." 
78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "The evaluation includes contemporary models such as GPT-4-omni, GPT-4-turbo, and o1-mini alongside older models like CodeGen and SantaCoder, providing a range of baselines." 83 }, 84 "ablation_study": { 85 "applies": false, 86 "answer": false, 87 "justification": "The paper evaluates existing LLMs as black boxes with no system components to ablate." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Multiple metrics are used: pass@1, pass@10, runtime, memory usage, and LeetCode runtime percentile rank." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation of code quality is performed. The comparison with humans is through automated LeetCode percentile rankings, not manual code review." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "The best pass@k is selected across all temperature settings on the same test problems. There is no validation/test split — the temperature hyperparameter is optimized on the test set." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": false, 107 "justification": "Although the dataset spans Easy/Medium/Hard difficulty levels (ratio ~11:50:10), no per-difficulty breakdowns of results are presented. Only aggregate pass@k numbers are shown in Table I." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "The methodology section mentions 'Error Analysis: Identification of common errors or misconceptions' but the results section contains no actual error analysis or failure case discussion." 
113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": false, 117 "justification": "While many models show very low scores (e.g., InCoder-1B at 0.10% pass@1), these are presented as observations rather than analyzed as negative results. No discussion of what went wrong or unexpected findings." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims a 'comprehensive performance evaluation' and promises insights into 'strengths and limitations.' The results section does provide pass@k data and runtime analysis supporting these general claims, though 'comprehensive' is generous given the lack of per-category breakdown." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper claims to 'observe the impact of the temperature parameter on the correctness and efficiency of the generated code' but presents no per-temperature analysis to support this claim. Temperature is varied but its effect is not isolated or reported." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The abstract claims insights into LLMs' 'potential applications and areas for improvement in automated programming assistance,' generalizing beyond LeetCode algorithmic problems to broader programming assistance without justification." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No alternative explanations are discussed for the observed performance differences between models or for the 63% runtime percentile rank finding." 
140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper measures pass@k on LeetCode and frames this as evaluating LLMs' capabilities in 'code generation and problem-solving tasks' and 'automated programming assistance' without discussing that LeetCode algorithmic problems are a narrow proxy for real-world programming ability." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Models are identified by marketing names only: 'GPT-4', 'GPT-3.5-turbo', 'GPT-4-omni', 'o1-mini', 'Copilot'. No specific API versions, snapshot dates, or exact model identifiers (e.g., 'gpt-4-0613') are provided." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "The paper describes providing 'problem statement, code comments, and code framework' as input but never shows the actual prompt text sent to the models. The prompt structure is described in natural language only." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Temperature settings are explicitly reported (0.2, 0.4, 0.6, 0.8, 1.0) along with the number of solutions generated (10 per temperature per model per problem)." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. Models are queried directly via API or IDE integration." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "The paper describes preprocessing: 'We standardized the problem data by removing any extraneous information such as solution discussions, hints, or previously submitted solutions' and extracting problem statements, function signatures, and code comments." 
172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "There is no limitations, threats to validity, or discussion section addressing the study's shortcomings." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No threats to validity are discussed. Critical threats such as benchmark contamination, LeetCode's noisy runtime measurements, and the narrow scope of algorithmic problems are unaddressed." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show or which populations/settings are excluded." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The paper provides a GitHub repository link (https://github.com/DHUer/LLMevaluationresults) claimed to contain the experiment code and dataset." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "The paper describes crawling LeetCode to collect problems, extracting problem statements, function signatures, and code comments, and standardizing the data by removing extraneous information." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. Data source is the public LeetCode platform." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": false, 210 "justification": "The pipeline from collection to analysis has unexplained inconsistencies. The contributions section claims '204 Leetcode problems' while the dataset analysis section claims 'approximately 2,100 LeetCode problems.' Table I shows 22 model entries but the contributions section claims '18 LLMs.' 
These discrepancies suggest the pipeline is incompletely documented." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "The paper contains an unfilled template placeholder: 'Identify applicable funding agency here. If none, delete this.' No funding information is provided." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All eight authors' university affiliations are listed (Duke, UCSD, UvA, JHU, SFU, Northeastern). None appear affiliated with the model providers being evaluated." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Funding is not disclosed — the template placeholder was left unfilled. Independence cannot be assessed." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for any of the models evaluated. This is critical because LeetCode problems and solutions are widely available online and almost certainly in training data." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether the evaluated models may have been trained on LeetCode problems and solutions, which are publicly available and widely scraped." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "LeetCode problems have been publicly available for years and solutions are extensively discussed online. All evaluated models likely trained on this data. This fundamental contamination risk is never mentioned." 
250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference costs, API costs, or per-example costs are reported despite generating thousands of API calls across multiple models." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No total computational budget, hardware used, or total API spend is stated." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No seed sensitivity analysis. While 10 solutions per temperature are generated, results are only reported as aggregated pass@k without examining variance across runs." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "The paper explicitly states '10 distinct solutions per model for each problem' at each of 5 temperature settings." 
311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": true, 315 "justification": "The temperature search space is explicitly stated: 5 values (0.2, 0.4, 0.6, 0.8, 1.0) with 10 solutions at each, constituting 50 generations per model per problem." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper states 'The best pass@k value observed across all temperatures was then considered the final pass@k metric' — this selects the best temperature on the test set with no separate validation split, inflating reported performance." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "22 models are compared with no statistical tests at all, let alone multiple comparison corrections." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of potential bias in the evaluation methodology. The authors control the prompt construction, temperature selection, and submission process, which could introduce systematic advantages or disadvantages." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Models ranging from 350M parameters (CodeGen-350M) to GPT-4 are compared on the same metrics without any normalization or discussion of compute differences." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "While contribution 3 claims to 'assess the usability of Leetcode as a public repository,' the results section contains no actual analysis of construct validity — whether LeetCode performance measures what the paper claims about LLM programming capability." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "GitHub Copilot is accessed through VS Code integration while other models use the OpenAI API. 
These are fundamentally different interfaces that may provide different context. This confound is not discussed." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not addressed. LeetCode problems have been available online for years, predating the training of all evaluated models. No temporal analysis is performed." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Not addressed. No discussion of whether providing function signatures and code comments provides information that wouldn't be available in real-world usage scenarios." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "Not addressed. LeetCode problems within the same topic category may share structural similarities, and many problems have publicly available solutions that could be in training data." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, temporal splits, or decontamination analysis." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "GPT-4-omni is the top-performing LLM with pass@1 of 43.36% and pass@10 of 61.95%", 374 "evidence": "Table I shows pass@1 and pass@10 for 22 models, with GPT-4-omni ranking highest among LLMs (Canonical Solutions excluded).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "LLM-generated solutions (o1-mini) achieve a mean runtime percentile rank of 63%, faster than 63% of human submissions", 379 "evidence": "Section III.B and Figure 3 report this finding based on LeetCode's runtime percentile rankings.", 380 "supported": "weak" 381 }, 382 { 383 "claim": "The paper evaluates 18 LLMs on 204 LeetCode problems", 384 "evidence": "Contribution 1 states '18 LLMs on 204 Leetcode problems,' but Table I lists 22 entries and the dataset analysis section claims approximately 2,100 problems.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "Temperature parameter impacts correctness and efficiency of generated code", 389 "evidence": "The methodology describes testing 5 temperature settings (0.2–1.0), but no per-temperature results are presented in the results section. Only best-across-temperature pass@k values are reported.", 390 "supported": "unsupported" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Internal inconsistencies in dataset and model counts", 396 "detail": "The contributions section claims '18 LLMs on 204 Leetcode problems,' but Table I lists 22 model entries (including Canonical Solutions) and the dataset analysis section says 'approximately 2,100 LeetCode problems.' The abstract lists 5 authors while the paper header lists 8." 
397 }, 398 { 399 "flag": "Unrelated self-citation padding in references", 400 "detail": "The majority of numbered references [1]-[17] are unrelated to the paper's topic: PIQI quantized inference [1], data augmentation [3], fMRI analysis [4], few-shot learning in GANs [5], phishing attacks [6], robot obstacle avoidance [7], diabetes risk prediction [8], parameter-efficient fine-tuning [9], meta learning [10][11], explainability [12], vision-language pre-training [13], DNN frameworks [14], load balancing [15][16], multi-label learning [17]. Many are by the paper's own authors. Only [2] (Coignion et al.) is directly relevant." 401 }, 402 { 403 "flag": "Complete absence of contamination analysis", 404 "detail": "LeetCode problems and their solutions are among the most extensively indexed programming content online. Every model tested was almost certainly trained on LeetCode data. This fundamental validity threat is never mentioned." 405 }, 406 { 407 "flag": "Unfilled template placeholder left in paper", 408 "detail": "The paper contains the text 'Identify applicable funding agency here. If none, delete this.' — an IEEE template instruction that was not addressed before submission." 409 }, 410 { 411 "flag": "No statistical rigor in comparisons", 412 "detail": "22 models are ranked by point estimates with no error bars, confidence intervals, or significance tests. Claims of 'top performers' and 'lower performers' are based solely on comparing raw numbers." 413 }, 414 { 415 "flag": "Best temperature selected on test set", 416 "detail": "The methodology states 'The best pass@k value observed across all temperatures was then considered the final pass@k metric.' This optimizes hyperparameters on the test set with no validation split, inflating reported performance." 
417 }, 418 { 419 "flag": "Claimed analysis not presented", 420 "detail": "Several analyses described in the methodology (error analysis, per-temperature effects, LeetCode dataset assessment) are never presented in the results section." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "A performance study of LLM-generated code on Leetcode", 426 "authors": ["Tristan Coignion", "Clément Quinton", "Romain Rouvoy"], 427 "year": 2024, 428 "relevance": "Directly relevant benchmark evaluation of LLM code generation on the same LeetCode platform." 429 }, 430 { 431 "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)", 432 "authors": ["Mark Chen"], 433 "year": 2021, 434 "relevance": "Defines the pass@k metric and unbiased estimator used as the primary evaluation methodology in this paper." 435 } 436 ] 437 }