scan-v5.json (25675B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "An evaluation of LLM code generation capabilities through graded exercises", 6 "authors": [ 7 "Álvaro Barbero Jiménez" 8 ], 9 "year": 2024, 10 "venue": "arXiv.org", 11 "arxiv_id": "2410.16292", 12 "doi": "10.48550/arXiv.2410.16292" 13 }, 14 "checklist": { 15 "claims_and_evidence": { 16 "abstract_claims_supported": { 17 "applies": true, 18 "answer": false, 19 "justification": "The abstract states 'positive correlation with task difficulty' but Figure 7 shows success decreases as difficulty increases — harder katas (kyu 1–2) are entirely unsolvable. This is an inversion of the actual finding and constitutes a misleading abstract claim.", 20 "source": "haiku" 21 }, 22 "causal_claims_justified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper attributes 37.4% of LLM behavior to 'solution leakage' based on a surrogate model with correlated features (days since publication, user completions), but this is correlational, not causal — the paper cannot rule out that older katas are simply better-specified or more canonical.", 26 "source": "haiku" 27 }, 28 "generalization_bounded": { 29 "applies": true, 30 "answer": false, 31 "justification": "The conclusions state 'current evaluations in the literature of the performance of state-of-the-art LLMs are, quite probably, overestimates' — extrapolating from a single model (GPT-4o-mini) on one platform (Codewars) to all LLM evaluations without bounding this claim.", 32 "source": "haiku" 33 }, 34 "alternative_explanations_discussed": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 4.4 explicitly lists two competing hypotheses for the age effect (criteria drift vs. leakage), acknowledging that 'the obtained results show no evidence toward one hypothesis or the other.'", 38 "source": "haiku" 39 }, 40 "proxy_outcome_distinction": { 41 "applies": true, 42 "answer": true, 43 "justification": "Section 2 explicitly limits scope to 'solving programming exercises' as one specific aspect of software development, and the Limitations section itemizes what aspects of real development are not covered.", 44 "source": "haiku" 45 } 46 }, 47 "limitations_and_scope": { 48 "limitations_section_present": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 5 is a dedicated Limitations section covering four distinct limitations: coverage, reproducibility, human-in-the-loop, and explainability accuracy.", 52 "source": "haiku" 53 }, 54 "threats_to_validity_specific": { 55 "applies": true, 56 "answer": true, 57 "justification": "The explainability limitation quantifies proxy model accuracy at 74.88% and explains why estimates of leakage impact 'contain some inherent noise.' The reproducibility limitation identifies the specific cause (Codewars ToS) rather than generic disclaimers.", 58 "source": "haiku" 59 }, 60 "scope_boundaries_stated": { 61 "applies": true, 62 "answer": true, 63 "justification": "Section 2 states explicitly 'we will limit ourselves here to measuring one specific aspect of software development' and contrasts with broader benchmarks like SWE-bench.", 64 "source": "haiku" 65 } 66 }, 67 "conflicts_of_interest": { 68 "funding_disclosed": { 69 "applies": true, 70 "answer": false, 71 "justification": "The Acknowledgements section thanks colleagues and Codewars contributors but does not mention any funding source, grant, or institutional support.", 72 "source": "haiku" 73 }, 74 "affiliations_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "The author's affiliations (Instituto de Ingeniería del Conocimiento and Universidad Autónoma de Madrid) are disclosed on the title page.", 78 "source": "haiku" 79 }, 80 "funder_independent_of_outcome": { 81 "applies": true, 82 "answer": false, 83 "justification": "No funding is disclosed, so independence of any funder cannot be verified; the paper cannot be confirmed as clearly unfunded independent work.", 84 "source": "haiku" 85 }, 86 "financial_interests_declared": { 87 "applies": true, 88 "answer": false, 89 "justification": "There is no competing interests statement or declaration of financial interests anywhere in the paper.", 90 "source": "haiku" 91 } 92 }, 93 "scope_and_framing": { 94 "key_terms_defined": { 95 "applies": true, 96 "answer": true, 97 "justification": "The kyu difficulty system is precisely defined (kyu 8 easiest to kyu 1 hardest), the performance metric (Equation 1) is derived and explained, and 'kata' is contextualized with examples.", 98 "source": "haiku" 99 }, 100 "intended_contribution_clear": { 101 "applies": true, 102 "answer": true, 103 "justification": "The abstract and introduction clearly state the paper runs a new evaluation of GPT-4o-mini and aims to be 'the first result that quantifies the impact of solutions leakage on the performance of an LLM for coding.'", 104 "source": "haiku" 105 }, 106 "engagement_with_prior_work": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 1.4 provides a substantive review of prior benchmarks (HumanEval, APPS, BigCodeBench, SWE-bench, MBPP, ODEX), explaining how each measures coding capability and where this work differs.", 110 "source": "haiku" 111 } 112 } 113 }, 114 "type_checklist": { 115 "empirical": { 116 "artifacts": { 117 "code_released": { 118 "applies": true, 119 "answer": false, 120 "justification": "The Limitations section explicitly states: 'we have decided to release neither the code of the developed botnet, nor the database of solutions proposed by the LLM.'", 121 "source": "haiku" 122 }, 123 "data_released": { 124 "applies": true, 125 "answer": false, 126 "justification": "The scraped Codewars dataset and LLM solutions database are explicitly not released (same sentence in Limitations). The source platform (Codewars) is public but the curated evaluation dataset itself is not.", 127 "source": "haiku" 128 }, 129 "environment_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper mentions Python and Selenium but provides no requirements.txt, version pins, or dependency specifications adequate for reproduction.", 133 "source": "haiku" 134 }, 135 "reproduction_instructions": { 136 "applies": true, 137 "answer": false, 138 "justification": "No reproduction instructions are provided; the code is explicitly withheld, making reproduction impossible.", 139 "source": "haiku" 140 } 141 }, 142 "statistical_methodology": { 143 "confidence_intervals_or_error_bars": { 144 "applies": true, 145 "answer": false, 146 "justification": "Figures 7–13 present point estimates (percentages, scores) with no confidence intervals or error bars reported anywhere.", 147 "source": "haiku" 148 }, 149 "significance_tests": { 150 "applies": true, 151 "answer": false, 152 "justification": "Comparative claims are made (LLM vs. human at each difficulty level, language differences) without any statistical significance tests.", 153 "source": "haiku" 154 }, 155 "effect_sizes_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Figure 13 reports normalized aggregate SHAP contributions as percentages (46.6% difficulty, 37.4% leakage, 16% language), which function as quantified effect sizes for each factor.", 159 "source": "haiku" 160 }, 161 "sample_size_justified": { 162 "applies": true, 163 "answer": false, 164 "justification": "The paper uses all available katas (14,346) without any formal justification or discussion of whether this is sufficient for the subgroup analyses performed.", 165 "source": "haiku" 166 }, 167 "variance_reported": { 168 "applies": true, 169 "answer": false, 170 "justification": "Success rates are reported as point estimates; no standard deviation, variance, or confidence bounds are provided for any metric, including the cross-validation accuracy.", 171 "source": "haiku" 172 } 173 }, 174 "evaluation_design": { 175 "baselines_included": { 176 "applies": true, 177 "answer": true, 178 "justification": "Human completion rates from Codewars are used as a baseline throughout (Figure 7, Figure 11), providing direct LLM vs. human comparison at each difficulty level.", 179 "source": "haiku" 180 }, 181 "baselines_contemporary": { 182 "applies": true, 183 "answer": true, 184 "justification": "Human performance data is derived from actual current Codewars user statistics, not historical or synthetic data.", 185 "source": "haiku" 186 }, 187 "ablation_study": { 188 "applies": false, 189 "answer": false, 190 "justification": "The paper evaluates a single pre-trained closed-source model with no modifiable components; ablation is not applicable.", 191 "source": "haiku" 192 }, 193 "multiple_metrics": { 194 "applies": true, 195 "answer": true, 196 "justification": "The paper uses raw success rate per rank, the weighted Codewars score (Equation 1), and SHAP-based feature attribution as distinct evaluation perspectives.", 197 "source": "haiku" 198 }, 199 "human_evaluation": { 200 "applies": true, 201 "answer": false, 202 "justification": "Human completion rate statistics from Codewars are used as comparison baselines, not as human evaluation of the LLM's output quality.", 203 "source": "haiku" 204 }, 205 "held_out_test_set": { 206 "applies": false, 207 "answer": false, 208 "justification": "The paper evaluates a pre-trained LLM on all available katas; there is no training task requiring a train/test split for the main evaluation.", 209 "source": "haiku" 210 }, 211 "per_category_breakdown": { 212 "applies": true, 213 "answer": true, 214 "justification": "Results are broken down by programming language (Figure 8) and by difficulty level (kyu 1–8) throughout the results section.", 215 "source": "haiku" 216 }, 217 "failure_cases_discussed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Section 4.2 notes COBOL shows 'catastrophic results' with 'many cases of syntax errors,' and Timeout failures are separately identified in Figure 8.", 221 "source": "haiku" 222 }, 223 "negative_results_reported": { 224 "applies": true, 225 "answer": true, 226 "justification": "The paper explicitly reports complete failure on rank 1–2 katas, near-zero performance on COBOL, and that GPT-4o-mini scores below human performance on recent katas.", 227 "source": "haiku" 228 } 229 }, 230 "setup_transparency": { 231 "model_versions_specified": { 232 "applies": true, 233 "answer": true, 234 "justification": "The exact model version 'GPT-4o-mini (version 2024-07-18)' is specified in Section 3.", 235 "source": "haiku" 236 }, 237 "prompts_provided": { 238 "applies": true, 239 "answer": true, 240 "justification": "Figure 4 provides the complete system prompt and user prompt templates with all placeholder variables identified.", 241 "source": "haiku" 242 }, 243 "hyperparameters_reported": { 244 "applies": true, 245 "answer": false, 246 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported anywhere in the paper.", 247 "source": "haiku" 248 }, 249 "scaffolding_described": { 250 "applies": true, 251 "answer": true, 252 "justification": "Figure 3 describes the three-bot network (downloader, attempter, verifier) with their interactions, and Selenium-based injection is explained.", 253 "source": "haiku" 254 }, 255 "data_preprocessing_documented": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section 3 documents that 57 katas (0.4%) were discarded due to regex/Selenium injection incompatibilities, and log-scaling of certain surrogate model features is noted.", 259 "source": "haiku" 260 } 261 }, 262 "data_integrity": { 263 "raw_data_available": { 264 "applies": true, 265 "answer": false, 266 "justification": "The Limitations section explicitly states the database of LLM solutions is not released; raw evaluation data is unavailable for independent verification.", 267 "source": "haiku" 268 }, 269 "data_collection_described": { 270 "applies": true, 271 "answer": true, 272 "justification": "Section 3 describes scraping Codewars via Selenium bots, the bot architecture (Figure 3), and how solutions were submitted and verified.", 273 "source": "haiku" 274 }, 275 "recruitment_methods_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants were recruited; this is a fully automated benchmark evaluation.", 279 "source": "haiku" 280 }, 281 "data_pipeline_documented": { 282 "applies": true, 283 "answer": true, 284 "justification": "Figure 3 documents the full pipeline from kata download through LLM solution generation to success/failure verification on Codewars.", 285 "source": "haiku" 286 } 287 }, 288 "contamination": { 289 "training_cutoff_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "The model version (2024-07-18) is given but GPT-4o-mini's training data cutoff date is not stated in the paper.", 293 "source": "haiku" 294 }, 295 "train_test_overlap_discussed": { 296 "applies": true, 297 "answer": true, 298 "justification": "Train/test overlap is the central research question; Section 4.4 and the SHAP analysis (Section 4.5) quantify leakage as approximately 37.4% of model behavior.", 299 "source": "haiku" 300 }, 301 "benchmark_contamination_addressed": { 302 "applies": true, 303 "answer": true, 304 "justification": "The paper directly addresses contamination via the publication-date analysis (Figure 11) and notes '38,500 public repositories containing kata solutions' as the contamination mechanism.", 305 "source": "haiku" 306 } 307 }, 308 "human_studies": { 309 "pre_registered": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants; NA.", 313 "source": "haiku" 314 }, 315 "irb_or_ethics_approval": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants; NA.", 319 "source": "haiku" 320 }, 321 "demographics_reported": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants; NA.", 325 "source": "haiku" 326 }, 327 "inclusion_exclusion_criteria": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants; NA.", 331 "source": "haiku" 332 }, 333 "randomization_described": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants; NA.", 337 "source": "haiku" 338 }, 339 "blinding_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants; NA.", 343 "source": "haiku" 344 }, 345 "attrition_reported": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants; NA.", 349 "source": "haiku" 350 } 351 }, 352 "cost_and_practicality": { 353 "inference_cost_reported": { 354 "applies": true, 355 "answer": false, 356 "justification": "The paper mentions GPT-4o-mini is '30 times cheaper' than GPT-4o qualitatively but does not report actual inference cost for the 14,346-kata evaluation.", 357 "source": "haiku" 358 }, 359 "compute_budget_stated": { 360 "applies": true, 361 "answer": false, 362 "justification": "No total computational budget, API cost, or runtime figures are reported.", 363 "source": "haiku" 364 } 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "GPT-4o-mini outperforms humans on easy tasks (kyu 8–7) but fails completely on the hardest tasks (kyu 1–2).", 371 "evidence": "Figure 7 shows LLM success rate exceeds human rate at kyu 8–7, converges at kyu 4–6, and reaches 0% at kyu 1–2.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "LLM performance varies dramatically by programming language, with popular languages (Python, JavaScript) scoring highest and legacy languages (COBOL, Fortran) showing catastrophic failure.", 376 "evidence": "Figure 9 shows Python scores 36.1 and COBOL near zero on the Codewars metric; Figure 10 correlates performance with GitHub push counts.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Approximately 37.4% of LLM success variation is attributable to solution leakage into training data.", 381 "evidence": "SHAP analysis on a surrogate model (74.88% CV accuracy) attributes 37.4% of influence to leakage-proxying features (days since publication, user completions).", 382 "supported": "weak" 383 }, 384 { 385 "claim": "Newer kata publication dates correlate with lower LLM performance, suggesting leakage from solutions in public repositories.", 386 "evidence": "Figure 11 shows LLM Codewars score declines for katas published more recently, with the effect stronger for LLMs than for humans.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Current LLM code generation evaluations likely overestimate true capabilities due to benchmark contamination.", 391 "evidence": "The age-effect and leakage-attribution findings suggest older evaluation datasets with publicly available solutions inflate scores; this aligns with reports from other benchmarks cited in the paper.", 392 "supported": "moderate" 393 } 394 ], 395 "methodology_tags": [ 396 "benchmark-eval", 397 "observational" 398 ], 399 "key_findings": "GPT-4o-mini was evaluated on 14,346 Codewars coding challenges across 8 programming languages. The model exceeds human performance on easy tasks but fails entirely on the hardest (kyu 1–2). A surrogate model SHAP analysis estimates that ~37.4% of LLM success variation is explained by features proxying training data leakage (kata age, completion count), ~46.6% by task difficulty, and ~16% by language. Legacy languages (COBOL, Fortran) show near-zero performance while popular languages (Python, JavaScript) score highest. The findings suggest current benchmark evaluations likely overestimate LLM coding capability due to solution contamination in training data.", 400 "red_flags": [ 401 { 402 "flag": "Abstract direction error", 403 "detail": "The abstract claims 'positive correlation with task difficulty' but Figure 7 shows success decreases with difficulty — the abstract states the opposite of what the data shows." 404 }, 405 { 406 "flag": "Single model, no generalization", 407 "detail": "Only GPT-4o-mini is evaluated; broad claims about 'current LLMs' overestimating capabilities are unsupported by multi-model evidence." 408 }, 409 { 410 "flag": "Code and data not released", 411 "detail": "The botnet code and solution database are explicitly withheld due to Codewars ToS, making the evaluation entirely irreproducible." 412 }, 413 { 414 "flag": "Leakage attribution is correlational", 415 "detail": "37.4% leakage attribution comes from a surrogate model with correlated features; causation is not established and alternative explanations (newer katas harder in uncontrolled ways) are acknowledged but not ruled out." 416 }, 417 { 418 "flag": "No confidence intervals", 419 "detail": "All main results are point estimates with no uncertainty quantification; the surrogate model's CV accuracy (74.88%) is also reported without variance." 420 }, 421 { 422 "flag": "Training cutoff not stated", 423 "detail": "GPT-4o-mini's training data cutoff is not stated, making it impossible to verify which katas were in-distribution at training time." 424 }, 425 { 426 "flag": "LLM temperature unreported", 427 "detail": "No LLM hyperparameters (temperature, top-p, etc.) are reported, affecting reproducibility of even the overall pass rates." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 433 "relevance": "Canonical code generation benchmark; introduces pass@k metric used throughout the paper" 434 }, 435 { 436 "title": "Measuring Coding Challenge Competence With APPS", 437 "relevance": "Prior work on difficulty-stratified code evaluation benchmarks; paper builds directly on its methodology" 438 }, 439 { 440 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 441 "relevance": "Most realistic code generation benchmark; discussed as the standard the authors compare their approach against" 442 }, 443 { 444 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 445 "relevance": "Contemporary benchmark showing LLMs at 60% on realistic tasks; contextualizes this paper's difficulty findings" 446 }, 447 { 448 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)", 449 "relevance": "Shows augmented test suites reduce apparent LLM performance; supports contamination/overestimation thesis" 450 }, 451 { 452 "title": "Program Synthesis with Large Language Models (MBPP)", 453 "relevance": "Introduces multi-turn human correction loop for code generation; cited as limitation of single-shot evaluation" 454 }, 455 { 456 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 457 "relevance": "Human preference leaderboard used to justify model selection (GPT-4o-mini #2 for coding)" 458 }, 459 { 460 "title": "Open LLM Leaderboard v2", 461 "relevance": "Cited as evidence that public benchmarks require constant updates to avoid contamination, supporting the paper's main thesis" 462 } 463 ], 464 "engagement_factors": { 465 "practical_relevance": { 466 "score": 2, 467 "justification": "Practitioners evaluating LLMs for coding assistance will find the language-specific breakdowns and leakage quantification directly actionable." 468 }, 469 "surprise_contrarian": { 470 "score": 2, 471 "justification": "Quantifying that 37.4% of apparent LLM capability may be memorization/leakage challenges conventional benchmark validity assumptions." 472 }, 473 "fear_safety": { 474 "score": 0, 475 "justification": "No AI safety or risk concerns; findings are about evaluation methodology, not harmful capabilities." 476 }, 477 "drama_conflict": { 478 "score": 1, 479 "justification": "Mild controversy in suggesting current LLM evaluations are systematically inflated, but framed academically rather than confrontationally." 480 }, 481 "demo_ability": { 482 "score": 1, 483 "justification": "Code and data are not released, but readers could qualitatively replicate the age-effect analysis with access to a similar platform." 484 }, 485 "brand_recognition": { 486 "score": 1, 487 "justification": "Evaluates OpenAI's GPT-4o-mini (recognizable product) but the author and institution (IIC/UAM) are not widely known." 488 } 489 }, 490 "hn_data": { 491 "threads": [ 492 { 493 "hn_id": "39274918", 494 "title": "Better Call GPT: Comparing large language models against lawyers [pdf]", 495 "points": 389, 496 "comments": 264, 497 "url": "https://news.ycombinator.com/item?id=39274918", 498 "created_at": "2024-02-06T15:04:39Z" 499 }, 500 { 501 "hn_id": "42021222", 502 "title": "Fast and Accurate Deep Reconfigurable Spiking Inference Accelerator Architecture", 503 "points": 2, 504 "comments": 0, 505 "url": "https://news.ycombinator.com/item?id=42021222", 506 "created_at": "2024-11-01T20:28:32Z" 507 }, 508 { 509 "hn_id": "41926182", 510 "title": "We discovered a way to measure LLM bias while building a recruitment tool", 511 "points": 1, 512 "comments": 1, 513 "url": "https://news.ycombinator.com/item?id=41926182", 514 "created_at": "2024-10-23T15:41:33Z" 515 }, 516 { 517 "hn_id": "42576715", 518 "title": "Reinforcement Learning for Multi-Intersection Traffic Signal Control", 519 "points": 1, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=42576715", 522 "created_at": "2025-01-02T17:51:07Z" 523 }, 524 { 525 "hn_id": "38177348", 526 "title": "CleanCoNLL: A Nearly Noise-Free Named Entity Recognition Dataset", 527 "points": 1, 528 "comments": 0, 529 "url": "https://news.ycombinator.com/item?id=38177348", 530 "created_at": "2023-11-07T14:47:31Z" 531 } 532 ], 533 "top_points": 389, 534 "total_points": 394, 535 "total_comments": 265 536 } 537 }