scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33264B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An evaluation of LLM code generation capabilities through graded exercises",
      6     "authors": [
      7       "Álvaro Barbero Jiménez"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2410.16292",
     12     "doi": "10.48550/arXiv.2410.16292"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract claims about correlation with task difficulty, language popularity, publication date, and the 46.6%/37.4%/16% decomposition are all directly supported by results in Sections 4.1-4.5.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The abstract states '37.4% seems to be related to leakage' and uses 'attributed to' language. The surrogate model (Section 4.5) provides correlational evidence only — SHAP values on a 74.88%-accurate linear SVM do not establish causation. The study design is observational with no causal identification strategy.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper tests only GPT-4o-mini on Codewars katas but generalizes broadly. The conclusions state 'current evaluations in the literature of the performance of state-of-the-art LLMs are, quite probably, overestimates of their real skill' — a claim about all LLMs based on one model on one platform.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 4.4 explicitly considers two competing hypotheses for the age effect: changed difficulty criteria vs. solution leakage. Section 4.5 addresses multicollinearity and acknowledges that features encode overlapping factors (e.g., easy katas have more completions AND more solutions available).",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Section 5 (Limitations) explicitly distinguishes between what was measured (solving programming exercises) and what broader claims this supports: 'actual work as a software developer involves not just solving programming challenges, but also writing documentation, finding and reporting bugs...' The gap between proxy and outcome is clearly acknowledged.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 5 is titled 'Limitations' and contains substantive discussion of four specific limitations spanning multiple paragraphs.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 5 discusses specific threats: coverage limited to programming exercises (not full dev work), reproducibility constraints from Codewars ToS, absence of human-in-the-loop refinement, and surrogate model accuracy of only 74.88% introducing noise in the explanatory analysis.",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 5 explicitly states the evaluation 'only covers a small spectrum of this wide range of tasks' and enumerates what is excluded: documentation, bug finding, system design, requirements gathering, team collaboration, and roadmap definition.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source is disclosed. The acknowledgments section thanks the Codewars community and IIC colleagues but mentions no grants, sponsors, or funding agencies.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation with Instituto de Ingeniería del Conocimiento and Universidad Autónoma de Madrid is clearly stated on the title page. The author evaluates a third-party product (OpenAI's GPT-4o-mini), not their own.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No funding is disclosed. The work appears to be unfunded academic research from a solo university-affiliated researcher.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms are defined: 'kata' and kyu difficulty levels (1–8) are explained with concrete examples, the Codewars scoring formula is stated explicitly (Equation 1), and 'code generation' is scoped to single-function synthesis from natural language descriptions.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The contribution is explicitly framed: evaluating GPT-4o-mini on 14,346 Codewars challenges across 8 languages and providing 'the first result that quantifies the impact of solutions leakage on the performance of an LLM for coding.'",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 1.4 provides a comprehensive review of code evaluation benchmarks (HumanEval, APPS, MBPP, ODEX, BigCodeBench, SWE-bench, etc.), showing how this work extends and differs from each prior effort.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "Section 5 (Limitations) explicitly states: 'we have decided to release neither the code of the developed botnet, nor the database of solutions proposed by the LLM' due to Codewars terms of use concerns.",
    121           "source": "opus"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The solutions database and evaluation results are explicitly withheld (Section 5). While Codewars katas are publicly available, the paper's collected dataset of LLM solutions and outcomes is not released.",
    127           "source": "opus"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "The paper mentions Python, Selenium, and the OpenAI API but provides no requirements.txt, library versions, or environment specification beyond naming the tools.",
    133           "source": "opus"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No reproduction instructions are provided. The code is explicitly withheld and no alternative reproduction path is described.",
    139           "source": "opus"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "All results are reported as point estimates (pass rates, Codewars scores). No confidence intervals or error bars appear in any figure or table.",
    147           "source": "opus"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The paper makes numerous comparative claims (LLM vs humans, across languages, across kyu levels) but no statistical significance tests are performed. Differences are stated from visual inspection of charts.",
    153           "source": "opus"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "The SHAP analysis decomposes performance into difficulty (46.6%), leakage (37.4%), and language (16%) with full context. Per-language Codewars scores are reported (e.g., Python 36.1) against reference levels (87 for solving all kyu ≥4). Magnitudes are interpretable throughout.",
    159           "source": "opus"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The study uses 14,346 katas but never justifies this sample size or discusses whether it is adequate for the claims made, particularly for rare kata-language combinations.",
    165           "source": "opus"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Each kata appears to be attempted once with a single LLM call. No variance across runs, seeds, or repeated attempts is reported.",
    171           "source": "opus"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Human performance from Codewars community statistics serves as a baseline throughout. Figure 7 directly compares LLM vs human success rates by difficulty level, and Figure 11 compares Codewars scores over time.",
    179           "source": "opus"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "The human baseline comes from current Codewars community statistics at the time of the study, making it contemporary and representative of human developer performance.",
    185           "source": "opus"
    186         },
    187         "ablation_study": {
    188           "applies": false,
    189           "answer": false,
    190           "justification": "The system under evaluation is a single LLM (GPT-4o-mini) with a fixed prompt. There are no components to ablate.",
    191           "source": "opus"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "The paper uses both raw pass rates (percentage of katas solved, Figures 7-8) and a weighted Codewars score metric (Equation 1, Figure 9) that emphasizes harder tasks. Timeout rates are also reported separately.",
    197           "source": "opus"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "Evaluation is entirely automated via Codewars unit tests (pass/fail). No human evaluation of generated code quality, readability, or correctness beyond test passing is performed.",
    203           "source": "opus"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Codewars uses a two-tier testing system: public unit tests visible to the solver, plus hidden tests used for final verification (Figure 1). The LLM only sees public tests; final evaluation uses hidden tests, functioning as a held-out set.",
    209           "source": "opus"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Extensive breakdowns by difficulty rank (8 levels, Figures 7-8), programming language (8 languages, Figures 8-9), and publication date (Figure 11) are provided.",
    215           "source": "opus"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Section 4.2 discusses COBOL's 'catastrophic results' with syntax errors, the complete failure on kyu 1-2 exercises, and timeout failures for JavaScript/Python on harder problems (yellow bars in Figure 8).",
    221           "source": "opus"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The paper reports that the LLM fails completely on kyu 1-2 exercises (0% success), performs poorly on legacy languages, and that performance degrades significantly on newer challenges. These are central findings, not hidden.",
    227           "source": "opus"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Section 3 specifies 'GPT-4o-mini (version 2024-07-18)' — an exact version with snapshot date.",
    235           "source": "opus"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Figure 4 provides the full system prompt and user prompt with all text. The placeholders (%LANGUAGE%, %DESCRIPTION%, %SOLUTION_TEMPLATE%, %UNIT_TESTS%) are filled from publicly available Codewars kata content, making the prompts fully reconstructable.",
    241           "source": "opus"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No LLM inference hyperparameters are reported — temperature, top-p, max tokens, and other API parameters that affect output quality are not mentioned anywhere.",
    247           "source": "opus"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used. The LLM receives a single prompt and produces a single response. The Selenium bot infrastructure is evaluation tooling, not LLM scaffolding.",
    253           "source": "opus"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 3 documents the data collection from Codewars, the rationale for selecting the 8 languages, the discarding of 57 katas (0.4%) due to regex/Selenium incompatibilities, and the resulting distribution across languages and difficulty levels (Figure 5).",
    259           "source": "opus"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Raw data (kata solutions, pass/fail results) is explicitly withheld per Section 5: 'we have decided to release neither the code of the developed botnet, nor the database of solutions proposed by the LLM.'",
    267           "source": "opus"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 3 and Figure 3 describe the Selenium bot system, interaction with the Codewars API and OpenAI API, the account creation/deletion procedure, and responsible crawling practices.",
    273           "source": "opus"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants. The data source is Codewars, a public coding challenge platform. The kata selection is the full set available for 8 languages.",
    279           "source": "opus"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Figure 3 diagrams the full pipeline: kata downloader bot → solution generator bot (via OpenAI API) → kata attempter bot → result verification. Exclusion criteria (57 regex-incompatible katas) are documented with counts.",
    285           "source": "opus"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "The model version is specified (GPT-4o-mini 2024-07-18) but the training data cutoff date is not stated. Without this, temporal contamination cannot be precisely assessed.",
    293           "source": "opus"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "This is a central theme. Section 4.4 discusses how ~38,500 public GitHub repos contain Codewars solutions, and the SHAP analysis (Section 4.5) attributes 37.4% of performance to leakage-related features. The paper also cites similar concerns with APPS and SWE-bench.",
    299           "source": "opus"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "Contamination is the paper's core contribution. Section 4.4 discusses solution availability in public repos, Section 4.5 quantifies contamination impact via surrogate model analysis, and Section 6 proposes temporal evaluation as mitigation.",
    305           "source": "opus"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants in this study. The evaluation is fully automated LLM-on-benchmark.",
    313           "source": "opus"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants. The study evaluates an LLM on coding challenges.",
    319           "source": "opus"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants. Human performance data comes from aggregate Codewars community statistics.",
    325           "source": "opus"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "opus"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "opus"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "opus"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "opus"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "The paper notes GPT-4o-mini is 'approximately 30 times cheaper' than GPT-4o (Section 3) but never reports the actual API cost for the ~14,346 kata attempts.",
    357           "source": "opus"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No total computational budget is reported — neither total API spend, wall-clock time for the full evaluation, nor hardware used for the bot infrastructure.",
    363           "source": "opus"
    364         }
    365       },
    366       "experimental_rigor": {
    367         "seed_sensitivity_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "Each kata appears to be attempted a single time. No multiple seeds or repeated sampling is performed, despite LLM outputs being stochastic.",
    371           "source": "opus"
    372         },
    373         "number_of_runs_stated": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "The paper never explicitly states how many attempts per kata were made. It appears to be a single attempt but this is not stated.",
    377           "source": "opus"
    378         },
    379         "hyperparameter_search_budget": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "For the LLM evaluation, API parameters (temperature, sampling) are not reported, let alone searched. The surrogate model hyperparameters are reported (Table 1) but the main LLM evaluation hyperparameters are absent.",
    383           "source": "opus"
    384         },
    385         "best_config_selection_justified": {
    386           "applies": true,
    387           "answer": true,
    388           "justification": "For the surrogate model, 5-fold cross-validation is used to select the best configuration (Section 4.5, Table 1). The linear SVM is chosen over XGBoost because it produces equivalent accuracy (74.88% vs 74.91%) with greater interpretability.",
    389           "source": "opus"
    390         },
    391         "multiple_comparison_correction": {
    392           "applies": false,
    393           "answer": false,
    394           "justification": "No statistical significance tests are performed in the paper, so correction for multiple comparisons is structurally inapplicable.",
    395           "source": "opus"
    396         },
    397         "self_comparison_bias_addressed": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "The author builds the evaluation framework and surrogate model but does not discuss how the resulting design choices (feature selection, model choice, clustering of SHAP features into difficulty/leakage/language groups) could bias the results.",
    401           "source": "opus"
    402         },
    403         "compute_budget_vs_performance": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The paper tests only GPT-4o-mini, choosing it over GPT-4o for cost (30x cheaper). No performance-compute comparison is made, despite this being directly relevant to interpreting the results.",
    407           "source": "opus"
    408         },
    409         "benchmark_construct_validity": {
    410           "applies": true,
    411           "answer": true,
    412           "justification": "This is a central theme. The paper extensively discusses whether coding benchmarks (Codewars, HumanEval, APPS, SWE-bench) actually measure LLM coding capability vs. memorization. Section 1.4 reviews construct validity issues and Section 6 argues current evaluations overestimate real skill.",
    413           "source": "opus"
    414         },
    415         "scaffold_confound_addressed": {
    416           "applies": false,
    417           "answer": false,
    418           "justification": "No scaffolding is involved. The LLM receives a single prompt and produces a single response; no agentic workflow or tool use.",
    419           "source": "opus"
    420         }
    421       },
    422       "data_leakage": {
    423         "temporal_leakage_addressed": {
    424           "applies": true,
    425           "answer": true,
    426           "justification": "Section 4.4 analyzes how kata publication date correlates with LLM success, finding that older katas, for which solutions have had more time to spread, are easier for the LLM. The SHAP analysis includes 'days since publication' as a leakage-related feature.",
    427           "source": "opus"
    428         },
    429         "feature_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "The paper does not discuss whether the evaluation setup (public unit tests provided in the prompt) leaks information about solutions that wouldn't be available in a real coding scenario.",
    433           "source": "opus"
    434         },
    435         "non_independence_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether Codewars katas share structural similarities, whether solutions to similar katas in training data could transfer, or whether kata families create non-independence in the evaluation.",
    439           "source": "opus"
    440         },
    441         "leakage_detection_method": {
    442           "applies": true,
    443           "answer": true,
    444           "justification": "The surrogate model with SHAP analysis (Section 4.5) provides a concrete, quantitative method for estimating leakage impact. Features encoding leakage likelihood (completions, publication date) are systematically analyzed via non-interventional SHAP with 100K sampling points.",
    445           "source": "opus"
    446         }
    447       }
    448     }
    449   },
    450   "claims": [
    451     {
    452       "claim": "GPT-4o-mini outperforms humans on easy katas (rank 8–7) but fails completely on rank 1–2 katas.",
    453       "evidence": "Figure 7 shows LLM success rate exceeds human completion rate for ranks 8–7, while rank 1–2 shows zero LLM successes versus non-zero human completion across the entire evaluation corpus.",
    454       "supported": "strong"
    455     },
    456     {
    457       "claim": "~37.4% of LLM coding performance on Codewars is attributable to solution leakage into training data.",
    458       "evidence": "SHAP analysis on a linear SVM surrogate model groups features by hypothesized cause; leakage-proxy features (days since publication, total completions) account for 37.4% of total absolute SHAP value. The model itself has only 74.88% accuracy and the attribution is indirect.",
    459       "supported": "weak"
    460     },
    461     {
    462       "claim": "LLM performance varies significantly by programming language, with legacy languages (COBOL, Fortran) showing dramatically degraded results.",
    463       "evidence": "Figure 8 shows near-zero completion rates for COBOL across all difficulty levels with syntax errors noted on visual inspection; Figure 9 shows large Codewars score gaps correlated with GitHub code volume (Figure 10).",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Newer Codewars katas are harder for the LLM than for humans, consistent with solution leakage benefiting older challenges disproportionately.",
    468       "evidence": "Figure 11 shows LLM performance declining more steeply with kata recency than human performance, with the LLM falling below human levels on basic/intermediate tasks from recent years. Two alternative explanations are noted but not distinguished.",
    469       "supported": "moderate"
    470     },
    471     {
    472       "claim": "Current LLM code generation benchmark evaluations likely overestimate actual model skill due to data contamination.",
    473       "evidence": "The surrogate model attributes ~37.4% of performance to leakage proxies, and 38,500 public GitHub repos contain Codewars solutions. However, this rests on indirect measurement from a single model with no controlled intervention.",
    474       "supported": "moderate"
    475     }
    476   ],
    477   "methodology_tags": [
    478     "benchmark-eval",
    479     "observational"
    480   ],
    481   "key_findings": "GPT-4o-mini was evaluated on 14,346 Codewars coding challenges across 8 programming languages using a bot network that submits solutions against the platform's hidden test sets. The model outperforms humans on easy exercises but completely fails on the hardest challenges (rank 1–2), and performance degrades catastrophically for low-resource and legacy languages (especially COBOL). A surrogate model with SHAP analysis estimates that ~46.6% of performance is explained by task difficulty, ~37.4% by solution leakage into training data (proxied by kata age and user completion counts), and ~16% by programming language popularity — suggesting current code generation benchmarks substantially overestimate true LLM capability.",
    482   "red_flags": [
    483     {
    484       "flag": "Single model, broad conclusions",
    485       "detail": "Only GPT-4o-mini is evaluated, but the conclusions generalize to all models, stating that 'current evaluations in the literature of the performance of state-of-the-art LLMs are, quite probably, overestimates of their real skill.' One model on one platform cannot support this claim."
    486     },
    487     {
    488       "flag": "Code and data withheld — irreproducible by design",
    489       "detail": "The paper explicitly states code and evaluation data are not released due to Codewars ToS concerns. Results cannot be independently verified or reproduced by any third party."
    490     },
    491     {
    492       "flag": "Causal attribution via proxy features in surrogate model",
    493       "detail": "The 37.4% leakage figure is derived from grouping correlated features in an SVM surrogate model by hypothesized cause, not from any direct measurement of training data overlap or controlled experiment. The surrogate model itself achieves only 74.88% accuracy."
    494     },
    495     {
    496       "flag": "No statistical tests or uncertainty quantification",
    497       "detail": "All comparative claims (LLM vs. humans, between languages, age trends) are supported only by visual inspection of bar charts and trend lines with no significance tests, confidence intervals, or variance estimates."
    498     },
    499     {
    500       "flag": "Training cutoff not disclosed",
    501       "detail": "Despite the central focus on training data contamination, the training cutoff date for GPT-4o-mini is never stated, making it impossible to reason precisely about which katas were available during training."
    502     }
    503   ],
    504   "cited_papers": [
    505     {
    506       "title": "Evaluating Large Language Models Trained on Code (HumanEval / Codex)",
    507       "relevance": "Foundational benchmark paper for LLM code generation; introduces pass@k metric and discusses contamination from public repos; directly compared throughout"
    508     },
    509     {
    510       "title": "Measuring Coding Challenge Competence With APPS",
    511       "relevance": "Prior benchmark evaluating LLMs on programming challenges scraped from competitive platforms; directly compared methodology and results including difficulty scaling"
    512     },
    513     {
    514       "title": "Program Synthesis with Large Language Models (MBPP)",
    515       "relevance": "Prior LLM code evaluation showing similar difficulty scaling and multi-turn improvement; compared single-turn vs. multi-turn performance"
    516     },
    517     {
    518       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    519       "relevance": "Most realistic code benchmark; cited as state-of-the-art comparison and as another benchmark with known contamination from public GitHub repos"
    520     },
    521     {
    522       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    523       "relevance": "Contemporary benchmark showing 60% ceiling even for best models; compared scope and findings on library-usage difficulty"
    524     },
    525     {
    526       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    527       "relevance": "Used to justify model selection (GPT-4o-mini ranked #2 for coding) and as an example of crowd-sourced evaluation methodology"
    528     },
    529     {
    530       "title": "Execution-Based Evaluation for Open-Domain Code Generation (ODEX)",
    531       "relevance": "Prior benchmark showing library popularity correlates with LLM performance; directly corroborates the language-popularity finding"
    532     },
    533     {
    534       "title": "Open LLM Leaderboard v2",
    535       "relevance": "Example of contamination forcing benchmark replacement in the general LLM evaluation space; motivates the paper's code-specific contamination concern"
    536     }
    537   ],
    538   "engagement_factors": {
    539     "practical_relevance": {
    540       "score": 3,
    541       "justification": "Directly actionable for practitioners who rely on benchmark scores to select LLMs for coding tasks — quantifies how much contamination may inflate reported capability."
    542     },
    543     "surprise_contrarian": {
    544       "score": 2,
    545       "justification": "Provides a novel quantification of leakage (~37.4%) that challenges trust in existing code benchmarks, though contamination as a concern is broadly acknowledged."
    546     },
    547     "fear_safety": {
    548       "score": 0,
    549       "justification": "No AI safety or harm concerns raised; purely an evaluation methodology paper."
    550     },
    551     "drama_conflict": {
    552       "score": 1,
    553       "justification": "Implies state-of-the-art benchmark results are inflated, which is mildly provocative, but the framing is measured and academic."
    554     },
    555     "demo_ability": {
    556       "score": 1,
    557       "justification": "The exact study cannot be replicated (code withheld, TOS issues), but readers can manually test LLMs on Codewars challenges to spot-check the finding."
    558     },
    559     "brand_recognition": {
    560       "score": 1,
    561       "justification": "GPT-4o-mini (OpenAI) is a recognizable product, but the paper is from a small Spanish research institute with no major lab affiliation."
    562     }
    563   },
    564   "hn_data": {
    565     "threads": [
    566       {
    567         "hn_id": "39274918",
    568         "title": "Better Call GPT: Comparing large language models against lawyers [pdf]",
    569         "points": 389,
    570         "comments": 264,
    571         "url": "https://news.ycombinator.com/item?id=39274918",
    572         "created_at": "2024-02-06T15:04:39Z"
    573       },
    574       {
    575         "hn_id": "42021222",
    576         "title": "Fast and Accurate Deep Reconfigurable Spiking Inference Accelerator Architecture",
    577         "points": 2,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=42021222",
    580         "created_at": "2024-11-01T20:28:32Z"
    581       },
    582       {
    583         "hn_id": "41926182",
    584         "title": "We discovered a way to measure LLM bias while building a recruitment tool",
    585         "points": 1,
    586         "comments": 1,
    587         "url": "https://news.ycombinator.com/item?id=41926182",
    588         "created_at": "2024-10-23T15:41:33Z"
    589       },
    590       {
    591         "hn_id": "42576715",
    592         "title": "Reinforcement Learning for Multi-Intersection Traffic Signal Control",
    593         "points": 1,
    594         "comments": 0,
    595         "url": "https://news.ycombinator.com/item?id=42576715",
    596         "created_at": "2025-01-02T17:51:07Z"
    597       },
    598       {
    599         "hn_id": "38177348",
    600         "title": "CleanCoNLL: A Nearly Noise-Free Named Entity Recognition Dataset",
    601         "points": 1,
    602         "comments": 0,
    603         "url": "https://news.ycombinator.com/item?id=38177348",
    604         "created_at": "2023-11-07T14:47:31Z"
    605       }
    606     ],
    607     "top_points": 389,
    608     "total_points": 394,
    609     "total_comments": 265
    610   }
    611 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs