ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28549B)


      1 {
      2   "paper": {
      3     "title": "An evaluation of LLM code generation capabilities through graded exercises",
      4     "authors": ["Álvaro Barbero Jiménez"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2410.16292",
      8     "doi": "10.48550/arXiv.2410.16292"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Evaluating GPT-4o-mini on ~14,346 Codewars coding challenges across 8 languages and 8 difficulty levels, the study finds the LLM outperforms humans on easy tasks but fails completely on the hardest exercises (kyu 1-2). A surrogate SHAP analysis attributes 46.6% of performance to task difficulty, 37.4% to solution leakage, and 16% to programming language. Performance drops dramatically for unpopular and legacy languages (especially COBOL), and newer challenges are harder for the LLM, suggesting current benchmarks overestimate LLM coding capability due to data contamination.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "Section 5 (Limitations) explicitly states: 'we have decided to release neither the code of the developed botnet, nor the database of solutions proposed by the LLM' due to Codewars terms of use concerns."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The solutions database and evaluation results are explicitly withheld (Section 5). While Codewars katas are publicly available, the paper's collected dataset of LLM solutions and outcomes is not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions Python, Selenium, and the OpenAI API but provides no requirements.txt, library versions, or environment specification beyond naming the tools."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The code is explicitly withheld and no alternative reproduction path is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (pass rates, Codewars scores). No confidence intervals or error bars appear in any figure or table."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes numerous comparative claims (LLM vs humans, across languages, across kyu levels) but no statistical significance tests are performed. Differences are stated from visual inspection of charts."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The SHAP analysis decomposes performance into difficulty (46.6%), leakage (37.4%), and language (16%) with full context. Per-language Codewars scores are reported (e.g., Python 36.1) against reference levels (87 for solving all kyu ≥4). Magnitudes are interpretable throughout."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The study uses ~14,346 katas but never justifies this sample size or discusses whether it is adequate for the claims made, particularly for rare kata-language combinations."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Each kata appears to be attempted once with a single LLM call. No variance across runs, seeds, or repeated attempts is reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Human performance from Codewars community statistics serves as a baseline throughout. Figure 7 directly compares LLM vs human success rates by difficulty level, and Figure 11 compares Codewars scores over time."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The human baseline comes from current Codewars community statistics at the time of the study, making it contemporary and representative of human developer performance."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The system under evaluation is a single LLM (GPT-4o-mini) with a fixed prompt. There are no components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses both raw pass rates (percentage of katas solved, Figures 7-8) and a weighted Codewars score metric (Equation 1, Figure 9) that emphasizes harder tasks. Timeout rates are also reported separately."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Evaluation is entirely automated via Codewars unit tests (pass/fail). No human evaluation of generated code quality, readability, or correctness beyond test passing is performed."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Codewars uses a two-tier testing system: public unit tests visible to the solver, plus hidden tests used for final verification (Figure 1). The LLM only sees public tests; final evaluation uses hidden tests, functioning as a held-out set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Extensive breakdowns by difficulty rank (8 levels, Figures 7-8), programming language (8 languages, Figures 8-9), and publication date (Figure 11) are provided."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.2 discusses COBOL's 'catastrophic results' with syntax errors, the complete failure on kyu 1-2 exercises, and timeout failures for JavaScript/Python on harder problems (yellow bars in Figure 8)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that the LLM fails completely on kyu 1-2 exercises (0% success), performs poorly on legacy languages, and that performance degrades significantly on newer challenges. These are central findings, not hidden."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims about correlation with task difficulty, language popularity, publication date, and the 46.6%/37.4%/16% decomposition are all directly supported by results in Sections 4.1-4.5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract states '37.4% seems to be related to leakage' and uses 'attributed to' language. The surrogate model (Section 4.5) provides correlational evidence only — SHAP values on a 74.88%-accurate linear SVM do not establish causation. The study design is observational with no causal identification strategy."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests only GPT-4o-mini on Codewars katas but generalizes broadly. The conclusions state 'current evaluations in the literature of the performance of state-of-the-art LLMs are, quite probably, overestimates of their real skill' — a claim about all LLMs based on one model on one platform."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 4.4 explicitly considers two competing hypotheses for the age effect: changed difficulty criteria vs. solution leakage. Section 4.5 addresses multicollinearity and acknowledges that features encode overlapping factors (e.g., easy katas have more completions AND more solutions available)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 5 (Limitations) explicitly distinguishes between what was measured (solving programming exercises) and what broader claims this supports: 'actual work as a software developer involves not just solving programming challenges, but also writing documentation, finding and reporting bugs...' The gap between proxy and outcome is clearly acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3 specifies 'GPT-4o-mini (version 2024-07-18)' — an exact version with snapshot date."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Figure 4 provides the full system prompt and user prompt with all text. The placeholders (%LANGUAGE%, %DESCRIPTION%, %SOLUTION_TEMPLATE%, %UNIT_TESTS%) are filled from publicly available Codewars kata content, making the prompts fully reconstructable."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No LLM inference hyperparameters are reported — temperature, top-p, max tokens, and other API parameters that affect output quality are not mentioned anywhere."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The LLM receives a single prompt and produces a single response. The Selenium bot infrastructure is evaluation tooling, not LLM scaffolding."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3 documents the data collection from Codewars, the 8 language selection rationale, that 57 katas (0.4%) were discarded due to regex/Selenium incompatibilities, and the resulting distribution across languages and difficulty levels (Figure 5)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 is titled 'Limitations' and contains substantive discussion of four specific limitations spanning multiple paragraphs."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 discusses specific threats: coverage limited to programming exercises (not full dev work), reproducibility constraints from Codewars ToS, absence of human-in-the-loop refinement, and surrogate model accuracy of only 74.88% introducing noise in the explanatory analysis."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5 explicitly states the evaluation 'only covers a small spectrum of this wide range of tasks' and enumerates what is excluded: documentation, bug finding, system design, requirements gathering, team collaboration, and roadmap definition."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw data (kata solutions, pass/fail results) is explicitly withheld per Section 5: 'we have decided to release neither the code of the developed botnet, nor the database of solutions proposed by the LLM.'"
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3 and Figure 3 describe the Selenium bot system, interaction with the Codewars API and OpenAI API, the account creation/deletion procedure, and responsible crawling practices."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The data source is Codewars, a public coding challenge platform. The kata selection is the full set available for 8 languages."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Figure 3 diagrams the full pipeline: kata downloader bot → solution generator bot (via OpenAI API) → kata attempter bot → result verification. Exclusion criteria (57 regex-incompatible katas) are documented with counts."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed. The acknowledgments section thanks the Codewars community and IIC colleagues but mentions no grants, sponsors, or funding agencies."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliation with Instituto de Ingeniería del Conocimiento and Universidad Autónoma de Madrid is clearly stated on the title page. The author evaluates a third-party product (OpenAI's GPT-4o-mini), not their own."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funding is disclosed. The work appears to be unfunded academic research from a solo university-affiliated researcher."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The model version is specified (GPT-4o-mini 2024-07-18) but the training data cutoff date is not stated. Without this, temporal contamination cannot be precisely assessed."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "This is a central theme. Section 4.4 discusses how ~38,500 public GitHub repos contain Codewars solutions, and the SHAP analysis (Section 4.5) attributes 37.4% of performance to leakage-related features. The paper also cites similar concerns with APPS and SWE-bench."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Contamination is the paper's core contribution. Section 4.4 discusses solution availability in public repos, Section 4.5 quantifies contamination impact via surrogate model analysis, and Section 6 proposes temporal evaluation as mitigation."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The evaluation is fully automated LLM-on-benchmark."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates an LLM on coding challenges."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. Human performance data comes from aggregate Codewars community statistics."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper notes GPT-4o-mini is 'approximately 30 times cheaper' than GPT-4o (Section 3) but never reports the actual API cost for the ~14,346 kata attempts."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is reported — neither total API spend, wall-clock time for the full evaluation, nor hardware used for the bot infrastructure."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Each kata appears to be attempted a single time. No multiple seeds or repeated sampling is performed, despite LLM outputs being stochastic."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper never explicitly states how many attempts per kata were made. It appears to be a single attempt but this is not stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "For the LLM evaluation, API parameters (temperature, sampling) are not reported, let alone searched. The surrogate model hyperparameters are reported (Table 1) but the main LLM evaluation hyperparameters are absent."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "For the surrogate model, 5-fold cross-validation is used to select the best configuration (Section 4.5, Table 1). The linear SVM is chosen over XGBoost because it produces equivalent accuracy (74.88% vs 74.91%) with greater interpretability."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed in the paper, so correction for multiple comparisons is structurally inapplicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The author builds the evaluation framework and surrogate model but does not discuss how the study's design choices (feature selection, model choice, clustering of SHAP features into difficulty/leakage/language groups) could bias the results."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper tests only GPT-4o-mini, choosing it over GPT-4o for cost (30x cheaper). No performance-compute comparison is made, despite this being directly relevant to interpreting the results."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "This is a central theme. The paper extensively discusses whether coding benchmarks (Codewars, HumanEval, APPS, SWE-bench) actually measure LLM coding capability vs. memorization. Section 1.4 reviews construct validity issues and Section 6 argues current evaluations overestimate real skill."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. The LLM receives a single prompt and produces a single response; no agentic workflow or tool use."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Section 4.4 analyzes how kata publication date correlates with LLM success, finding that older katas, whose solutions have had more time to spread, are easier for the LLM. The SHAP analysis includes 'days since publication' as a leakage-related feature."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The paper does not discuss whether the evaluation setup (public unit tests provided in the prompt) leaks information about solutions that wouldn't be available in a real coding scenario."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether Codewars katas share structural similarities, whether solutions to similar katas in training data could transfer, or whether kata families create non-independence in the evaluation."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The surrogate model with SHAP analysis (Section 4.5) provides a concrete, quantitative method for estimating leakage impact. Features encoding leakage likelihood (completions, publication date) are systematically analyzed via non-interventional SHAP with 100K sampling points."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "The LLM outperforms humans on easy tasks (kyu 7-8) but fails completely on the hardest exercises (kyu 1-2).",
    365       "evidence": "Figure 7 shows LLM success rates exceeding human rates at kyu 8 and 7, similar performance at kyu 6-4, and 0% LLM success at kyu 1-2 (Section 4.1).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "LLM performance varies substantially across programming languages, with legacy languages (COBOL, Fortran) performing dramatically worse.",
    370       "evidence": "Figures 8-9 show Python leading (score 36.1) and COBOL near zero. Section 4.2 notes COBOL shows 'catastrophic results' with frequent syntax errors.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Approximately 37.4% of the LLM's performance can be attributed to leakage of challenge solutions into the training set.",
    375       "evidence": "SHAP analysis of a linear SVM surrogate model (Section 4.5, Figure 13) groups features into difficulty (46.6%), leakage (37.4%), and language (16%). The surrogate model has 74.88% cross-validation accuracy.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The publication date of challenges negatively correlates with LLM success, with newer challenges being harder to solve.",
    380       "evidence": "Figure 11 shows LLM Codewars score declining for more recently published katas, both overall and when filtering by difficulty level.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Current evaluation methodologies are overestimating the actual skill of LLMs for generating functional code.",
    385       "evidence": "Based on the 37.4% leakage attribution and analysis of temporal effects (Sections 4.4-4.5, 6). However, this is based on one model (GPT-4o-mini) on one platform (Codewars), and the surrogate model has limited accuracy (74.88%).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Language popularity (measured by GitHub pushes) correlates with LLM coding performance.",
    390       "evidence": "Figure 10 shows a clear visual correlation between GitHub language popularity (pushes) and LLM Codewars score across 8 languages.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Single model generalized to all LLMs",
    397       "detail": "Only GPT-4o-mini is tested, but conclusions are framed in terms of 'Large Language Models' generally. The abstract and conclusions make claims about 'current evaluation methodologies' and LLM skill broadly."
    398     },
    399     {
    400       "flag": "No statistical tests for comparative claims",
    401       "detail": "Multiple comparative claims (LLM vs human, across languages, across difficulty levels) are made without any significance tests, confidence intervals, or error bars."
    402     },
    403     {
    404       "flag": "Single attempt per challenge",
    405       "detail": "Each kata appears to be attempted once despite LLM outputs being stochastic. The pass@k metric (which the paper reviews in Section 1.4) was designed to address this, yet is not used in the paper's own evaluation."
    406     },
    407     {
    408       "flag": "Code and data deliberately withheld",
    409       "detail": "Both the evaluation code and results database are withheld (Section 5), making independent verification impossible. While the Codewars ToS concern is acknowledged, alternative approaches (e.g., releasing anonymized results) are not explored."
    410     },
    411     {
    412       "flag": "Surrogate model accuracy limits conclusions",
    413       "detail": "The 46.6%/37.4%/16% decomposition comes from a linear SVM surrogate with only 74.88% cross-validation accuracy. Because the surrogate misclassifies roughly 25% of outcomes, the decomposition is approximate, yet it is presented prominently in the abstract without this caveat."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating Large Language Models Trained on Code",
    419       "authors": ["Mark Chen"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduced HumanEval and Codex, foundational LLM code evaluation benchmark; noted public repos contain solutions to other datasets."
    423     },
    424     {
    425       "title": "Measuring Coding Challenge Competence With APPS",
    426       "authors": ["Dan Hendrycks"],
    427       "year": 2021,
    428       "arxiv_id": "2105.09938",
    429       "relevance": "Code challenge benchmark with difficulty levels; found LLMs fail on competition-level problems, directly relevant to difficulty-performance analysis."
    430     },
    431     {
    432       "title": "Program Synthesis with Large Language Models",
    433       "authors": ["Jacob Austin"],
    434       "year": 2021,
    435       "arxiv_id": "2108.07732",
    436       "relevance": "Introduced MBPP benchmark for basic Python programs; found LLMs struggle with multi-subproblem tasks and training data modifications."
    437     },
    438     {
    439       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    440       "authors": ["Carlos E. Jimenez"],
    441       "year": 2024,
    442       "arxiv_id": "2310.06770",
    443       "relevance": "Real-world software engineering benchmark; discussed for contamination risk since tasks come from public GitHub repos."
    444     },
    445     {
    446       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    447       "authors": ["Jiawei Liu"],
    448       "year": 2023,
    449       "arxiv_id": "2305.01210",
    450       "relevance": "EvalPlus benchmark showing augmented tests reveal LLM code quality is lower than originally reported; directly supports the overestimation thesis."
    451     },
    452     {
    453       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    454       "authors": ["Terry Yue Zhuo"],
    455       "year": 2024,
    456       "arxiv_id": "2406.15877",
    457       "relevance": "Comprehensive code generation benchmark showing that even the best models achieve only 60% on well-structured tasks and 50% on natural language instructions."
    458     },
    459     {
    460       "title": "GPT-4 Technical Report",
    461       "authors": ["OpenAI"],
    462       "year": 2024,
    463       "arxiv_id": "2303.08774",
    464       "relevance": "Technical report for a major LLM family; relevant as the parent model family of the GPT-4o-mini model evaluated in this paper."
    465     },
    466     {
    467       "title": "Open LLM Leaderboard v2",
    468       "authors": ["Clémentine Fourrier"],
    469       "year": 2024,
    470       "relevance": "Updated general LLM leaderboard addressing dataset contamination issues; directly relevant to the paper's contamination argument."
    471     },
    472     {
    473       "title": "StarCoder: may the source be with you!",
    474       "authors": ["Raymond Li"],
    475       "year": 2023,
    476       "arxiv_id": "2305.06161",
    477       "relevance": "Code-specialized LLM trained on open source code; relevant to understanding code generation training data composition."
    478     },
    479     {
    480       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    481       "authors": ["Wei-Lin Chiang"],
    482       "year": 2024,
    483       "arxiv_id": "2403.04132",
    484       "relevance": "Human preference-based LLM evaluation platform used to justify model selection in this study; alternative evaluation methodology."
    485     },
    486     {
    487       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    488       "authors": ["Daya Guo"],
    489       "year": 2024,
    490       "arxiv_id": "2401.14196",
    491       "relevance": "Code-specialized LLM trained from scratch on code; relevant to understanding code LLM training and capabilities."
    492     },
    493     {
    494       "title": "CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Benchmarking on HumanEval-X",
    495       "authors": ["Qinkai Zheng"],
    496       "year": 2023,
    497       "relevance": "Multilingual code generation benchmark (HumanEval-X); found models perform better on popular languages, consistent with this paper's findings."
    498     }
    499   ]
    500 }

Impressum · Datenschutz