scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27156B)
      1 {
      2   "paper": {
      3     "title": "LeetCodeDataset: A Temporal Dataset for Robust Evaluation and Efficient Training of Code LLMs",
      4     "authors": [
      5       "Yunhui Xia",
      6       "Wei Shen",
      7       "Yan Wang",
      8       "Jason Klein Liu",
      9       "Huifeng Sun",
     10       "Siyue Wu",
     11       "Jian Hu",
     12       "Xiaolong Xu"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2504.14655",
     17     "doi": "10.48550/arXiv.2504.14655"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "LeetCodeDataset curates 2,869 Python LeetCode problems with temporal splits (pre/post July 2024) for contamination-free evaluation and SFT training. Reasoning models (DeepSeek-R1 at 65.2% pass@1) substantially outperform non-reasoning models (GPT-4o at 35.6%) on competitive programming tasks. SFT with only 2.6K model-generated solutions matches or exceeds performance of models trained on 110K samples on HumanEval and MBPP, though this advantage does not extend to harder benchmarks.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "GitHub repository link provided in abstract: https://github.com/newfacade/LeetCodeDataset"
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "HuggingFace dataset link provided in abstract: https://huggingface.co/datasets/newfacade/LeetCodeDataset"
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, or environment specification is provided in the paper. Only model names and training hyperparameters are listed."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided. The paper mentions an 'evaluation toolkit' but does not include instructions for reproducing the evaluation or SFT results."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 2, 3, and 4 report only point estimates (e.g., '65.23%', '79.9%') with no confidence intervals, error bars, or ± notation."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims reasoning models 'significantly outperform' non-reasoning ones but uses no statistical significance tests. All comparisons are based on raw number differences."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results are presented with baseline context enabling effect size assessment, e.g., '79.9% vs. 55.5% on HumanEval' for model-generated vs human-written training data (Section 4.2), and per-model pass rates across difficulty levels (Table 2)."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The test set contains 256 problems and SFT uses 2.6K samples. No justification is given for why these sizes are adequate for the claims made, and no power analysis is discussed."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measures are reported across runs. All results appear to be single-run point estimates."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 3 evaluates six models (GPT-4o, Claude 3.7 Sonnet, DeepSeek-V3, DeepSeek-R1, Qwen2.5-Max, QwQ-Plus). Section 4 compares SFT against five other training datasets."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Evaluated models include GPT-4o-0806, Claude 3.7 Sonnet, DeepSeek-R1, and QwQ-Plus — all released in 2024-2025 and representing the state of the art at time of writing."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Table 4 compares model-generated vs. human-written responses for SFT (79.9% vs 55.5% on HumanEval), isolating the effect of response source while holding the training problems, base model, and hyperparameters constant."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Only pass@1 is reported throughout the paper. Although results are reported on four benchmarks (HumanEval, MBPP, LiveCodeBench, LeetCodeDataset), pass@1 is the only metric used."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "All evaluation is automated via test case pass/fail. No human evaluation of solution quality, readability, or efficiency is included."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "A strict temporal split is used: problems released after July 1, 2024 form the 256-problem test set, while earlier problems are allocated for training (Section 2.1)."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 2 provides per-difficulty breakdowns (Easy/Medium/Hard). Table 3 provides per-topic-tag breakdowns across 24 algorithm categories for all six models."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No qualitative error analysis or specific failure examples are shown. The paper notes models struggle with certain tags (e.g., GPT-4o at 7.7% on Binary Search) but does not analyze why or show failure cases."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 4.2 explicitly reports that the 2.6K-trained SFT model 'underperformed on hard benchmarks,' noting that 'small-scale SFT primarily develops basic programming skills.'"
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims are supported: (1) reasoning models outperform non-reasoning — Table 2 shows DeepSeek-R1 65.23% vs GPT-4o 35.55%. (2) SFT with 2.6K samples matches 110K — Table 4 shows 79.9% vs 77.4% on HumanEval."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The key causal claim (model-generated data improves SFT over human-written) uses controlled comparison: same base model (Qwen2.5-Coder-7B), same hyperparameters, same 2.6K training problems, differing only in response source (Section 4)."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Robust Evaluation and Efficient Training of Code LLMs' but the dataset covers only Python problems from LeetCode. Section 6 acknowledges coverage gaps, but the title and abstract do not restrict their claims to Python or competitive programming."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations are discussed for the key findings. For example, the model-generated vs human-written gap could be due to response formatting rather than content quality, and the reasoning model advantage could partly reflect longer inference compute, but neither is considered."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures pass@1 on LeetCode problems and frames results in terms of coding performance on those specific tasks. Claims match the granularity of measurements without inflating to broader constructs like 'programming ability.'"
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "GPT-4o-0806 includes a specific snapshot date, but Claude 3.7 Sonnet, DeepSeek-V3, DeepSeek-R1, Qwen2.5-Max, and QwQ-Plus are all marketing names without snapshot dates or API version identifiers."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Appendix provides prompts for input generation (Figures 4-5) used in dataset construction, but the actual evaluation prompts used to test models are not provided. The paper states queries are 'consistent with LiveCodeBench's construction' without reproducing them."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3: 'temperature=0.2 and top_p=0.95' for evaluation. Section 4.1: 'learning rate of 1e-5, warmup ratio of 0.1, cosine learning rate scheduling, batch size of 32, 3 epochs.' Section 2.1: 'T=1.0' for data generation."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. Models are evaluated directly on coding problems without tool use or multi-step workflows."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 2.1 documents the full pipeline: metadata acquisition via GraphQL API (3,505 problems → 3,115 Python), canonical solution verification, entry point identification via pattern matching, LLM-based input generation, sandboxed test case generation (final 2,869 problems)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 'Limitations' presents three specific limitations: false positive risks, complexity analysis gap, and coverage gaps for multi-entry-point problems."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The limitations are specific to this dataset: (1) imbalanced test case distribution increasing false positive risk, (2) inability to assess time/space complexity, (3) exclusion of multi-function problems."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 6 explicitly states that multi-entry-point problems are excluded, complexity analysis is out of scope, and Section 2.1 notes the dataset covers only Python submissions on LeetCode."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The full dataset is available on HuggingFace (https://huggingface.co/datasets/newfacade/LeetCodeDataset) for independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 2.1 provides a detailed description of data collection: GraphQL API for metadata, GitHub repositories for canonical solutions verified on LeetCode, LLM-generated test inputs, sandboxed execution for test case outputs."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data is sourced from the LeetCode platform, a standard public programming problem repository."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Section 2.1 documents the full pipeline with counts at each stage: 3,505 total problems → 3,115 Python-supported → 2,869 with generated outputs. Each transformation step is explained."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosure, acknowledgments section, or grant information appears anywhere in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Authors list only personal email addresses (163.com, 126.com, gmail.com, ieee.org). No institutional or corporate affiliations are disclosed, which is unusual for an academic paper."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence cannot be assessed."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement or financial interests declaration appears in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Only GPT-4o-0806's release date (August 2024) is mentioned as a proxy. Actual training data cutoff dates are not stated for any model."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 3 explicitly discusses contamination via temporal analysis: 'The minimal temporal overlap between GPT-4o-0806's release date and our test problem release window strongly suggests authentic model capability measurements.' Monthly accuracy curves (Figure 3) are analyzed for contamination signals."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "The core design addresses contamination: the temporal split at July 2024 is intended to keep test problems later than model training data, although training cutoffs are not stated for the evaluated models. Section 3 analyzes monthly accuracy curves for contamination artifacts and concludes fluctuations are due to problem difficulty, not contamination."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference cost, latency, or tokens consumed are reported for any of the six evaluated models or for the data generation pipeline."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No GPU hours, total API spend, or training time is reported for either the SFT experiments or the dataset construction pipeline."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No multiple seeds or seed sensitivity analysis is mentioned. All results appear to be from single runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of runs producing the evaluation results is never stated."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Section 4.1 lists fixed hyperparameters but provides no justification for their selection and no mention of any hyperparameter search."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper states 'consistent hyperparameters' were used but does not justify why these specific values were chosen or whether they were tuned."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed, so multiple comparison correction is structurally inapplicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors created the benchmark and evaluated models on it without discussing whether their benchmark design might favor certain model types or problem-solving approaches."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Reasoning models (DeepSeek-R1, QwQ-Plus) use substantially more inference compute via chain-of-thought than non-reasoning models, but performance is compared without normalizing for or discussing this compute difference."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether LeetCode competitive programming problems actually measure the coding or reasoning abilities the paper claims to evaluate. The validity of LeetCode as a proxy for real-world coding capability is assumed."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is used. Models are evaluated directly on coding problems."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "Temporal leakage is the central concern addressed by the dataset design. Problems released after July 1, 2024 form the test set, with the intent that they post-date model training data collection. Section 3 analyzes monthly accuracy for temporal patterns."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup could leak answer information through the problem description, constraints, or test case structure."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether LeetCode problems share structural similarities with problems in model training data from other sources (e.g., similar problems on Codeforces, USACO, or GitHub)."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "Temporal splits serve as a concrete leakage prevention method. Additionally, Section 3 and Figure 3 analyze monthly pass@1 curves as a detection method — a sharp drop in accuracy on problems released after a model's training cutoff would indicate that earlier problems were contaminated."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Reasoning models significantly outperform non-reasoning counterparts on competitive coding tasks.",
    374       "evidence": "Table 2: DeepSeek-R1 achieves 65.23% pass@1 vs GPT-4o at 35.55% and DeepSeek-V3 at 35.55%. QwQ-Plus achieves 56.25%. The gap is especially large on Hard problems (DeepSeek-R1 41.86% vs GPT-4o 10.47%).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "SFT with 2.6K model-generated LeetCode samples achieves performance comparable to models trained on 110K samples.",
    379       "evidence": "Table 4: LeetCodeDataset model-generated achieves 79.9% HumanEval and 77.5% MBPP, matching or exceeding Magicoder Evol-Instruct-110K (77.4% HumanEval, 74.1% MBPP). However, on harder benchmarks (LiveCodeBench 15.4% vs 16.9% for OpenThoughts; LeetCodeDataset 12.5% vs 16.4%), the advantage disappears.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Model-generated training data substantially outperforms human-written responses for code SFT.",
    384       "evidence": "Table 4: Same 2.6K LeetCode problems, model-generated responses achieve 79.9% vs 55.5% on HumanEval and 77.5% vs 53.4% on MBPP. Controlled comparison using same base model and hyperparameters.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "The temporal split enables contamination-free evaluation.",
    389       "evidence": "Section 3 and Figure 3: Monthly accuracy curves show no systematic decline for post-July-2024 problems. The minimal temporal overlap between GPT-4o-0806's August 2024 release and the test window is noted. However, this is argued indirectly rather than via direct contamination detection.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The dataset covers over 90% of all Python problems on LeetCode with 100+ test cases per problem.",
    394       "evidence": "Section 2.1: 2,869 out of 3,115 Python-supported problems (92%) were successfully processed. 'An average of over 100 inputs per problem' generated via LLM prompting.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars or multiple runs",
    401       "detail": "All evaluation results (Tables 2-4) are single-run point estimates with no uncertainty quantification. Given the stochastic nature of LLM inference (temperature=0.2) and SFT training, results could vary meaningfully across runs."
    402     },
    403     {
    404       "flag": "Statistical significance claimed without tests",
    405       "detail": "The paper uses the word 'significantly' to describe the reasoning model advantage, but no statistical tests are performed. The difference could be within noise for some comparisons."
    406     },
    407     {
    408       "flag": "Missing author affiliations",
    409       "detail": "All eight authors list only personal email addresses (163.com, 126.com, gmail.com, ieee.org) with no institutional or corporate affiliations. This makes it impossible to assess potential conflicts of interest."
    410     },
    411     {
    412       "flag": "Selective framing of data efficiency claim",
    413       "detail": "The abstract's claim that 2.6K samples match 110K is true only for HumanEval and MBPP (easy benchmarks). On harder benchmarks (LiveCodeBench, LeetCodeDataset test set), the 2.6K model underperforms, which the paper acknowledges in Section 4.2 but the abstract omits."
    414     },
    415     {
    416       "flag": "Uncontrolled compute confound in reasoning model comparison",
    417       "detail": "Reasoning models (DeepSeek-R1, QwQ-Plus) use chain-of-thought with substantially more inference compute than non-reasoning models. The performance comparison does not account for this difference, making it unclear how much of the advantage comes from reasoning vs. raw compute."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Evaluating large language models trained on code",
    423       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    424       "year": 2021,
    425       "arxiv_id": "2107.03374",
    426       "relevance": "Introduces HumanEval, a foundational benchmark for code generation evaluation used as a baseline in this paper."
    427     },
    428     {
    429       "title": "Program synthesis with large language models",
    430       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    431       "year": 2021,
    432       "arxiv_id": "2108.07732",
    433       "relevance": "Introduces MBPP benchmark for code generation evaluation, used as a baseline in this paper."
    434     },
    435     {
    436       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    437       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    438       "year": 2024,
    439       "arxiv_id": "2403.07974",
    440       "relevance": "Key prior work on contamination-free code evaluation using temporal updates; this paper explicitly extends and competes with LiveCodeBench."
    441     },
    442     {
    443       "title": "Competition-level code generation with AlphaCode",
    444       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    445       "year": 2022,
    446       "relevance": "Introduces CodeContests dataset for competitive programming evaluation, a key prior benchmark in this space."
    447     },
    448     {
    449       "title": "Measuring coding challenge competence with APPS",
    450       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    451       "year": 2021,
    452       "relevance": "Introduces APPS benchmark for code generation from competitive programming, a foundational dataset this work improves upon."
    453     },
    454     {
    455       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    456       "authors": ["DeepSeek-AI"],
    457       "year": 2025,
    458       "arxiv_id": "2501.12948",
    459       "relevance": "Key reasoning model evaluated in this paper, demonstrating the reasoning model advantage in competitive coding."
    460     },
    461     {
    462       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    463       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    464       "year": 2023,
    465       "arxiv_id": "2305.01210",
    466       "relevance": "Introduces EvalPlus for more rigorous code generation evaluation, directly relevant to benchmark quality and false positive concerns."
    467     },
    468     {
    469       "title": "Magicoder: Empowering code generation with OSS-Instruct",
    470       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    471       "year": 2024,
    472       "arxiv_id": "2312.02120",
    473       "relevance": "Provides two SFT training datasets (Evol-Instruct-110K, OSS-Instruct-75K) used as baselines in the data efficiency comparison."
    474     },
    475     {
    476       "title": "Qwen2.5-Coder technical report",
    477       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    478       "year": 2024,
    479       "relevance": "Describes the base model (Qwen2.5-Coder-7B) used for SFT experiments and the model (Qwen2.5-Coder-32B-Instruct) used for training data generation."
    480     },
    481     {
    482       "title": "GPT-4o system card",
    483       "authors": ["OpenAI"],
    484       "year": 2024,
    485       "arxiv_id": "2410.21276",
    486       "relevance": "Technical report for GPT-4o, one of the key models evaluated in this benchmark study."
    487     },
    488     {
    489       "title": "CODEELO: Benchmarking competition-level code generation of LLMs with human-comparable Elo ratings",
    490       "authors": ["Shanghaoran Quan", "Jiaxi Yang", "Bowen Yu"],
    491       "year": 2025,
    492       "arxiv_id": "2501.01257",
    493       "relevance": "Contemporary code benchmark using Elo ratings from CodeForces submissions, complementary approach to LeetCode-based evaluation."
    494     },
    495     {
    496       "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation",
    497       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    498       "year": 2022,
    499       "arxiv_id": "2208.08227",
    500       "relevance": "Extends HumanEval/MBPP to 18 programming languages, relevant to multilingual code generation evaluation."
    501     }
    502   ]
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs