ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24742B)


      1 {
      2   "paper": {
      3     "title": "CodeArena: A Collective Evaluation Platform for LLM Code Generation",
      4     "authors": [
      5       "Mingzhe Du",
      6       "Anh Tuan Luu",
      7       "Bin Ji",
      8       "Xiaobao Wu",
      9       "Dong Huang",
     10       "Terry Yue Zhuo",
     11       "Qian Liu",
     12       "See-Kiong Ng"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2503.01295"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides a website (https://codearena.online), a PyPI package (codearena), and the platform is described as built on the open-source DMOJ framework. The platform itself is accessible and a Python library is published at https://pypi.org/project/codearena/."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper states 'all solutions and test cases are publicly accessible' (Section 1, Ethics Statement). A trial account is provided (Account: Test / Password: Haveatry!) for browsing data (Appendix D). The platform stores and makes accessible all submitted solutions and test cases."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Appendix C mentions '8 NVIDIA A100 GPUs' and 'bfloat16' format for open-source models, but no requirements.txt, Dockerfile, or detailed dependency specifications are provided for reproducing the platform or experiments."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The paper describes the platform architecture and API endpoints but does not provide instructions for replicating the experimental evaluation results (leaderboard numbers)."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Table 1 reports only point estimates for Dynamic Points and Pass scores. No confidence intervals or error bars are provided for any results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes comparative claims about model rankings (e.g., 'most closed-source LLMs adhere to the scaling law, significantly outperforming their open-source counterparts') but provides no statistical significance tests to support these comparisons."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Raw Dynamic Point scores and Pass percentages are reported in Table 1, but no effect sizes (e.g., Cohen's d, percentage improvement with baseline context) are calculated or discussed."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper evaluates 19 models on problems imported from APPS and Mercury benchmarks but does not justify the number of problems used or discuss whether the problem set is sufficiently large for reliable rankings."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Each LLM is allowed only a single attempt per problem. No variance, standard deviation, or spread measures across multiple runs are reported. Single-run results only."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper does not compare CodeArena's evaluation methodology against alternative evaluation approaches (e.g., standard Pass@1 on static benchmarks vs. Dynamic Points). The leaderboard shows model comparisons but no baseline evaluation framework is compared."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No baseline evaluation methodology is compared. LiveCodeBench (Jain et al., 2024), which also addresses contamination through temporal updates, is discussed in related work but is not used as a comparison baseline."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The Dynamic Point metric combines Challenge Score and Efficiency Score but no ablation is performed to show the individual contribution of each component or to validate that the collective scoring mechanism works better than simpler alternatives."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports both Dynamic Points (DP) and Pass rate as metrics in Table 1. The DP itself combines Challenge Score and Efficiency Score."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is a benchmark/platform paper evaluating code generation correctness and efficiency through automated test case execution. Human evaluation of system outputs is not relevant to the claims."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "This is a platform/benchmark paper, not a machine learning training paper. There is no training/validation/test split to evaluate; the platform evaluates submissions against test cases."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Figure 5 provides per-difficulty-level acceptance rate breakdowns. Table 1 provides individual model results. Figure 4 shows per-model Dynamic Point changes over time checkpoints."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No failure cases are discussed. The paper does not analyze specific problems where models failed, types of errors in generated code, or patterns in failures."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Figure 5 shows that acceptance rate does not differentiate well across difficulty levels, which is a negative finding about existing difficulty labeling. Figure 4 shows open-source models declining over time, which is a meaningful negative observation."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims the collective evaluation mechanism 'dynamically recalibrates individual model scores... mitigating score biases caused by widespread benchmark leakage.' However, no empirical evidence is provided that the Dynamic Point mechanism actually mitigates contamination bias — it is argued theoretically via the formula but not validated experimentally (e.g., by comparing rankings on contaminated vs. clean problems)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims the dynamic scoring mechanism 'effectively mitigates the influence of data contamination' (Section 3.1). This is a causal claim but no controlled experiment demonstrates that contaminated problems produce different rankings under the DP metric vs. standard scoring. The evidence is only the mathematical formula, not an empirical validation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'Collective Evaluation Platform for LLM Code Generation' broadly, but the evaluation uses only Python problems from APPS and Mercury. The platform supports multiple languages (Python, C, C++, Go, Haskell) but evaluation is shown only for Python. This overgeneralization is not bounded."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper observes that open-source models decline in DP scores over time while closed-source models remain stable (Figure 4), suggesting contamination. However, alternative explanations (e.g., closed-source models being generally more capable, new problems being harder) are not discussed."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Table 1 and Appendix A list models like 'GPT-4o', 'Claude-3-5-sonnet', 'Gemini-1.5-flash' without specifying exact API versions or snapshot dates. For example, 'GPT-4o' without a date-versioned identifier (e.g., gpt-4o-2024-05-13) is insufficient."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix B provides both the system prompt and inference prompt template used for all LLMs. While the inference prompt has placeholders (Example Problem Description, Example Solution, Problem), these are structural placeholders for the benchmark problems themselves, and the actual prompt text/structure is fully specified."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Appendix C states temperature of 0.7 for open-source models but does not report temperature or other inference settings for closed-source model API calls. For closed-source models, only the API links are provided (Appendix A), with no inference parameters specified."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The evaluation involves direct one-shot prompting of LLMs for code generation."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.2 describes the problem collection workflow (importing from APPS and Mercury, collecting from CodeForces and LeetCode weekly contests), test case generation process (using GPT-4o to generate test case generators), and filtering (removing problems with ambiguous outputs). The data pipeline is reasonably documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "A dedicated 'Limitations' section exists after the Conclusion, discussing reliance on external data sources and automated test case quality."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The limitations section mentions only two high-level concerns: reliance on external data sources and automated test case quality. Both concern the platform itself rather than the experimental evaluation results. No threats to the validity of the leaderboard rankings or the Dynamic Point mechanism are discussed."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. For example, it does not acknowledge that results are shown only for Python, that the Dynamic Point mechanism's contamination-mitigation effect is not empirically validated, or that the leaderboard reflects single-run performance."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The platform makes all solutions and test cases publicly accessible. A trial account is provided for browsing the data repository. The raw submission data is available through the platform APIs."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.2 describes the data collection procedure: problems imported from APPS and Mercury benchmarks, plus regular collection from CodeForces and LeetCode weekly contests. Test cases are generated using GPT-4o."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are recruited for the evaluation. The 'Code Generators' are LLMs pre-registered by the researchers, not recruited human users."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3.2 documents the pipeline: problem collection → test case generation via GPT-4o → filtering out ambiguous problems → submission through APIs → sandbox execution → dynamic scoring. Each step is described."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section listing grants or sponsors is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: NTU, NUS, HKU, Monash, and ByteDance. One author (Qian Liu) is from ByteDance, which is disclosed."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed. One author is affiliated with ByteDance, which has commercial interest in LLM evaluation, but no funding relationship is stated."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates 19 LLMs on code generation benchmarks including APPS and Mercury, but does not state the training data cutoff dates for any of the evaluated models."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "While the paper's central motivation is benchmark contamination, it does not analyze whether the specific problems used (from APPS, Mercury, LeetCode, CodeForces) appear in the training data of the evaluated models. The Dynamic Point mechanism is proposed as a mitigation but no empirical analysis of actual contamination is performed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper acknowledges contamination as a problem and proposes Dynamic Points as a theoretical mitigation, but does not empirically verify whether APPS or Mercury benchmark problems were in the training data of evaluated models. APPS (2021) and HumanEval are well-known contaminated benchmarks for post-2022 models, but this is not specifically addressed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the study. The evaluation involves only LLM code generators."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper calls APIs for multiple closed-source LLMs and runs open-source models on 8 A100 GPUs but reports no inference costs, API costs, tokens consumed, or wall-clock time for the evaluation."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Appendix C mentions '8 NVIDIA A100 GPUs' for open-source model inference but does not quantify total GPU hours, API spend for closed-source models, or overall computational budget."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "CodeArena's Dynamic Point mechanism mitigates score biases caused by benchmark leakage by dynamically recalibrating individual model scores based on holistic performance of all participating models.",
    295       "evidence": "Section 3.1 presents the Challenge Score formula (Eq. 1) which divides the basic problem score by the number of models solving it. The theoretical argument is that leaked problems solved by many models contribute less to the ranking. No empirical validation of contamination mitigation is provided.",
    296       "supported": "weak"
    297     },
    298     {
    299       "claim": "Most closed-source LLMs adhere to the scaling law, significantly outperforming their open-source counterparts.",
    300       "evidence": "Table 1 shows closed-source models generally rank higher, with DeepSeek-Coder at 249.28 DP vs. the best open-source model DeepSeek-Coder-V2-Lite at 223.67 DP. However, no statistical test supports the claim of 'significant' outperformance, and DeepSeek-Coder-V2-Lite (open-source, small) ranks 5th, challenging the scaling claim.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Open-source LLMs exhibit a clear downward trend in Dynamic Point scores over time, while closed-source LLMs maintain stable scores.",
    305       "evidence": "Figure 4 shows DP changes over checkpoints from July to November 2024. Open-source models (especially DeepSeek-Coder-V2-Lite) decline while closed-source models remain stable or slightly improve. However, this is shown as a trend line without statistical analysis or explanation of the mechanism.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Acceptance rate does not show significant variation across difficulty levels inherited from LeetCode.",
    310       "evidence": "Figure 5 shows acceptance rate distribution across Easy, Medium, and Hard difficulty levels. The distributions appear overlapping, suggesting LeetCode's human-defined difficulty does not predict LLM difficulty well. However, no statistical test for the claimed lack of variation is provided.",
    311       "supported": "moderate"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "CodeArena introduces an online evaluation platform for LLM code generation with three main features: a Dynamic Point scoring mechanism that theoretically mitigates benchmark contamination by weighting problems inversely to their acceptance rate, publicly accessible solution and test case data, and automation-friendly APIs. Evaluation of 19 LLMs shows closed-source models generally outperform open-source ones, and temporal analysis reveals declining Dynamic Point scores for open-source models as new problems are added. The Dynamic Point mechanism is argued theoretically but not empirically validated against actual contamination.",
    318   "red_flags": [
    319     {
    320       "flag": "Central claim not empirically validated",
    321       "detail": "The paper's main contribution — that Dynamic Points mitigate benchmark contamination — is supported only by the mathematical formula, not by any controlled experiment comparing rankings on contaminated vs. clean problems, or showing that known-contaminated models are penalized."
    322     },
    323     {
    324       "flag": "No statistical rigor in comparisons",
    325       "detail": "Model rankings and comparative claims (e.g., 'significantly outperforming') are made without any statistical tests, confidence intervals, or error bars. Each model gets only one attempt per problem."
    326     },
    327     {
    328       "flag": "Missing model version specificity",
    329       "detail": "Closed-source models are referenced by marketing names (GPT-4o, Claude-3-5-sonnet) with only website links, not API version identifiers. Model behavior changes across versions, making results unreproducible."
    330     },
    331     {
    332       "flag": "Self-citation pattern",
    333       "detail": "The first author (Mingzhe Du) is also the first author of the Mercury benchmark, which is heavily featured as a key data source and related work. Two other co-authors (Dong Huang, Terry Yue Zhuo) are authors of EffiBench and BigCodeBench respectively, also used as comparisons. While not inherently problematic, this overlap means the evaluation ecosystem is not independently validated."
    334     },
    335     {
    336       "flag": "Incomplete hyperparameter reporting",
    337       "detail": "Temperature is reported for open-source models (0.7) but not for closed-source API calls. This asymmetry means the evaluation conditions are not comparable across model types."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "Evaluating large language models trained on code",
    343       "authors": ["Mark Chen", "Jerry Tworek"],
    344       "year": 2021,
    345       "arxiv_id": "2107.03374",
    346       "relevance": "Introduces HumanEval, the foundational LLM code generation benchmark that CodeArena builds upon and aims to improve."
    347     },
    348     {
    349       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    350       "authors": ["Naman Jain", "King Han"],
    351       "year": 2024,
    352       "arxiv_id": "2403.07974",
    353       "relevance": "Directly comparable contamination-free LLM code evaluation approach using temporal problem updates, a key related work to CodeArena."
    354     },
    355     {
    356       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    357       "authors": ["Terry Yue Zhuo", "Minh Chien Vu"],
    358       "year": 2024,
    359       "arxiv_id": "2406.15877",
    360       "relevance": "Contemporary code generation benchmark evaluating LLMs on complex programming tasks with diverse function calls."
    361     },
    362     {
    363       "title": "Mercury: An Efficiency Benchmark for LLM Code Synthesis",
    364       "authors": ["Mingzhe Du", "Anh Tuan Luu"],
    365       "year": 2024,
    366       "arxiv_id": "2402.07844",
    367       "relevance": "Efficiency-centric code generation benchmark that is integrated into CodeArena and shares authors with this paper."
    368     },
    369     {
    370       "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence",
    371       "authors": ["Qihao Zhu", "Daya Guo"],
    372       "year": 2024,
    373       "arxiv_id": "2406.11931",
    374       "relevance": "Open-source code generation model that achieves top performance in CodeArena, relevant to evaluating open vs. closed-source model capabilities."
    375     },
    376     {
    377       "title": "CyberSecEval 2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models",
    378       "authors": ["Manish Bhatt", "Sahana Chennabasappa"],
    379       "year": 2024,
    380       "arxiv_id": "2404.13161",
    381       "relevance": "Security-focused evaluation suite for LLM-generated code, relevant to understanding the broader LLM code evaluation landscape."
    382     },
    383     {
    384       "title": "Measuring Coding Challenge Competence with APPS",
    385       "authors": ["Dan Hendrycks", "Steven Basart"],
    386       "year": 2021,
    387       "relevance": "Major code generation benchmark integrated into CodeArena's problem set, important for understanding benchmark contamination risks."
    388     },
    389     {
    390       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    391       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    392       "year": 2023,
    393       "relevance": "Examines the rigor of LLM code evaluation methodology, relevant to understanding evaluation quality concerns."
    394     },
    395     {
    396       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    397       "authors": ["Anton Lozhkov", "Raymond Li"],
    398       "year": 2024,
    399       "arxiv_id": "2402.19173",
    400       "relevance": "Open-source code generation model and training data, relevant to understanding code generation capabilities and data contamination."
    401     },
    402     {
    403       "title": "Code Llama: Open Foundation Models for Code",
    404       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    405       "year": 2023,
    406       "arxiv_id": "2308.12950",
    407       "relevance": "Major open-source code generation model family evaluated in CodeArena."
    408     }
    409   ]
    410 }

Impressum · Datenschutz