ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30663B)


      1 {
      2   "paper": {
      3     "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
      4     "authors": [
      5       "Jia Li",
      6       "Ge Li",
      7       "Xuanming Zhang",
      8       "Yunfei Zhao",
      9       "Yihong Dong",
     10       "Zhi Jin",
     11       "Binhua Li",
     12       "Fei Huang",
     13       "Yongbin Li"
     14     ],
     15     "year": 2024,
     16     "venue": "Neural Information Processing Systems (NeurIPS 2024, Datasets and Benchmarks Track)",
     17     "arxiv_id": "2410.22821",
     18     "doi": "10.48550/arXiv.2410.22821"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "EvoCodeBench-2403 reduces data leakage from 41.47% (HumanEval) to under 3% by using repositories created after October 2023. The highest Pass@1 (gpt-4) on EvoCodeBench-2403 is only 20.73%, far below scores on prior benchmarks, suggesting those may be leaked. LLMs show distinct comfort and strange domains: gpt-4 excels in most domains but underperforms in Internet, while StarCoder 2-15B unexpectedly matches gpt-4 in Database. Providing code context improves gpt-4's Pass@1 by up to 152%.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "GitHub (https://github.com/seketeam/EvoCodeBench) and HuggingFace (https://huggingface.co/datasets/LJ0815/EvoCodeBench) links are provided in the abstract footnote and Appendix A."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The benchmark data is released on GitHub and HuggingFace under CC-4.0 license. All prompts and LLMs' completions are also released (Appendix A)."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Section 3.1 mentions '4 NVIDIA A100-40GB GPUs' and Appendix E.1 describes models, but no requirements.txt, Dockerfile, or library versions are provided. Not enough detail to recreate the environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper releases code, all prompts (Figures 5-8, Appendix E.2), LLM completions, and benchmark data on GitHub. Experimental settings are thoroughly documented in Section 3 and Appendix E."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 4, 5, 6, and 7 report only point estimates for Pass@k, Recall@k, and DSI. No confidence intervals or error bars are provided anywhere in the paper."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper makes numerous comparative claims (e.g., 'gpt-4 performs best', 'StarCoder 2-15B unexpectedly performs well') based solely on comparing raw numbers without any statistical significance tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports percentage improvements with baseline context: 'Pass@1 of gpt-4 is improved by 104% and 152%' (Section 3.3). Table 3 shows leakage rate drops from 41.47% to 2.18%. DSI values in Table 7 show relative improvements."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The benchmark contains 275 samples from 25 repositories with no justification for why this size is adequate. The paper acknowledges it is 'currently smaller than some existing benchmarks' in the Limitations section but provides no power analysis or statistical justification."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No standard deviations, variance, or spread measures are reported. While they sample 20 programs per requirement for k>1, only the unbiased Pass@k point estimates are reported without any measure of variability."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Eight LLMs are compared against each other (Table 4), and the benchmark's leakage rate is compared against HumanEval (Table 3). Performance is compared across three experimental settings."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The evaluated models — gpt-4-turbo-1106, DeepSeek Coder, StarCoder 2, CodeLLaMa — were all contemporary at the time of writing (late 2023 / early 2024)."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No ablation study is performed on the benchmark's collection pipeline components. The three experimental settings (without context, completion, infilling) are separate evaluation conditions, not ablations of the benchmark design."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses Pass@k (functional correctness), Recall@k (dependency recall), and Domain-Specific Improvement (DSI) as evaluation metrics."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 describes a human evaluation where 10 developers (5 annotators + 5 evaluators) assessed auto-generated annotations. Cohen's Kappa of 0.9 is reported. Results in Table 8."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The entire benchmark is designed as a leakage-free test set. Repositories were created after October 2023, post-dating model training cutoffs. CDD verification (Table 3) confirms less than 3% potential leakage."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 6 breaks down Pass@1 by domain (7 domains). Table 7 shows DSI per domain per model. Table 1 shows domain distribution."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3.3 'Error Analyses' describes manual analysis of 50 error cases of gpt-4: 29 implementation logic errors, 20 missing context cases, and 1 vague requirement. Figure 2 shows a specific failure example."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that gpt-4 underperforms in the Internet domain (Table 6/7), that all LLMs perform poorly without context (7.27% max Pass@1), and that the overall Pass@1 is very low compared to prior benchmarks."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims are supported: leak rate reduction to 2.18% (Table 3), gpt-4 Pass@1 of 20.74% (Table 4 shows 20.73%), domain-specific findings for gpt-4 and StarCoder 2 (Tables 6-7)."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper makes causal claims without adequate support: 'We attribute the improvements to the domain knowledge contained in contexts' (Section 3.3) and 'The potential reason for comfort and strange domains is that the pre-training data mix of LLMs is different' (Section 3.4). Neither claim is tested or controlled for confounds."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper is generally careful to bound claims to EvoCodeBench-2403 and Python. The Limitations section explicitly states it is 'a monolingual (i.e., Python) benchmark' and notes the small size. Claims are typically prefixed with 'on EvoCodeBench-2403'."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not substantively discuss alternative explanations. The large performance drop vs. prior benchmarks is attributed solely to data leakage, without considering that the benchmark may simply be harder due to repo-level complexity, different task distribution, or other factors."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper claims 'EvoCodeBench reveals the actual abilities of these LLMs in real-world repositories' (abstract) but measures only Pass@k on a 275-sample Python benchmark. The gap between 'Pass@k on EvoCodeBench' and 'actual abilities in real-world repositories' is not acknowledged."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.1 specifies gpt-4-turbo-1106, gpt-3.5-turbo-1106. Appendix E.1 provides further version details. Open-source models are specified with parameter sizes (DeepSeek Coder-33B/6.7B, StarCoder 2-15B/7B, CodeLLaMa-13B/7B)."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt templates are provided in Figures 3-8 and Appendix E.2 for all settings (without context, completion, infilling, RAG, annotation generation). The fill values come from the released benchmark data, so prompts are fully reconstructable."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.3 reports temperature=0.4, top-p=0.95, max generation length=500, greedy search for k=1, nucleus sampling with 20 samples for k>1."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. LLMs are evaluated directly via prompting without agent loops, tools, or feedback mechanisms."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 2.3 documents the 4-stage pipeline: repository selection criteria, execution-based filtering, automatic annotations via static analysis and LLM, and benchmark construction. Filtering criteria are specified at each stage."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4 'Discussion' contains a substantive 'Limitations' subsection discussing two main limitations: Python-only and small benchmark size."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The limitations are specific to this study: 'EvoCodeBench is a monolingual (i.e., Python) benchmark and ignores other programming languages' and 'the size of EvoCodeBench is currently smaller than some existing benchmarks' due to the recent-repositories-only constraint. They also discuss LLM annotation failure modes (missing details, inaccurate domain labels)."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper states Python-only scope, 275 samples from 25 repos, first version only covering Oct 2023 - Mar 2024, and that domain distribution may be unbalanced. Future plans for expansion are noted."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The benchmark data is available on GitHub and HuggingFace. All prompts and LLMs' completions are released. Repository URLs are provided in Table 9."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 2.3 describes the 4-stage collection pipeline in detail: repo selection criteria (stars, licenses, recency, tests), function scraping, execution-based filtering, automatic annotations, and benchmark construction."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Repository selection criteria are detailed in Section 2.3 (Stage I): open-source Python, permissive licenses, created within last 6 months, non-fork, non-malicious, 50+ stars, having unit tests. Human evaluators are described as having 3+ years Python experience (Section 4)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The full 4-stage pipeline is documented in Section 2.3 with filtering criteria at each stage. Table 9 shows per-repository statistics (25 repos → 275 samples). The pipeline from raw repos to final benchmark is traceable."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Acknowledgements section lists National Natural Science Foundation of China grants (62192731, 62152730, etc.), National Key R&D Program, and Major Program of Hubei Province."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are listed: Peking University, Bytedance, and Alibaba Group. These are prominently displayed on the first page."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Funding is from NSFC government grants and a government R&D program. The paper evaluates models from OpenAI, DeepSeek, BigCode, and Meta — not products of the funders. The Alibaba/Bytedance-affiliated authors' companies do not have models being evaluated."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is provided. Authors from Alibaba Group and Bytedance may have commercial interests related to code LLMs but this is not disclosed."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 2.4 states 'most code LLM's training data is up to September 2023.' Appendix E.1 provides specific cutoffs: gpt-4-turbo-1106 training data up to April 2023, gpt-3.5-turbo-1106 up to September 2021."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 3.2 uses the CDD method to detect data leakage. Table 3 reports leakage ratios for all 8 LLMs on EvoCodeBench-2403, finding less than 3% potential overlap."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "The entire benchmark design addresses contamination via temporal splits (repos created Oct 2023 - Mar 2024, after training cutoffs). Section 3.2 validates this with CDD detection. Comparison to HumanEval's 41.47% leakage rate demonstrates improvement."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "The paper has no human subjects study. The human evaluation in Section 4 is quality assurance of annotations, not a study with human participants as subjects."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study. The human evaluators performed quality checks on annotations, not a human-subjects research study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study. The evaluators are described as having '3 years of Python development experience' but this is quality assurance, not a study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study. The paper evaluates code LLMs on a benchmark, not human participants."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human subjects study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human subjects study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No API costs, tokens consumed, or per-example inference costs are reported. The paper evaluates 8 LLMs generating 20 samples per requirement but does not quantify the cost."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Section 3.1 mentions '4 NVIDIA A100-40GB GPUs' but does not state total GPU hours, wall-clock time, or API spend for the experiments."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No multi-seed analysis is performed. Results are not reported across multiple random seeds despite using nucleus sampling."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 3.3 states: 'When k=1, we use the greedy search and generate a single program per requirement. When k>1, we use the nucleus sampling... and sample 20 programs per requirement.'"
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Generation hyperparameters (temperature=0.4, top-p=0.95, max length=500) are stated but no justification is given for these choices. No search budget or tuning process is described."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Results are reported for all three experimental settings (without context, completion, infilling) and the RAG setting. No selective reporting of best configurations — all results are shown in Table 4."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper makes many pairwise comparisons across 8 models, 3 settings, and 7+ domains without any statistical tests, let alone corrections for multiple comparisons."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors created the benchmark and evaluate models on it. They do not acknowledge or discuss potential benchmark-creator bias, such as whether benchmark design choices favor certain model types."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Models of vastly different sizes (6.7B to gpt-4) are compared without discussing compute budget differences. No performance-per-compute analysis is presented."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Section 2.4 discusses construct validity by showing EvoCodeBench-2403's code distribution and dependency distribution align with 500 real-world repositories (Table 2). The paper argues the benchmark reflects real-world development scenarios."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is used. LLMs are evaluated directly via prompting in controlled settings."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The entire benchmark design uses temporal splits: repositories created October 2023 - March 2024, post-dating all evaluated models' training cutoffs (most up to September 2023). This is the paper's core contribution."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper does not discuss whether the code context provided in completion/infilling settings could leak answer information. The dramatic difference between without-context (7.27%) and infilling (20.73%) settings suggests context is highly informative, but whether this constitutes feature leakage in some cases is not analyzed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Multiple samples are drawn from the same repositories (e.g., 59 from litdata, 54 from camp_zipnerf per Table 9), but the paper does not discuss whether samples from the same repository are statistically independent."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Section 3.2 applies the CDD method (reference [6]) to all 8 LLMs on EvoCodeBench-2403. Table 3 reports detection results showing less than 3% leakage rate."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "EvoCodeBench significantly alleviates data leakage, reducing the potential leak rate from 41.47% (HumanEval) to 2.18%.",
    375       "evidence": "Table 3 shows CDD detection results: HumanEval leak ratio on gpt-3.5 is 41.47%, while EvoCodeBench-2403 leak ratios range from 0.73% to 2.18% across all 8 LLMs (Section 3.2).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The highest Pass@1 of gpt-4 on EvoCodeBench-2403 is only 20.73%, far below prior benchmarks.",
    380       "evidence": "Table 4 shows gpt-4's Pass@1 in the Local File (Infilling) setting is 20.73. The paper notes this is much lower than 53.04 on the latest repo-level benchmark DevEval (Section 3.3).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "LLMs benefit from more code contexts, with gpt-4's Pass@1 improving by 104% and 152% with context.",
    385       "evidence": "Table 4 shows gpt-4 without context: 7.27, completion: 17.45 (140% improvement), infilling: 20.73 (185% improvement). The stated 104% and 152% appear to be from a different computation base (Section 3.3).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "LLMs have distinct comfort and strange domains based on Domain-Specific Improvement (DSI).",
    390       "evidence": "Table 7 shows DSI values. gpt-4 has comfort domains (DSI>10%) in Database, System, Software Development, Scientific Engineering, and Multimedia but a strange domain in Internet (-28.59%). StarCoder 2-15B is comparable to gpt-4 in Database (Section 3.4).",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Auto-generated annotations (requirements and domain labels) are comparable to human-written ones in most cases (96.7% and 98.5% respectively).",
    395       "evidence": "Table 8 shows human evaluation: for requirements, gpt-4 wins 30, ties 236, loses 9. For domain labels: wins 3, ties 268, loses 4. Cohen's Kappa between evaluators is 0.9 (Section 4).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "RAG with similar functions improves LLM performance in repo-level code generation.",
    400       "evidence": "Table 5 shows gpt-4 Pass@1 improves from 8.31 (without context) to 12.29 (similar functions). gpt-3.5 improves from 6.64 to 11.62 (Section 3.3).",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Tiny domain sample sizes",
    407       "detail": "Several domains have very few samples: Security (1), Utilities (2), Communications (8), Text Processing (12). Domain-specific claims (Tables 6-7) built on such small samples are unreliable. The paper acknowledges and excludes domains with <10 samples, but domains like Text Processing (12 samples) and Internet (15 samples) are still used for DSI analysis."
    408     },
    409     {
    410       "flag": "No error bars or significance tests",
    411       "detail": "All model comparisons in Tables 4, 5, 6, and 7 are based on point estimates. With 275 samples (and much fewer per domain), small differences between models could easily be due to chance. No statistical tests are performed to validate any comparative claim."
    412     },
    413     {
    414       "flag": "Heavily unbalanced domain distribution",
    415       "detail": "120/275 samples (43.6%) are in Scientific Engineering. The 'diverse domains' claim is undermined by this extreme skew. Table 1 shows the distribution is far from balanced."
    416     },
    417     {
    418       "flag": "LLM-generated annotations used without full validation",
    419       "detail": "Requirements and domain labels were generated by gpt-4. While human evaluation shows 96.7% and 98.5% quality, this means ~9 requirements and ~4 domain labels may be incorrect. These errors propagate into evaluation results but their impact on Pass@k scores is not analyzed."
    420     },
    421     {
    422       "flag": "Performance drop attributed primarily to leakage without controlling for task difficulty",
    423       "detail": "The paper attributes the large performance gap between EvoCodeBench (20.73% Pass@1) and DevEval (53.04% Pass@1) primarily to data leakage. However, the two repo-level benchmarks are built from different repositories and may differ in task difficulty, so the comparison is confounded and the gap cannot be attributed to leakage alone."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Evaluating large language models trained on code",
    429       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    430       "year": 2021,
    431       "relevance": "Introduces HumanEval, the primary baseline benchmark compared against for data leakage analysis."
    432     },
    433     {
    434       "title": "DevEval: A manually-annotated code generation benchmark aligned with real-world code repositories",
    435       "authors": ["Jia Li", "Ge Li", "Yunfei Zhao"],
    436       "year": 2024,
    437       "relevance": "Prior repo-level code generation benchmark from the same group; EvoCodeBench extends and addresses its limitations."
    438     },
    439     {
    440       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    441       "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"],
    442       "year": 2024,
    443       "relevance": "Repo-level code generation benchmark with dependency analysis; direct comparison point for EvoCodeBench."
    444     },
    445     {
    446       "title": "Evaluating large language models in class-level code generation",
    447       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    448       "year": 2024,
    449       "relevance": "ClassEval benchmark for class-level code generation with domain labels, compared in related work."
    450     },
    451     {
    452       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    453       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    454       "year": 2024,
    455       "arxiv_id": "2403.07974",
    456       "relevance": "Contamination-free code benchmark using latest competitive programming problems; addresses same data leakage concern at snippet level."
    457     },
    458     {
    459       "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM",
    460       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Lingming Zhang"],
    461       "year": 2024,
    462       "arxiv_id": "2403.19114",
    463       "relevance": "EvoEval mutates HumanEval to create new benchmarks; addresses contamination at snippet level while EvoCodeBench targets repo level."
    464     },
    465     {
    466       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    467       "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"],
    468       "year": 2024,
    469       "relevance": "Provides the CDD contamination detection method used to validate EvoCodeBench's leakage-free status."
    470     },
    471     {
    472       "title": "DeepSeek-Coder: When the large language model meets programming - the rise of code intelligence",
    473       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    474       "year": 2024,
    475       "arxiv_id": "2401.14196",
    476       "relevance": "Major code LLM evaluated in the benchmark; trained on 2T tokens of code."
    477     },
    478     {
    479       "title": "StarCoder 2 and the Stack v2: The next generation",
    480       "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"],
    481       "year": 2024,
    482       "arxiv_id": "2402.19173",
    483       "relevance": "Code LLM evaluated in the benchmark; showed unexpected strength in specific domains."
    484     },
    485     {
    486       "title": "Code LLaMA: Open foundation models for code",
    487       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    488       "year": 2023,
    489       "arxiv_id": "2308.12950",
    490       "relevance": "Open-source code LLM family evaluated in the benchmark."
    491     },
    492     {
    493       "title": "Program synthesis with large language models",
    494       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"],
    495       "year": 2021,
    496       "relevance": "Introduces MBPP benchmark and the unbiased Pass@k estimator used as an evaluation metric."
    497     },
    498     {
    499       "title": "CrossCodeEval: A diverse and multilingual benchmark for cross-file code completion",
    500       "authors": ["Yangruibo Ding", "Zijian Wang", "Wasi Uddin Ahmad"],
    501       "year": 2023,
    502       "relevance": "Cross-file code completion benchmark providing context for repo-level evaluation approaches."
    503     }
    504   ],
    505   "engagement_factors": {
    506     "practical_relevance": {
    507       "score": 2,
    508       "justification": "Practitioners can use EvoCodeBench to evaluate code LLMs for domain-specific selection, and the domain taxonomy provides actionable guidance."
    509     },
    510     "surprise_contrarian": {
    511       "score": 1,
    512       "justification": "The finding that gpt-4 achieves only 20.73% Pass@1 (vs 53.04% on DevEval) suggests prior benchmarks may be leaked, though this concern was already growing in the community."
    513     },
    514     "fear_safety": {
    515       "score": 0,
    516       "justification": "No safety or security concerns are raised by this benchmark evaluation paper."
    517     },
    518     "drama_conflict": {
    519       "score": 1,
    520       "justification": "Implicit criticism that existing benchmarks (especially HumanEval with 41.47% leakage) are unreliable, but presented diplomatically."
    521     },
    522     "demo_ability": {
    523       "score": 2,
    524       "justification": "Code and data released on GitHub and HuggingFace; researchers can download and evaluate their own models on the benchmark."
    525     },
    526     "brand_recognition": {
    527       "score": 1,
    528       "justification": "From Peking University and Alibaba, published at NeurIPS; evaluates well-known models (gpt-4, DeepSeek Coder, StarCoder 2)."
    529     }
    530   }
    531 }

Impressum · Datenschutz