scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (19704B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
      6     "authors": [
      7       "Jia Li",
      8       "Ge Li",
      9       "Xuanming Zhang",
     10       "Yunfei Zhao",
     11       "Yihong Dong",
     12       "Zhi Jin",
     13       "Binhua Li",
     14       "Fei Huang",
     15       "Yongbin Li"
     16     ],
     17     "year": 2024,
     18     "venue": "Neural Information Processing Systems",
     19     "arxiv_id": "2410.22821",
     20     "doi": "10.48550/arXiv.2410.22821"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Abstract claims are supported: leak rate reduction to 2.18% (Table 3), gpt-4 Pass@1 of 20.74% (Table 4 shows 20.73%), domain-specific findings for gpt-4 and StarCoder 2 (Tables 6-7).",
     28         "source": "opus"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper makes causal claims without adequate support: 'We attribute the improvements to the domain knowledge contained in contexts' (Section 3.3) and 'The potential reason for comfort and strange domains is that the pre-training data mix of LLMs is different' (Section 3.4). Neither claim is tested or controlled for confounds.",
     34         "source": "opus"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper is generally careful to bound claims to EvoCodeBench-2403 and Python. The Limitations section explicitly states it is 'a monolingual (i.e., Python) benchmark' and notes the small size. Claims are typically prefixed with 'on EvoCodeBench-2403'.",
     40         "source": "opus"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper does not substantively discuss alternative explanations. The large performance drop vs. prior benchmarks is attributed solely to data leakage, without considering that the benchmark may simply be harder due to repo-level complexity, different task distribution, or other factors.",
     46         "source": "opus"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims 'EvoCodeBench reveals the actual abilities of these LLMs in real-world repositories' (abstract) but measures only Pass@k on a 275-sample Python benchmark. The gap between 'Pass@k on EvoCodeBench' and 'actual abilities in real-world repositories' is not acknowledged.",
     52         "source": "opus"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 4 'Discussion' contains a substantive 'Limitations' subsection discussing two main limitations: Python-only and small benchmark size.",
     60         "source": "opus"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The limitations are specific to this study: 'EvoCodeBench is a monolingual (i.e., Python) benchmark and ignores other programming languages' and 'the size of EvoCodeBench is currently smaller than some existing benchmarks' due to the recent-repositories-only constraint. They also discuss LLM annotation failure modes (missing details, inaccurate domain labels).",
     66         "source": "opus"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper states Python-only scope, 275 samples from 25 repos, first version only covering Oct 2023 - Mar 2024, and that domain distribution may be unbalanced. Future plans for expansion are noted.",
     72         "source": "opus"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The Acknowledgements section lists National Natural Science Foundation of China grants (62192731, 62152730, etc.), National Key R&D Program, and Major Program of Hubei Province.",
     80         "source": "opus"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are listed: Peking University, Bytedance, and Alibaba Group. These are prominently displayed on the first page.",
     86         "source": "opus"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Funding is from NSFC government grants and a government R&D program. The paper evaluates models from OpenAI, DeepSeek, BigCode, and Meta — not products of the funders. The Alibaba/Bytedance-affiliated authors' companies do not have models being evaluated.",
     92         "source": "opus"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial interests statement is provided. Authors from Alibaba Group and Bytedance may have commercial interests related to code LLMs but this is not disclosed.",
     98         "source": "opus"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms are defined: 'data leakage' (test data in training data), 'domain-specific evaluation', 'DSI' (Domain-Specific Improvement with formula), 'comfort domains' (DSI > 10%), 'strange domains' (DSI < -10%), Pass@k and Recall@k with mathematical definitions.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Three explicit contributions are numbered in the abstract and introduction: (1) evolving temporal benchmark, (2) domain taxonomy with labels, (3) domain-specific evaluation metrics including DSI.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 5 provides substantive comparison with related benchmarks (HumanEval, ClassEval, DevEval, CoderEval, LiveCodeBench, EvoEval), explicitly explaining how EvoCodeBench differs and fills gaps in existing work.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "benchmark-creation": {
    124       "construct_design": {
    125         "construct_validity_argued": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper asserts that repo-level code generation is more realistic, and shows distribution alignment with 500 real repositories (Table 2), but does not argue why Pass@k measures the claimed capability ('actual coding ability') beyond inherited convention from prior work.",
    129           "source": "haiku"
    130         },
    131         "difficulty_distribution_characterized": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No difficulty distribution is characterized; there are no easy/medium/hard tiers, no item-level difficulty analysis, and no systematic measure of what makes individual tasks harder or easier beyond the domain label.",
    135           "source": "haiku"
    136         },
    137         "ceiling_floor_effects_checked": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No explicit ceiling/floor effect check is performed; without-context scores (~5-8% Pass@1) approach floor levels for some models, which is not discussed as a discriminability concern.",
    141           "source": "haiku"
    142         },
    143         "human_baseline_included": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No human baseline performance is reported; the paper only includes human evaluation of annotation quality (requirements and domain labels), not human performance on the code generation tasks.",
    147           "source": "haiku"
    148         },
    149         "scoring_rubric_justified": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Pass@k and Recall@k are adopted from prior work without justifying why these metrics (vs. others like CodeBLEU, exact match, or partial credit) are appropriate; the choice of k ∈ {1, 3, 5, 10} and temperature 0.4 are stated but not justified.",
    153           "source": "haiku"
    154         }
    155       },
    156       "robustness": {
    157         "contamination_resistance_designed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Temporal splitting is the core anti-contamination design: repositories created October 2023–March 2024, after most evaluated models' training cutoffs; validated with CDD detection showing <3% leak rate vs 41.47% for HumanEval.",
    161           "source": "haiku"
    162         },
    163         "temporal_robustness_discussed": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "The paper explicitly plans 6-month update cycles, states that scores across versions are not comparable, and explains the automated collection pipeline that enables new versions to be constructed from latest repositories.",
    167           "source": "haiku"
    168         },
    169         "failure_modes_discussed": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The paper acknowledges annotation errors (~3.3% requirements, ~1.5% domain labels) but does not discuss benchmark failure modes like test case inadequacy, gaming via requirement memorization, or the small sample sizes in minority domains making domain conclusions unreliable.",
    173           "source": "haiku"
    174         },
    175         "baseline_implementations_provided": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "All prompts, LLM completions, and benchmark code are released on GitHub (https://github.com/seketeam/EvoCodeBench) and HuggingFace, enabling reproduction of all reported numbers.",
    179           "source": "haiku"
    180         }
    181       },
    182       "documentation": {
    183         "dataset_documentation_complete": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Appendix C is a full datasheet following Gebru et al. (2021) format covering motivation, composition, collection process, preprocessing, uses, distribution, and maintenance — with specific details including timeframes, tools used, and collection criteria.",
    187           "source": "haiku"
    188         },
    189         "licensing_and_access_clear": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "CC-4.0 license for the dataset and BSD 3-Clause for code are explicitly stated; dataset is accessible via GitHub and HuggingFace with Croissant metadata.",
    193           "source": "haiku"
    194         },
    195         "intended_use_specified": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "The datasheet states the benchmark is designed for code generation evaluation and lists other possible tasks, but does not specify what conclusions should NOT be drawn (e.g., that domain findings based on n=1 or n=2 samples are unreliable).",
    199           "source": "haiku"
    200         }
    201       }
    202     }
    203   },
    204   "claims": [
    205     {
    206       "claim": "EvoCodeBench reduces data leakage from 41.47% (HumanEval/gpt-3.5) to at most 2.18% across all evaluated LLMs.",
    207       "evidence": "Table 3: CDD detection results showing leak ratios of 0.73%–2.18% for EvoCodeBench-2403 vs 41.47% for HumanEval.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "The highest Pass@1 on EvoCodeBench-2403 is only 20.73% (gpt-4), far below 53.04% on DevEval, suggesting prior benchmarks were inflated by contamination.",
    212       "evidence": "Table 4 shows gpt-4 Local File Infilling Pass@1 = 20.73%; the 53.04% DevEval figure is cited from prior work [16].",
    213       "supported": "moderate"
    214     },
    215     {
    216       "claim": "Providing local file context improves Pass@1 by 100%+ compared to no-context generation across models.",
    217       "evidence": "Table 4: gpt-4 without context 7.27% vs Local File Infilling 20.73% (185% increase); similar gains for all models.",
    218       "supported": "strong"
    219     },
    220     {
    221       "claim": "StarCoder 2-15B performs comparably to gpt-4 in the Database domain despite being much smaller overall.",
    222       "evidence": "Table 6: Database Pass@1 — gpt-4 38.89%, StarCoder 2-15B 38.89%. Table 7: StarCoder 2-15B has comfort domain status in Database (DSI > 10%).",
    223       "supported": "moderate"
    224     },
    225     {
    226       "claim": "gpt-4 underperforms relative to other models in the Internet domain despite leading overall.",
    227       "evidence": "Table 6: Internet Pass@1 — gpt-4 20.00% vs gpt-3.5/DeepSeek Coder 26.67%. Table 7: Internet is a strange domain for gpt-4 (DSI < -10%).",
    228       "supported": "moderate"
    229     },
    230     {
    231       "claim": "Auto-generated requirements by gpt-4 are comparable to human-written ones in 96.7% of cases.",
    232       "evidence": "Table 8: Human evaluators rated 30 Win, 236 Tie, 9 Lose for gpt-4 requirements; Cohen's Kappa = 0.9 among evaluators.",
    233       "supported": "moderate"
    234     }
    235   ],
    236   "methodology_tags": [
    237     "benchmark-eval"
    238   ],
    239   "key_findings": "EvoCodeBench is a repo-level Python code generation benchmark that addresses data contamination via temporal splitting (repositories created Oct 2023–Mar 2024), reducing measured leakage from 41.47% to under 3%. LLM performance on this benchmark is dramatically lower than on established benchmarks (gpt-4 tops out at 20.73% Pass@1 vs 53.04% on DevEval), suggesting prior evaluations were inflated by contamination. Domain-specific analysis reveals meaningful performance heterogeneity: overall model rankings do not hold within specific domains (e.g., StarCoder 2-15B matches gpt-4 in Database). An automated 4-stage pipeline enables the benchmark to be refreshed every 6 months to maintain temporal freshness.",
    240   "red_flags": [
    241     {
    242       "flag": "Trivially small domain samples",
    243       "detail": "Several domains have extremely small sample counts: Security n=1, Utilities n=2, Communications n=8, Text Processing n=12. Domain-specific conclusions for these categories are statistically meaningless and should not be reported."
    244     },
    245     {
    246       "flag": "Circular LLM annotation validation",
    247       "detail": "Requirements and domain labels are generated by gpt-4, then quality is partially assessed via human evaluation — but the 96.7% win-or-tie rate is measured by humans comparing gpt-4 outputs to human-written ones; it is not an independent quality audit of whether the gpt-4 requirements accurately capture function semantics."
    248     },
    249     {
    250       "flag": "No human performance baseline",
    251       "detail": "The paper establishes no human baseline for the code generation tasks, making it impossible to assess whether benchmarked scores represent meaningful capability gaps or near-human performance."
    252     },
    253     {
    254       "flag": "Leakage vs difficulty confound",
    255       "detail": "The claim that lower EvoCodeBench scores indicate leakage in prior benchmarks does not control for task difficulty differences; recently created repositories and different selection criteria may produce inherently harder tasks independent of contamination."
    256     },
    257     {
    258       "flag": "Alibaba author conflict",
    259       "detail": "Three authors are from Alibaba Group, which has commercial interests in code LLM development; no competing interests statement is present despite this affiliation."
    260     },
    261     {
    262       "flag": "No ceiling/floor effect analysis",
    263       "detail": "Without-context Pass@1 scores (~5-8%) approach floor levels and are not analyzed for whether the no-context condition meaningfully discriminates among models."
    264     }
    265   ],
    266   "cited_papers": [
    267     {
    268       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    269       "relevance": "Primary baseline benchmark that EvoCodeBench is designed to improve upon; used to demonstrate data leakage rates."
    270     },
    271     {
    272       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    273       "relevance": "Closest prior work for repo-level code generation; EvoCodeBench directly compares against it for leakage, dependency statistics, and performance."
    274     },
    275     {
    276       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-Trained Models",
    277       "relevance": "Repo-level benchmark with known contamination issues that EvoCodeBench is designed to address."
    278     },
    279     {
    280       "title": "ClassEval: Evaluating Large Language Models in Class-level Code Generation",
    281       "relevance": "Domain-annotated benchmark compared to EvoCodeBench in Table 2; cited for cost of manual annotation (500 person-hours)."
    282     },
    283     {
    284       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    285       "relevance": "Contemporary contamination-resistant benchmark using competitive programming; EvoCodeBench distinguishes itself as repo-level vs snippet-level."
    286     },
    287     {
    288       "title": "EvoEval: Evolving Coding Benchmarks via LLM",
    289       "relevance": "Alternative contamination-resistant approach (LLM mutation of HumanEval); compared to EvoCodeBench's temporal filtering approach."
    290     },
    291     {
    292       "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models (CDD)",
    293       "relevance": "The leakage detection method (CDD) used to validate EvoCodeBench's contamination resistance claims."
    294     },
    295     {
    296       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    297       "relevance": "One of the primary evaluated code LLMs; DeepSeek Coder-33B achieves the best dependency recall on EvoCodeBench."
    298     },
    299     {
    300       "title": "Datasheets for Datasets",
    301       "relevance": "Framework used for EvoCodeBench's Appendix C dataset documentation."
    302     }
    303   ],
    304   "engagement_factors": {
    305     "practical_relevance": {
    306       "score": 2,
    307       "justification": "Practitioners can use EvoCodeBench to evaluate code LLMs for domain-specific selection, and the domain taxonomy provides actionable guidance."
    308     },
    309     "surprise_contrarian": {
    310       "score": 1,
    311       "justification": "The finding that gpt-4 achieves only 20.73% Pass@1 (vs 53.04% on DevEval) suggests prior benchmarks may be leaked, though this concern was already growing in the community."
    312     },
    313     "fear_safety": {
    314       "score": 0,
    315       "justification": "No safety or security concerns are raised by this benchmark evaluation paper."
    316     },
    317     "drama_conflict": {
    318       "score": 1,
    319       "justification": "Implicit criticism that existing benchmarks (especially HumanEval with 41.47% leakage) are unreliable, but presented diplomatically."
    320     },
    321     "demo_ability": {
    322       "score": 2,
    323       "justification": "Code and data released on GitHub and HuggingFace; researchers can download and evaluate their own models on the benchmark."
    324     },
    325     "brand_recognition": {
    326       "score": 1,
    327       "justification": "From Peking University and Alibaba, published at NeurIPS; evaluates well-known models (gpt-4, DeepSeek Coder, StarCoder 2)."
    328     }
    329   },
    330   "hn_data": {
    331     "threads": [
    332       {
    333         "hn_id": "42172392",
    334         "title": "Epipolar-Free 3D Gaussian Splatting for Generalizable Novel View Synthesis",
    335         "points": 2,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=42172392"
    338       },
    339       {
    340         "hn_id": "42007858",
    341         "title": "Universality of the π²/6 Pathway in Avoiding Model Collapse [pdf]",
    342         "points": 1,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=42007858"
    345       }
    346     ],
    347     "top_points": 2,
    348     "total_points": 3,
    349     "total_comments": 0
    350   }
    351 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs