scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19202B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
      6     "authors": [
      7       "Jia Li",
      8       "Ge Li",
      9       "Xuanming Zhang",
     10       "Yunfei Zhao",
     11       "Yihong Dong",
     12       "Zhi Jin",
     13       "Binhua Li",
     14       "Fei Huang",
     15       "Yongbin Li"
     16     ],
     17     "year": 2024,
     18     "venue": "Neural Information Processing Systems",
     19     "arxiv_id": "2410.22821",
     20     "doi": "10.48550/arXiv.2410.22821"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Key abstract claims — leak rate reduction (41.47%→2.18%), gpt-4 Pass@1 of 20.74%, gpt-4 Internet domain weakness, StarCoder 2-15B Database strength — are directly supported by Table 3 (leak rates), Table 4 (Pass@1), and Tables 6–7 (both domain-level claims).",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper claims 'LLMs benefit from more code contexts' and attributes 104–152% improvements to 'domain knowledge contained in contexts,' but this is observational (comparing prompt settings); no controls rule out alternative explanations such as reduced generation length or structural constraint effects.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper repeatedly claims EvoCodeBench 'reveals the actual abilities of LLMs in real-world repositories,' a broad generalization from 275 Python-only samples from 25 repositories; the gap between test-passing and real-world development ability is not bounded.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper does not distinguish whether lower scores vs. prior benchmarks reflect contamination removal versus inherently harder repo-level tasks; no alternative explanations for context improvement or domain variation are considered.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Pass@k (test case execution) is equated with 'actual abilities in real-world repositories' without acknowledging the gap between passing provided test cases and broader software development competence.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 4 contains a dedicated 'Limitations' paragraph explicitly discussing the monolingual (Python only) scope and small dataset size relative to some prior benchmarks.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The limitations discussion is generic (language coverage and size) and omits specific threats such as test case quality variability, LLM-annotation bias, or the near-zero sample count in Security (1) and Utilities (2) domains undermining domain-specific claims.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper explicitly bounds its scope to Python and to repositories created Oct 2023–Mar 2024, and notes that Pass@k scores from different benchmark versions are not comparable.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Funding is disclosed in the Acknowledgements section: National Natural Science Foundation of China (multiple grants), National Key R&D Program, and Major Program of Hubei Province.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are clearly listed on the title page: Peking University, Bytedance, and Alibaba Group.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Primary funders are Chinese government research grants (NSFC, national R&D programs) with no financial stake in benchmark outcomes; the paper evaluates competitors' models (OpenAI, DeepSeek, Meta, BigCode).",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "There is no competing interests statement, no disclosure of patents, equity, or consulting relationships; Alibaba-affiliated co-authors' potential interests in competing code models are not addressed.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms are defined: 'repo-level code generation' (Section 2.2), 'data leakage' (Section 1), 'Domain-Specific Improvement/DSI' (Equation 3), 'comfort/strange domains' (threshold T=10%), Pass@k (Equation 1), and Recall@k (Equation 2).",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper explicitly states three contributions: evolving data to prevent leakage, a 10-domain taxonomy with labels, and domain-specific evaluation metrics (DSI, comfort/strange domains).",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 5 systematically compares EvoCodeBench with HumanEval, MBPP, ClassEval, CoderEval, DevEval, LiveCodeBench, and EvoEval, explaining differences in scope, approach, and methodology; Table 2 provides a structured comparison.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "benchmark-creation": {
    124       "construct_design": {
    125         "construct_validity_argued": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The paper argues construct validity by demonstrating that EvoCodeBench-2403 aligns with 500 real-world repositories on code distribution and dependency distribution (Table 2), supporting the claim that it approximates real-world coding conditions.",
    129           "source": "haiku"
    130         },
    131         "difficulty_distribution_characterized": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No difficulty tiers (easy/medium/hard) are defined or measured; the paper reports aggregate Pass@k but does not characterize the distribution of item difficulty across the 275 samples.",
    135           "source": "haiku"
    136         },
    137         "ceiling_floor_effects_checked": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No ceiling effect exists (top Pass@1 ≈20%), but floor effects are not addressed: in the Text Processing domain (12 samples), 7 of 8 models score 0% Pass@1 — a measurement validity concern not discussed.",
    141           "source": "haiku"
    142         },
    143         "human_baseline_included": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No human baseline for the coding task is provided; human evaluation is conducted only for annotation quality (requirements and domain labels), not for benchmark programming tasks themselves.",
    147           "source": "haiku"
    148         },
    149         "scoring_rubric_justified": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Pass@k (functional correctness via test execution) is established from prior work with formula provided (Equation 1); Recall@k is introduced with clear formula (Equation 2) and justified as measuring dependency utilization in repo-level generation.",
    153           "source": "haiku"
    154         }
    155       },
    156       "robustness": {
    157         "contamination_resistance_designed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Contamination resistance is core to the design: repositories created after most LLMs' training cutoffs (Oct 2023–Mar 2024); CDD detection validates leak rates of 0.73%–2.18% across all evaluated models (Table 3).",
    161           "source": "haiku"
    162         },
    163         "temporal_robustness_discussed": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Temporal robustness is explicitly designed in via the 'evolving' mechanism (new versions planned every ~6 months), and the paper notes that Pass@k scores are not comparable across versions.",
    167           "source": "haiku"
    168         },
    169         "failure_modes_discussed": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The paper does not discuss failure modes of the benchmark itself, such as test case quality variability, gaming via function name similarity (used in RAG baseline), or consistent biases introduced by LLM-generated requirements.",
    173           "source": "haiku"
    174         },
    175         "baseline_implementations_provided": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "All prompts, LLM completions, and code are released on GitHub and HuggingFace, with Croissant metadata provided, enabling full reproduction of reported results.",
    179           "source": "haiku"
    180         }
    181       },
    182       "documentation": {
    183         "dataset_documentation_complete": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "A comprehensive datasheet (Appendix C, following Datasheets for Datasets v8 [9]) covers motivation, composition, collection process, preprocessing, uses, distribution, and maintenance with detailed answers to all standard questions.",
    187           "source": "haiku"
    188         },
    189         "licensing_and_access_clear": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Dataset is under CC-4.0 license, code under BSD 3-Clause; access via GitHub and HuggingFace is specified with long-term maintenance commitment from the SEKE team at Peking University.",
    193           "source": "haiku"
    194         },
    195         "intended_use_specified": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Intended use (evaluating LLMs in repo-level code generation) is clearly stated; alternative uses are mentioned (code completion, test generation, summarization); however, explicit guidance on what should NOT be concluded is absent.",
    199           "source": "haiku"
    200         }
    201       }
    202     }
    203   },
    204   "claims": [
    205     {
    206       "claim": "EvoCodeBench reduces data leakage from 41.47% (HumanEval/gpt-3.5) to at most 2.18% across all tested models.",
    207       "evidence": "CDD detection results in Table 3 show leak rates of 0.73%–2.18% for EvoCodeBench-2403 vs. 41.47% for HumanEval with gpt-3.5.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "GPT-4's highest Pass@1 on EvoCodeBench-2403 is only 20.73%, far below its 53.04% on the prior repo-level benchmark DevEval.",
    212       "evidence": "Table 4 shows gpt-4 Pass@1 of 20.73% (Local File Infilling); the DevEval comparison is stated directly in Section 3.3.",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "LLMs benefit substantially from code context: adding local file context improves gpt-4 Pass@1 by 104–152%.",
    217       "evidence": "Table 4 shows gpt-4 at 7.27% (Without Context) vs. 17.45% (Completion) and 20.73% (Infilling); the percentages are stated in Section 3.3, but the causal mechanism is assumed, not tested.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Domain-specific ranking diverges from overall ranking: gpt-4 underperforms in the Internet domain despite leading overall.",
    222       "evidence": "Table 6 shows gpt-4 Pass@1 of 20.00% in Internet vs. 26.67% for gpt-3.5 and DeepSeek Coder; Table 7 confirms gpt-4 DSI of -28.59% in Internet.",
    223       "supported": "strong"
    224     },
    225     {
    226       "claim": "StarCoder 2-15B unexpectedly performs as well as GPT-4 in the Database domain, outperforming larger 33B models.",
    227       "evidence": "Table 6 shows StarCoder 2-15B at 38.89% in Database, equal to gpt-4, gpt-3.5, and DeepSeek Coder 33B; Table 7 confirms positive DSI for StarCoder 2-7B vs. negative for DeepSeek Coder 33B.",
    228       "supported": "strong"
    229     },
    230     {
    231       "claim": "GPT-4-generated annotations are comparable to human-written ones: 96.7% requirement quality and 98.5% domain label agreement.",
    232       "evidence": "Human evaluation in Table 8 with Cohen's Kappa of 0.9 among evaluators; gpt-4 wins/ties on (30+236)/275=96.7% of requirements and (3+268)/275=98.5% of domain labels.",
    233       "supported": "strong"
    234     }
    235   ],
    236   "methodology_tags": [
    237     "benchmark-eval",
    238     "qualitative"
    239   ],
    240   "key_findings": "EvoCodeBench-2403 achieves data leak rates under 2.2% by restricting to repositories created after major LLMs' training cutoffs (Oct 2023–Mar 2024), compared to 41.47% for HumanEval. All tested models score dramatically lower than on prior benchmarks (gpt-4 Pass@1: 20.73% vs. 53.04% on DevEval), suggesting significant contamination in existing evaluations. Domain-specific evaluation reveals that overall ranking does not predict domain-level ranking: gpt-4 underperforms in Internet while StarCoder 2-15B matches gpt-4 in Database despite being smaller. The benchmark's evolving design — planned updates every ~6 months from new repositories — is the primary mechanism for sustained contamination resistance.",
    241   "red_flags": [
    242     {
    243       "flag": "Severely skewed domain distribution",
    244       "detail": "Scientific Engineering has 120 samples while Security has 1 and Utilities has 2; domains with <10 samples are excluded from domain analysis, quietly undercutting the '10 domain coverage' contribution."
    245     },
    246     {
    247       "flag": "LLM evaluated on its own annotations",
    248       "detail": "GPT-4 generates the natural language requirements and domain labels, then GPT-4 is one of the 8 evaluated models; requirements phrased in GPT-4's style may structurally advantage GPT-4 without the authors acknowledging this bias."
    249     },
    250     {
    251       "flag": "No human baseline on benchmark coding tasks",
    252       "detail": "Without a human ceiling, the practical significance of 20% Pass@1 cannot be calibrated — it is unclear whether the tasks are difficult or trivially easy for an expert developer."
    253     },
    254     {
    255       "flag": "Floor effects in Text Processing not addressed",
    256       "detail": "7 of 8 models score 0% Pass@1 in Text Processing (12 samples); this is a measurement validity concern for that domain that is not acknowledged or analyzed."
    257     },
    258     {
    259       "flag": "Causal attribution without controls",
    260       "detail": "Performance improvement from code context is attributed to 'domain knowledge in contexts,' but confounds such as reduced generation length, structural constraints, or hints about function existence are not ruled out."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    266       "relevance": "Foundational snippet-level benchmark that EvoCodeBench explicitly addresses for data leakage; provides the 41.47% contamination baseline for comparison."
    267     },
    268     {
    269       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    270       "relevance": "Primary prior-art repo-level benchmark; EvoCodeBench is directly compared to DevEval on methodology and Pass@k results."
    271     },
    272     {
    273       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-Trained Models",
    274       "relevance": "Another repo-level benchmark compared in Table 2; establishes the dependency-aware evaluation paradigm EvoCodeBench extends."
    275     },
    276     {
    277       "title": "Evaluating Large Language Models in Class-Level Code Generation (ClassEval)",
    278       "relevance": "Benchmark with manually designed domain labels; EvoCodeBench addresses limitations of its narrow domain coverage and potential future leakage."
    279     },
    280     {
    281       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    282       "relevance": "Contemporary contamination-free benchmark for snippet-level competitive programming; contrasted with EvoCodeBench's repo-level approach."
    283     },
    284     {
    285       "title": "EvoEval: Evolving Coding Benchmarks via LLM",
    286       "relevance": "Related work on evolving benchmarks using LLM mutation; the paper differentiates EvoCodeBench's temporal split approach from EvoEval's mutation approach."
    287     },
    288     {
    289       "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models (CDD)",
    290       "relevance": "Detection method used to empirically validate EvoCodeBench's contamination resistance claims in Table 3."
    291     },
    292     {
    293       "title": "Datasheets for Datasets",
    294       "relevance": "Framework followed for EvoCodeBench's comprehensive dataset documentation in Appendix C."
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 3,
    300       "justification": "Practitioners can directly use EvoCodeBench to select the best model for their specific programming domain, with immediately available data on GitHub and HuggingFace."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "Challenges the assumption that overall ranking predicts domain-specific performance — a 15B model matches GPT-4 in Database, and GPT-4 trails gpt-3.5 and DeepSeek Coder in Internet despite leading overall."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "No AI safety or risk concerns are raised; the paper focuses exclusively on evaluation methodology."
    309     },
    310     "drama_conflict": {
    311       "score": 1,
    312       "justification": "Implies that prior benchmark results (including HumanEval with 41.47% leakage) are unreliable, but frames this constructively rather than as a direct attack on prior work."
    313     },
    314     "demo_ability": {
    315       "score": 3,
    316       "justification": "Benchmark is immediately runnable from GitHub with full code, prompts, and model completions released for community reuse."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "Peking University and Alibaba DAMO Academy are respectable institutions but not marquee AI labs; the paper evaluates well-known models (GPT-4, DeepSeek Coder) which adds indirect recognition."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "42172392",
    327         "title": "Epipolar-Free 3D Gaussian Splatting for Generalizable Novel View Synthesis",
    328         "points": 2,
    329         "comments": 0,
    330         "url": "https://news.ycombinator.com/item?id=42172392"
    331       },
    332       {
    333         "hn_id": "42007858",
    334         "title": "Universality of the π²/6 Pathway in Avoiding Model Collapse [pdf]",
    335         "points": 1,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=42007858"
    338       }
    339     ],
    340     "top_points": 2,
    341     "total_points": 3,
    342     "total_comments": 0
    343   }
    344 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs