scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (22887B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories",
      6     "authors": [
      7       "Jia Li",
      8       "Ge Li",
      9       "Xuanming Zhang",
     10       "Yihong Dong",
     11       "Zhi Jin"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2404.00599",
     16     "doi": "10.48550/arXiv.2404.00599"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims gpt-4's highest Pass@1 is 20.73%, supported by Table 4 (Local File Infilling). Claims about alignment with real-world distributions are supported by Table 2 comparing EvoCodeBench-2403 statistics with 500 real repositories (27% standalone, 73% non-standalone, avg 3.46 vs 3.22 dependencies).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper claims code contexts improve performance ('Pass@1 of gpt-4 is improved by 104% and 152%'). This causal claim is supported by controlled comparison: the same models are evaluated under three conditions (no context, completion, infilling) with only the context variable changing, which constitutes adequate single-variable manipulation.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims alignment with 'Real-World Code Repositories' and the abstract says results 'reveal the coding abilities of these LLMs in real-world repositories,' but the benchmark is Python-only with English requirements from 25 repositories. The paper acknowledges this in Section 8 but the abstract and title do not qualify the scope.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes performance differences primarily to context availability and model design (instruction tuning vs. standard LM training) but does not consider other potential explanations such as prompt formatting effects, repository difficulty variance, or whether the specific 25 repositories are representative. The observation about GPT family's higher Pass@k but lower Recall@k receives only speculation, not substantive analysis.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures Pass@k (functional correctness via test cases) and Recall@k (dependency recall via static analysis). These are presented as specific metrics, not over-framed as broader constructs. The paper discusses the bias in Recall@k explicitly (Section 5), noting the parser may miss runtime-determined dependencies and quantifying the bias at 0.16.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8 'Limitations' is a dedicated section with six specific limitation points covering monolingual scope, auto-generated requirement quality, Recall@k bias, limited LLMs evaluated, limited context exploration, and hyperparameter sensitivity.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 8 discusses specific threats: 'EvoCodeBench is a monolingual benchmark (requirements in English and code in Python),' 'auto-generated requirements... may lack necessary details (e.g., hyper-parameters),' Recall@k bias quantified at 0.16, 'limited computing budgets' constrained the number of LLMs evaluated, and 'we do not carefully tune hyper-parameters and prompts.'",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 8 explicitly states what was not tested: other programming languages, other natural languages, more LLMs, cross-file context extraction methods, and hyperparameter tuning. Each is identified as a specific gap with future work plans.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding sources, grants, or acknowledgments section is present in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors are clearly identified as affiliated with 'School of Computer Science, Peking University.' No product being evaluated is affiliated with the authors.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Cannot assess funder independence because no funding source is disclosed. The authors are academic researchers at Peking University with no apparent commercial stake, but the lack of disclosure prevents verification.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Standalone' vs. 'non-standalone' functions are precisely defined with Figure 1 and examples; 'repository-level code generation' is formally defined in Section 2.2; dependency types (intra-class, intra-file, cross-file) are defined with examples.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists four contributions in the Introduction: five benchmark features, EvoCodeBench itself, the repository-level code generation task definition, and evaluation of 10 LLMs with failure analysis.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 discusses both LLM code generation work and existing benchmarks; Table 1 explicitly compares EvoCodeBench against 10 existing benchmarks across five criteria, situating the contribution relative to CoderEval, ClassEval, SWE-bench, and others.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues explicitly that real-world code is predominantly non-standalone (73%) with multiple dependency types, and that measuring LLM performance without these properties fails to capture actual coding ability in practice; this 'why this measures what we claim' argument is developed in Section 1 and validated by the 1M+ function analysis of 500 repositories.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper characterizes code by type (standalone/non-standalone, 27%/73%) and dependency type, but never explicitly characterizes difficulty distribution with easy/medium/hard tiers or measures item-level difficulty; difficulty is assumed to follow from dependency complexity rather than measured.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No explicit ceiling/floor effect analysis is performed; while the low scores (GPT-4 max 20.73%) suggest no ceiling, Qwen 1.5 at 4.00% Pass@1 without context suggests possible floor effects for weaker models, but the paper does not discuss or test for these.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human performance baseline is included on the code generation task; the only human comparison in the paper is annotation quality (Table 5), not task performance, so we cannot judge whether 20.73% is near or far from human-level.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Pass@k is justified by citing prior benchmarks and is computed using the standard unbiased estimator (Equation 1); Recall@k is introduced and justified because Pass@k alone does not capture dependency usage ability; the parser bias (0.16) is empirically measured and discussed in Section 5.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Temporal contamination resistance is explicitly designed in: EvoCodeBench-2403 uses repositories created from 2023-10 to 2024-2, chosen because 'the latest LLM's training data is up to 2023-9,' creating a post-training-cutoff temporal split.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The evolving nature is a core design claim; the paper states the benchmark will be 'dynamically updated every period (e.g., 6 months) to avoid data leakage,' and notes that Pass@k/Recall@k are not comparable across versions.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper discusses benchmark failure modes in the Limitations section and Section 5: Recall@k underestimates true recall due to dynamic typing (quantified), only local-file contexts are explored leaving cross-file coverage as a gap, and monolingual scope limits generalization.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The paper releases 'all prompts, and LLMs' completions for further community analysis' via a GitHub link, enabling full reproduction of their reported baseline numbers.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Collection methodology is described in detail in Section 3 (6 pipeline stages), repository selection criteria are explicit (open-source Python, recent, non-fork, 50+ stars, unit tests), Table 7 lists all 25 repositories with creation date, stars, file counts, and sample counts.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The paper provides a GitHub link and says the benchmark is released, but no explicit license terms (MIT, Apache, CC, etc.) are stated in the paper, and the terms under which others may use or redistribute the benchmark are not addressed.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Intended use is stated: evaluate LLMs on repository-level code generation in Python; the paper explicitly notes that results are not comparable across versions of the benchmark, bounding interpretation of scores.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "EvoCodeBench's code and dependency distributions match those of 500 real-world Python repositories (27% standalone, 73% non-standalone; avg 3.46 vs 3.22 dependencies per program).",
    203       "evidence": "Table 2 directly compares EvoCodeBench-2403 statistics against 500 real repositories across both code distribution and dependency distribution metrics.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "GPT-4 achieves only 20.73% Pass@1 on EvoCodeBench, compared to 88.4% on HumanEval — a dramatic drop showing existing benchmarks overestimate real-world coding ability.",
    208       "evidence": "Table 4 reports 20.73% for gpt-4-turbo-1106 in the Local File (Infilling) setting; the HumanEval comparison is stated in the text of Section 4.4.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Auto-generated requirements (GPT-4) are comparable in quality to human-written requirements, with 92% of cases tied or won by GPT-4.",
    213       "evidence": "Table 5 shows 41 ties, 5 GPT-4 wins, 4 human wins among 50 randomly selected functions; Cohen's Kappa between evaluators is 0.92.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "More repository context consistently improves LLM performance; GPT-4's Pass@1 improves 104% (completion) and 152% (infilling) when local file context is added.",
    218       "evidence": "Table 4 shows GPT-4 Without Context Pass@1=7.27%, Completion=17.45%, Infilling=20.73%; percentage improvements are calculated in Section 4.4.",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "GPT family models have higher Pass@k but lower Recall@k than open-source code LLMs, attributed to instruction-tuning making them conservative about generating dependencies.",
    223       "evidence": "Table 4 supports the pattern (e.g., DeepSeek Coder 33B Recall@1=71.46% vs GPT-4=68.24% in infilling despite lower Pass@1); explanation is speculative reasoning about training objectives.",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Recall@k underestimates true dependency recall by only 0.16 due to Python's dynamic typing, small relative to inter-LLM variation of 7.77.",
    228       "evidence": "50 programs were manually annotated and compared against parser output to estimate the bias; the comparison to inter-LLM variation contextualizes the magnitude.",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Temporal separation (repositories post-dating LLM training cutoffs) provides contamination resistance.",
    233       "evidence": "Repos from 2023-10 to 2024-2 used; authors cite that the latest LLM's training data ends at 2023-9. No empirical test of contamination is performed — this is a design argument.",
    234       "supported": "moderate"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "benchmark-eval"
    239   ],
    240   "key_findings": "EvoCodeBench reveals that top LLMs perform dramatically worse on real-world repository-level code generation (GPT-4 max 20.73% Pass@1) than on isolated function benchmarks like HumanEval (88.4%), exposing a large gap between benchmark and real-world coding ability. The benchmark's code and dependency distributions match those of 500 real-world Python repositories, providing more realistic coverage than prior work. Repository context is critical: adding local file context improves GPT-4 Pass@1 by up to 152%, and the primary failure modes are implementation logic errors (29/50 analyzed cases) and missing cross-file context (20/50). Auto-generated requirements using GPT-4 are comparable to human-written ones (92% equivalent quality) at a fraction of the cost, validating a scalable annotation approach.",
    241   "red_flags": [
    242     {
    243       "flag": "No human performance baseline",
    244       "detail": "The paper reports LLM Pass@1 values but provides no human developer baseline, making it impossible to assess how far models are from human-level performance on the same tasks."
    245     },
    246     {
    247       "flag": "Circular requirement generation",
    248       "detail": "Requirements are generated by GPT-4, which is also one of the evaluated models; while the quality comparison (Table 5) uses separate evaluators, there is a potential circularity in using GPT-4 to write the task requirements that GPT-4 is then evaluated on."
    249     },
    250     {
    251       "flag": "Small benchmark size and limited repo diversity",
    252       "detail": "275 samples from 25 repositories, skewed heavily toward image processing and deep learning domains (Table 7); 54/275 samples come from a single repository (camp_zipnerf), raising diversity concerns."
    253     },
    254     {
    255       "flag": "No funding disclosure",
    256       "detail": "No funding sources or conflicts of interest are disclosed anywhere in the paper, making it impossible to assess potential biases."
    257     },
    258     {
    259       "flag": "No licensing terms stated",
    260       "detail": "The benchmark is released publicly but no license is specified in the paper, creating legal uncertainty for downstream users."
    261     },
    262     {
    263       "flag": "Contamination resistance is assumed, not tested",
    264       "detail": "The temporal split is designed to prevent contamination, but no membership inference or other empirical test of actual contamination is performed to validate the assumption."
    265     }
    266   ],
    267   "cited_papers": [
    268     {
    269       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    270       "relevance": "Primary comparison baseline; EvoCodeBench's motivation is the gap between HumanEval's 88.4% GPT-4 score and real-world performance."
    271     },
    272     {
    273       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-Trained Models",
    274       "relevance": "Closest prior work on non-standalone code generation benchmarks; EvoCodeBench extends it with dependency paths, test cases, and evolving design."
    275     },
    276     {
    277       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    278       "relevance": "Contemporary repository-level benchmark; contrasted with EvoCodeBench (issue repair vs. new code generation)."
    279     },
    280     {
    281       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    282       "relevance": "One of the top-performing open-source LLMs evaluated; training cutoff date used to define the temporal split for contamination avoidance."
    283     },
    284     {
    285       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    286       "relevance": "Evaluated LLM; training data cutoff (2023-9) is cited as justification for the temporal boundary of EvoCodeBench-2403."
    287     },
    288     {
    289       "title": "Repository-Level Prompt Generation for Large Language Models of Code",
    290       "relevance": "Prior work on repository-level context extraction that motivates the experimental settings (local file completion/infilling)."
    291     },
    292     {
    293       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation",
    294       "relevance": "Prior benchmark compared in Table 1 and Table 2; shows class-level but not repository-level evaluation."
    295     },
    296     {
    297       "title": "Program Synthesis with Large Language Models (MBPP)",
    298       "relevance": "Standard benchmark used for comparison; all standalone functions, contrasted with EvoCodeBench's mixed distribution."
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 2,
    304       "justification": "Released benchmark and evaluation framework can be used by researchers and practitioners to evaluate code LLMs on realistic tasks, though it requires significant setup."
    305     },
    306     "surprise_contrarian": {
    307       "score": 1,
    308       "justification": "The finding that gpt-4 drops from 88.4% on HumanEval to 20.73% on real-world code is notable but broadly expected by practitioners who know HumanEval is easy."
    309     },
    310     "fear_safety": {
    311       "score": 0,
    312       "justification": "No AI safety or security concerns raised; purely a benchmark evaluation paper."
    313     },
    314     "drama_conflict": {
    315       "score": 1,
    316       "justification": "Implicitly argues that existing benchmarks like HumanEval are misleading about LLM coding abilities, but presents this as a gap to fill rather than a controversy."
    317     },
    318     "demo_ability": {
    319       "score": 2,
    320       "justification": "Code and data released on GitHub; researchers can download and run evaluations, though it requires setting up repositories and test environments."
    321     },
    322     "brand_recognition": {
    323       "score": 1,
    324       "justification": "From Peking University (well-known in CS); evaluates recognizable models (gpt-4, DeepSeek Coder) but not from a major AI lab."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [
    329       {
    330         "hn_id": "38853706",
    331         "title": "Possible Meissner effect near room temperature: copper-substituted lead apatite",
    332         "points": 729,
    333         "comments": 318,
    334         "url": "https://news.ycombinator.com/item?id=38853706"
    335       },
    336       {
    337         "hn_id": "28757897",
    338         "title": "GitHub Repositories with Links to Academic Papers [pdf]",
    339         "points": 59,
    340         "comments": 0,
    341         "url": "https://news.ycombinator.com/item?id=28757897"
    342       },
    343       {
    344         "hn_id": "40383885",
    345         "title": "Special Characters Attack: Toward Scalable Training Data Extraction from LLMs",
    346         "points": 10,
    347         "comments": 0,
    348         "url": "https://news.ycombinator.com/item?id=40383885"
    349       },
    350       {
    351         "hn_id": "40282999",
    352         "title": "Proof of the Geometric Langlands Conjecture Part 1/5",
    353         "points": 8,
    354         "comments": 1,
    355         "url": "https://news.ycombinator.com/item?id=40282999"
    356       },
    357       {
    358         "hn_id": "38850232",
    359         "title": "LK99: Possible Meissner effect near room temperature",
    360         "points": 6,
    361         "comments": 2,
    362         "url": "https://news.ycombinator.com/item?id=38850232"
    363       },
    364       {
    365         "hn_id": "28759597",
    366         "title": "Voltage-Gate Assisted Spin-Orbit Torque Magnetic Random Access Memory",
    367         "points": 3,
    368         "comments": 0,
    369         "url": "https://news.ycombinator.com/item?id=28759597"
    370       },
    371       {
    372         "hn_id": "42043783",
    373         "title": "MarsCode Agent: AI-Native Automated Bug Fixing",
    374         "points": 1,
    375         "comments": 0,
    376         "url": "https://news.ycombinator.com/item?id=42043783"
    377       },
    378       {
    379         "hn_id": "40588050",
    380         "title": "Empirical influence functions to understand the logic of fine-tuning",
    381         "points": 1,
    382         "comments": 0,
    383         "url": "https://news.ycombinator.com/item?id=40588050"
    384       },
    385       {
    386         "hn_id": "39429077",
    387         "title": "Hydragen: High-Throughput LLM Inference with Shared Prefixes",
    388         "points": 1,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=39429077"
    391       },
    392       {
    393         "hn_id": "22819031",
    394         "title": "Neural network based country wise risk prediction of Covid-19",
    395         "points": 1,
    396         "comments": 0,
    397         "url": "https://news.ycombinator.com/item?id=22819031"
    398       }
    399     ],
    400     "top_points": 729,
    401     "total_points": 819,
    402     "total_comments": 321
    403   }
    404 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs