scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19815B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories",
      6     "authors": [
      7       "Jia Li",
      8       "Ge Li",
      9       "Xuanming Zhang",
     10       "Yihong Dong",
     11       "Zhi Jin"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2404.00599",
     16     "doi": "10.48550/arXiv.2404.00599"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims are verified in the paper: gpt-4 Pass@1 of 20.73% is in Table 4, alignment with real-world distributions is shown in Table 2, and the evolving pipeline is described in Section 3.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper compares three controlled context settings (without context, local completion, local infilling) to attribute performance improvements to context availability; this ablation design is adequate for the observational causal claims made.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper explicitly scopes to Python code and English requirements, 275 samples from 25 repositories, and the limitations section acknowledges the benchmark cannot generalize to multilingual settings.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper offers single-interpretation explanations (e.g., 'instruction tuning causes GPT models to be conservative') without considering alternative explanations for the observed GPT vs. open-source Recall@k divergence.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper clearly distinguishes between functional correctness (Pass@k via test execution) and dependency recall (Recall@k via static analysis), acknowledging each measures a different aspect of repository-level code generation.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8 is a dedicated 'Limitations' section listing six specific limitations of both the benchmark and the evaluation experiments.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are quantified: the Recall@k bias from the static parser is measured at 0.16 on 50 manually annotated samples; the monolingual constraint and limited context settings are named as concrete scope restrictions.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states the benchmark is Python-only, English requirements only, and that Pass@k/Recall@k scores are not comparable across benchmark versions.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "There is no funding acknowledgment or grant disclosure anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed as affiliated with 'School of Computer Science, Peking University' on the first page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Funding is not disclosed, so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests statement or conflict of interest declaration in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper defines 'standalone vs. non-standalone functions,' 'repository-level code generation,' 'reference dependencies' (with path format), and formally defines Pass@k and Recall@k with equations.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly enumerates four contributions in Section 1: the five benchmark features, the benchmark itself, the repository-level task definition, and the LLM evaluation results.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 1 systematically compares EvoCodeBench against 10 prior benchmarks on five criteria, and Section 6 situates the work relative to both LLM code generation and repository-level benchmark lines of work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that measuring repository-level coding ability requires non-standalone functions with dependencies, and validates alignment by showing EvoCodeBench-2403's code/dependency distributions match those of 500 real-world repositories (Table 2).",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper characterizes code type (27% standalone, 73% non-standalone) and dependency type distributions, but does not define easy/medium/hard difficulty tiers or provide a formal difficulty analysis of benchmark items.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "All LLMs score between ~5-21% Pass@1, indicating a likely floor effect that is acknowledged qualitatively ('far from practical applications') but not formally analyzed as a benchmark validity concern.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human performance baseline on the code generation task is provided; the human comparison in Section 5 is for annotation quality of requirements, not task performance.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Pass@k is justified by convention and prior work; Recall@k is motivated by the need to assess dependency usage beyond functional correctness, and its bias is quantified at 0.16 via 50 manually annotated programs.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Contamination resistance is a core design feature: EvoCodeBench-2403 collects from repositories created October 2023–February 2024, after the training data cutoff (September 2023) of the most recent LLM evaluated.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper explicitly plans dynamic updates every 6 months and warns that Pass@k/Recall@k are not comparable across versions, directly addressing temporal obsolescence.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper identifies specific failure modes: Recall@k undercounts due to Python dynamic typing (quantified), auto-generated requirements may miss details, and the benchmark covers only English/Python.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The paper releases all prompt templates (Figures 6-10), LLM completions, and the full benchmark at the GitHub repository, enabling reproduction of reported numbers.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 7 provides per-repository metadata (creation date, star count, file counts, line counts, sample counts, domain), and the 6-stage collection pipeline is described in detail in Section 3.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The benchmark GitHub URL is provided and the paper states it is released publicly, but no explicit license (MIT, CC, Apache, etc.) is stated for the benchmark data or code.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The paper specifies that EvoCodeBench should be used for repository-level code generation evaluation and explicitly states that results are not comparable across benchmark versions.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "GPT-4's highest Pass@1 on EvoCodeBench is only 20.73%, compared to ~80% on HumanEval.",
    203       "evidence": "Table 4 shows gpt-4-turbo-1106 achieves Pass@1 of 20.73% in the local file infilling setting; gpt-4 achieves 88.4 on HumanEval as stated in Section 4.4.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "EvoCodeBench's code and dependency distributions are consistent with 500 real-world repositories.",
    208       "evidence": "Table 2 shows EvoCodeBench-2403 has 27%/73% standalone/non-standalone split and 3.46 avg dependencies, matching the 500-repository sample (27%/73%, 3.22 avg).",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Introducing local file context improves gpt-4 Pass@1 by 104-152% over the no-context baseline.",
    213       "evidence": "Table 4 shows gpt-4 goes from 7.27% (no context) to 17.45% (local completion, +140%) and 20.73% (local infilling, +185%), though the 104-152% range cited in the text appears to be approximate.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Auto-generated requirements from GPT-4 are comparable in quality to human-written requirements.",
    218       "evidence": "Table 5 shows GPT-4 and human developers tied on 41/50 functions; Cohen's Kappa between evaluators is 0.92, and GPT-4 wins on 5 functions vs. humans winning on 4.",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "GPT-family models have higher Pass@k but lower Recall@k than other models due to instruction tuning.",
    223       "evidence": "Table 4 shows gpt-4 and gpt-3.5 achieve the highest Pass@1 but relatively lower Recall@1 (68.24% and 61.94%) compared to DeepSeek Coder 33B (71.46%); the explanation is speculative.",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Most LLM failures (29/50 analyzed) are due to implementation logic errors, not missing context.",
    228       "evidence": "Manual analysis of 50 error cases for gpt-4 in the local file infilling setting is reported in Section 4.4, with 29 logic errors and 20 missing-context failures.",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "observational"
    235   ],
    236   "key_findings": "EvoCodeBench demonstrates that existing code generation benchmarks significantly overestimate LLM capabilities: GPT-4 achieves only 20.73% Pass@1 on repository-level tasks versus ~80% on HumanEval. The benchmark's key innovations are temporal contamination resistance (collecting from repos created after LLM training cutoffs) and distribution alignment with 500 real-world repositories. Introducing local file context improves performance substantially (up to 2-3x), indicating that dependency-awareness is a major gap in current LLMs.",
    237   "red_flags": [
    238     {
    239       "flag": "No human task baseline",
    240       "detail": "Human performance on the code generation task itself is not measured; only the annotation quality of requirements is compared to humans, leaving no upper bound for benchmark interpretation."
    241     },
    242     {
    243       "flag": "LLM-generated annotations",
    244       "detail": "Natural language requirements are generated by GPT-4, the same model evaluated as top performer, creating a potential systematic advantage for GPT-4 which may be better calibrated to its own annotation style."
    245     },
    246     {
    247       "flag": "No license specified",
    248       "detail": "Despite releasing the benchmark publicly, no software or data license is stated, creating legal ambiguity for reuse."
    249     },
    250     {
    251       "flag": "Floor effect not analyzed",
    252       "detail": "All LLMs score 5-21% Pass@1, suggesting a potential floor effect, but the paper does not formally assess whether the benchmark discriminates at that difficulty level or whether the test cases are well-calibrated."
    253     },
    254     {
    255       "flag": "No difficulty characterization",
    256       "detail": "The benchmark lacks a formal difficulty distribution; items are only split by type (standalone/non-standalone) rather than by measured or estimated difficulty, limiting analysis of where models fail."
    257     },
    258     {
    259       "flag": "Small corpus for an 'evolving' benchmark",
    260       "detail": "275 samples from 25 repositories in a single version limits statistical power; comparisons between model sizes and families may be unreliable at this sample size."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    266       "relevance": "Primary baseline benchmark that EvoCodeBench is compared against; establishes Pass@k metric used in this paper."
    267     },
    268     {
    269       "title": "Program Synthesis with Large Language Models (MBPP)",
    270       "relevance": "Another baseline standalone-function benchmark contrasted with EvoCodeBench's repository-level approach."
    271     },
    272     {
    273       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    274       "relevance": "Most similar prior benchmark with non-standalone functions; EvoCodeBench directly improves on its annotation comprehensiveness."
    275     },
    276     {
    277       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    278       "relevance": "Contemporaneous repository-level benchmark; contrasted as issue-repair vs. EvoCodeBench's code generation task."
    279     },
    280     {
    281       "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems",
    282       "relevance": "Related repository-level completion benchmark lacking natural language requirements; contrasted in related work."
    283     },
    284     {
    285       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    286       "relevance": "Top-performing open-source model evaluated on EvoCodeBench; training data cutoff used to justify contamination resistance design."
    287     },
    288     {
    289       "title": "Repository-Level Prompt Generation for Large Language Models of Code",
    290       "relevance": "Prior work on repository-level context extraction that motivates EvoCodeBench's experimental settings."
    291     },
    292     {
    293       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    294       "relevance": "Related cross-file repository benchmark discussed in related work, contrasted with EvoCodeBench's full-generation task."
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 3,
    300       "justification": "Practitioners can directly use this benchmark to evaluate LLMs for real-world repository coding tasks, with the GitHub release and planned updates."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "The 80% → 20% Pass@1 drop from HumanEval to EvoCodeBench for GPT-4 is a striking finding that challenges conventional benchmark-based assessments of LLM coding ability."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "No AI safety or risk concerns are raised; the paper is a purely technical benchmark contribution."
    309     },
    310     "drama_conflict": {
    311       "score": 1,
    312       "justification": "Implicit criticism of existing benchmarks as inadequate is present but not framed as controversy; the tone is constructive."
    313     },
    314     "demo_ability": {
    315       "score": 3,
    316       "justification": "The benchmark is publicly released with a GitHub link, prompt templates, and LLM completions, enabling immediate replication and use."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "Authors are from Peking University, a respected institution, but not an industry AI lab; no famous product affiliation."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "38853706",
    327         "title": "Possible Meissner effect near room temperature: copper-substituted lead apatite",
    328         "points": 729,
    329         "comments": 318,
    330         "url": "https://news.ycombinator.com/item?id=38853706"
    331       },
    332       {
    333         "hn_id": "28757897",
    334         "title": "GitHub Repositories with Links to Academic Papers [pdf]",
    335         "points": 59,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=28757897"
    338       },
    339       {
    340         "hn_id": "40383885",
    341         "title": "Special Characters Attack: Toward Scalable Training Data Extraction from LLMs",
    342         "points": 10,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=40383885"
    345       },
    346       {
    347         "hn_id": "40282999",
    348         "title": "Proof of the Geometric Langlands Conjecture Part 1/5",
    349         "points": 8,
    350         "comments": 1,
    351         "url": "https://news.ycombinator.com/item?id=40282999"
    352       },
    353       {
    354         "hn_id": "38850232",
    355         "title": "LK99: Possible Meissner effect near room temperature",
    356         "points": 6,
    357         "comments": 2,
    358         "url": "https://news.ycombinator.com/item?id=38850232"
    359       },
    360       {
    361         "hn_id": "28759597",
    362         "title": "Voltage-Gate Assisted Spin-Orbit Torque Magnetic Random Access Memory",
    363         "points": 3,
    364         "comments": 0,
    365         "url": "https://news.ycombinator.com/item?id=28759597"
    366       },
    367       {
    368         "hn_id": "42043783",
    369         "title": "MarsCode Agent: AI-Native Automated Bug Fixing",
    370         "points": 1,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=42043783"
    373       },
    374       {
    375         "hn_id": "40588050",
    376         "title": "Empirical influence functions to understand the logic of fine-tuning",
    377         "points": 1,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=40588050"
    380       },
    381       {
    382         "hn_id": "39429077",
    383         "title": "Hydragen: High-Throughput LLM Inference with Shared Prefixes",
    384         "points": 1,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=39429077"
    387       },
    388       {
    389         "hn_id": "22819031",
    390         "title": "Neural network based country wise risk prediction of Covid-19",
    391         "points": 1,
    392         "comments": 0,
    393         "url": "https://news.ycombinator.com/item?id=22819031"
    394       }
    395     ],
    396     "top_points": 729,
    397     "total_points": 819,
    398     "total_comments": 321
    399   }
    400 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs