scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23467B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
      6     "authors": [
      7       "Jia Li",
      8       "Ge Li",
      9       "Yunfei Zhao",
     10       "Yongming Li",
     11       "Huanyu Liu",
     12       "Hao Zhu",
     13       "Lecheng Wang",
     14       "Kaibo Liu",
     15       "Zheng Fang",
     16       "Lanshen Wang",
     17       "Jiazheng Ding",
     18       "Xuanming Zhang",
     19       "Yuqi Zhu",
     20       "Yihong Dong",
     21       "Zhi Jin",
     22       "Binhua Li",
     23       "Fei Huang",
     24       "Yongbin Li"
     25     ],
     26     "year": 2024,
     27     "venue": "Annual Meeting of the Association for Computational Linguistics",
     28     "arxiv_id": "2405.19856",
     29     "doi": "10.48550/arXiv.2405.19856"
     30   },
     31   "checklist": {
     32     "claims_and_evidence": {
     33       "abstract_claims_supported": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Abstract claims that existing benchmarks are poorly aligned with real-world repos are evidenced by Table 2 comparison of distributions across 10 benchmarks vs. 500 real-world repos. Claims about DevEval's features are supported by construction details in Section 3 and evaluation results in Table 5.",
     37         "source": "haiku"
     38       },
     39       "causal_claims_justified": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper claims contexts improve performance (205% improvement for gpt-4) and supports this with controlled comparison across three settings (no context, local file completion, local file infilling). The study design adequately isolates the effect of context availability.",
     43         "source": "haiku"
     44       },
     45       "generalization_bounded": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Scope explicitly bounded in Limitations section: Python only, English requirements only, 8 specific LLMs tested, 117 repositories from 10 domains. The paper appropriately qualifies that results apply to this setting, though some statements could be more cautious.",
     49         "source": "haiku"
     50       },
     51       "alternative_explanations_discussed": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Multiple explanations offered for key findings: performance drops due to context length AND heterogeneity (Section 4.5); successful cases attributed to domain knowledge AND requirement clarity; dependency generation success explained by reasoning from requirements OR guessing from naming conventions.",
     55         "source": "haiku"
     56       },
     57       "proxy_outcome_distinction": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Paper clearly distinguishes between measured outcomes (Pass@k for functional correctness, Recall@k for dependency accuracy) and claimed 'coding abilities.' These are specific, measurable proxies. Caveat: because Python's dynamic typing means some dependencies can only be identified at runtime (Section 5), the static parser misses them and Recall@k underestimates actual dependency recall.",
     61         "source": "haiku"
     62       }
     63     },
     64     "limitations_and_scope": {
     65       "limitations_section_present": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 9 is a dedicated Limitations section with three specific limitations: monolingual scope, Recall@k parser bias, local-file-only contexts. This is not boilerplate; each limitation is substantive.",
     69         "source": "haiku"
     70       },
     71       "threats_to_validity_specific": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Specific threats identified: (1) Monolingual—paper acknowledges Python/English-only limits practitioners' ability to generalize to other languages; (2) Recall@k bias quantified at 0.16 due to dynamic typing; (3) Data leakage risk acknowledged with reasoning about why impact is minimal (Section 5). Section 4.5 identifies specific LLM failure modes (context length, hallucinations).",
     75         "source": "haiku"
     76       },
     77       "scope_boundaries_stated": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Explicit boundaries: Python code, English requirements, 117 repositories from 10 specific domains, evaluation of 8 specified LLMs (Table 4). Monolingual limitation stated in Section 9. Boundaries are stated but not always emphasized in main text.",
     81         "source": "haiku"
     82       }
     83     },
     84     "conflicts_of_interest": {
     85       "funding_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 8 Acknowledgments: 'This research was supported by the National Natural Science Foundation of China (Nos. 62192731, 62152730).' Funding source is clearly disclosed.",
     89         "source": "haiku"
     90       },
     91       "affiliations_disclosed": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "All authors list affiliations: Peking University (institutions 1, 2) or Alibaba Group (institution 3). Affiliations are clearly stated on title page with no hidden conflicts regarding evaluated models.",
     95         "source": "haiku"
     96       },
     97       "funder_independent_of_outcome": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The National Natural Science Foundation of China (NSFC) is a government funder independent of benchmark outcomes. Paper evaluates multiple LLMs (gpt-4, DeepSeek, StarCoder, CodeLLaMa) without favoring Alibaba-affiliated products. Funder has no stake in which LLM performs best.",
    101         "source": "haiku"
    102       },
    103       "financial_interests_declared": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No competing interests statement provided. No disclosure of patents, equity holdings, or consulting relationships. It is standard practice to include such a statement even when no competing interests exist.",
    107         "source": "haiku"
    108       }
    109     },
    110     "scope_and_framing": {
    111       "key_terms_defined": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Code generation: Section 2.2 defines as 'write code based on requirements and repository.' Repository-level code generation defined as 'simulates developers' coding process in a working repository.' Standalone/non-standalone clearly defined in Figure 1 with examples. Dependency types (intra-class, intra-file, cross-file) defined with examples.",
    115         "source": "haiku"
    116       },
    117       "intended_contribution_clear": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract and introduction clearly enumerate contributions: (1) four features for benchmarks, (2) DevEval benchmark itself, (3) repository-level code generation task, (4) evaluation of 8 LLMs with analysis. Contributions are explicit and numbered.",
    121         "source": "haiku"
    122       },
    123       "engagement_with_prior_work": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 1 discusses existing benchmarks (HumanEval, MBPP, APPS, etc.) and their shortcomings. Table 1 compares DevEval to 10 prior benchmarks on four dimensions. Section 6 covers related work comprehensively. Paper clearly positions DevEval as addressing gaps in prior benchmarks.",
    127         "source": "haiku"
    128       }
    129     }
    130   },
    131   "type_checklist": {
    132     "benchmark-creation": {
    133       "construct_design": {
    134         "construct_validity_argued": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "Paper argues that benchmarks must align with real-world code distributions to validly measure coding ability. Evidence: analysis of 1M+ functions from 500 real-world repos (Section 3) showing distributions of standalone/non-standalone code and dependency types. However, argument is somewhat circular (real-world code has these properties, so we should measure on real-world code).",
    138           "source": "haiku"
    139         },
    140         "difficulty_distribution_characterized": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No characterization of task difficulty distribution. No easy/medium/hard tiers identified. Figure 5 shows performance differences by program type but not difficulty level. No difficulty metrics computed or reported.",
    144           "source": "haiku"
    145         },
    146         "ceiling_floor_effects_checked": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table 5 shows Pass@1 ranges 12.7%-53.04% across models/settings, suggesting no extreme ceiling/floor effects. However, paper does not explicitly analyze or discuss whether ceiling/floor effects exist. The discussion of this potential issue is absent.",
    150           "source": "haiku"
    151         },
    152         "human_baseline_included": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No human performance reported on benchmark tasks. Paper mentions 13 developers annotated the data but does not report how well humans complete the code generation tasks themselves. This is a significant omission.",
    156           "source": "haiku"
    157         },
    158         "scoring_rubric_justified": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Pass@k justified by reference to prior work and 'assess functional correctness by executing test cases.' Recall@k justified by 'expect LLMs to invoke relevant dependencies.' Justifications are present but relatively brief and could provide deeper reasoning for metric choices over alternatives.",
    162           "source": "haiku"
    163         }
    164       },
    165       "robustness": {
    166         "contamination_resistance_designed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Paper acknowledges data leakage risk (Section 5) but does not design anti-gaming measures into the benchmark itself. No temporal splits, canary strings, or dynamic generation employed. Mitigation relies on future LLM developers excluding these repositories—a procedural, not technical, safeguard.",
    170           "source": "haiku"
    171         },
    172         "temporal_robustness_discussed": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Paper does not discuss whether benchmark will become gamed, obsoleted, or outdated. Future work mentions updating with more projects/languages/tests, but no anti-gaming strategy or discussion of benchmark longevity is provided.",
    176           "source": "haiku"
    177         },
    178         "failure_modes_discussed": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Paper discusses LLM failure modes well (Section 4.5): struggles with long contexts, generates hallucinations, poor at cross-file dependencies. Benchmark failure modes: Recall@k parser bias (0.16 quantified), dependencies that evade the static parser because of Python's dynamic typing, monolingual limitation. However, discussion focuses more on LLM failures than benchmark limitations.",
    182           "source": "haiku"
    183         },
    184         "baseline_implementations_provided": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "Paper states 'DevEval, prompts, and LLMs' predictions have been released' with GitHub link, but does not explicitly confirm baseline implementations for reproducing reported numbers are included. Reproducibility details are unclear from paper.",
    188           "source": "haiku"
    189         }
    190       },
    191       "documentation": {
    192         "dataset_documentation_complete": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Source description: 1,874 samples from 117 repos in 10 domains from PyPI. Collection methodology: detailed 5-stage pipeline in Section 3 (repository selection, function parsing, test construction, human annotation, benchmark construction). Preprocessing steps clearly described. Characteristics well documented in Section 2.4 and Tables 2-3.",
    196           "source": "haiku"
    197         },
    198         "licensing_and_access_clear": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Access: GitHub link provided (https://github.com/seketeam/DevEval). Licensing: repositories are from PyPI and noted as open-source with licenses, though specific DevEval license not mentioned in paper. Access terms are clear; licensing could be more explicit.",
    202           "source": "haiku"
    203         },
    204         "intended_use_specified": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Intended use: 'evaluate the coding abilities of Large Language Models (LLMs)' and 'practitioners can pick up superior LLMs and facilitate application of code generation in real-world repositories.' Limitations section provides guidance on what should not be concluded (e.g., results apply only to Python/English). More detailed use guidance could be provided.",
    208           "source": "haiku"
    209         }
    210       }
    211     }
    212   },
    213   "claims": [
    214     {
    215       "claim": "Existing code generation benchmarks are poorly aligned with real-world code repositories in code distributions and dependency distributions.",
    216       "evidence": "Table 2 compares DevEval to 10 existing benchmarks and 500 real-world repositories. Previous benchmarks are 100% standalone code with no dependencies; 500 real repos are 27% standalone, 73% non-standalone with 3.22 dependencies per sample.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "DevEval's code and dependency distributions closely match those of 500 real-world repositories.",
    221       "evidence": "Table 2 shows DevEval: 27% standalone, 73% non-standalone, 3.41 dependencies per sample vs. 500 repos: 27% standalone, 73% non-standalone, 3.22 dependencies. Table 3 shows dependency type distributions closely aligned.",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "gpt-4-turbo achieves only 53.04% Pass@1 on DevEval compared to ~80% on HumanEval, indicating existing benchmarks overestimate LLM coding ability.",
    226       "evidence": "Table 5 shows gpt-4 Pass@1=53.04% on DevEval (local file infilling). Paper states 'gpt-4-turbo-1106 achieves a Pass@1 score of 80% on HumanEval' in abstract.",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "Adding local file contexts dramatically improves LLM code generation: gpt-4 Pass@1 improves 205% (no context 17.4% → local file infilling 53.04%).",
    231       "evidence": "Table 5 directly shows these values across three context settings for all 8 models, with consistent improvements across all models.",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "LLMs struggle to understand long and heterogeneous code contexts, causing them to disregard knowledge in contexts and generate hallucinations.",
    236       "evidence": "Section 4.5 error case analysis (Figure 4): gpt-3.5 invokes non-existent function 'create_connection' despite available 'connect' in contexts. Paper attributes to context length (9× gpt-4 context window) and heterogeneous code from multiple files, citing Liu et al. (2023a) finding that LLMs 'ignore relevant information in the middle of long contexts.'",
    237       "supported": "moderate"
    238     },
    239     {
    240       "claim": "Cross-file dependencies are significantly harder for LLMs to generate than intra-file or intra-class dependencies.",
    241       "evidence": "Figure 6 shows gpt-4 Recall@1: intra-class 73%, intra-file 70%, cross-file 60% (local file infilling). Without context: intra-class 24%, intra-file 15%, cross-file 8%. Cross-file consistently lowest.",
    242       "supported": "strong"
    243     }
    244   ],
    245   "methodology_tags": [
    246     "benchmark-eval",
    247     "empirical"
    248   ],
    249   "key_findings": "DevEval is a manually-annotated benchmark of 1,874 code samples from 117 real-world repositories, designed to evaluate code generation in repository contexts where models have access to local code files. Evaluation of 8 LLMs reveals that gpt-4 reaches only 45-53% Pass@1 in realistic repository settings, compared to about 80% on HumanEval, with dramatic improvement when local file contexts are provided (205% improvement for gpt-4 over the no-context setting). The main bottleneck is LLM inability to understand long, heterogeneous code contexts—models generate hallucinations (non-existent dependencies) and struggle especially with cross-file dependencies. Empirical lesson: context and requirement clarity are crucial for realistic code generation evaluation.",
    250   "red_flags": [
    251     {
    252       "flag": "No human baseline",
    253       "detail": "Paper doesn't report human performance on the benchmark tasks, making it impossible to judge if 53% Pass@1 is strong, weak, or reasonable. 13 developers annotated data but their own performance is not reported."
    254     },
    255     {
    256       "flag": "No task difficulty characterization",
    257       "detail": "Benchmark lacks easy/medium/hard task tiers or difficulty metrics. Unclear if all tasks are similarly challenging or if benchmark has ceiling/floor effects that would reduce discriminative power."
    258     },
    259     {
    260       "flag": "No built-in contamination resistance",
    261       "detail": "Paper acknowledges data leakage risk (repositories may be in training data) but designs no anti-gaming mechanisms (temporal splits, canary strings, dynamic generation). Mitigation relies on future LLM developers' voluntary exclusion—a procedural safeguard, not a technical one."
    262     },
    263     {
    264       "flag": "Limited context evaluation scope",
    265       "detail": "Paper evaluates only local file contexts, not realistic broader contexts (imports, sibling files, external libraries). Authors acknowledge this as a limitation in Section 9, but it weakens the benchmark's real-world alignment claim."
    266     },
    267     {
    268       "flag": "Recall@k metric has known bias",
    269       "detail": "Python dynamic typing causes some dependencies to be identified only at runtime, eluding the static parser. Bias quantified at 0.16 Recall@1 but still a systematic underestimation of actual dependency recall."
    270     },
    271     {
    272       "flag": "Reproducibility details unclear",
    273       "detail": "Paper claims code is released on GitHub but doesn't explicitly confirm baseline implementations for reproducing reported Pass@k/Recall@k numbers are provided. Reproducibility from scratch is unclear."
    274     }
    275   ],
    276   "cited_papers": [
    277     {
    278       "title": "Evaluating Large Language Models Trained on Code",
    279       "relevance": "HumanEval benchmark, foundational prior benchmark for code generation that DevEval compares against"
    280     },
    281     {
    282       "title": "Program Synthesis with Large Language Models",
    283       "relevance": "MBPP benchmark, another key baseline benchmark used to contextualize DevEval's performance numbers"
    284     },
    285     {
    286       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    287       "relevance": "CoderEval benchmark with non-standalone functions; DevEval builds on and extends this approach with more comprehensive annotations"
    288     },
    289     {
    290       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation",
    291       "relevance": "Class-level code generation benchmark; DevEval positions itself as extending to repository-level contexts"
    292     },
    293     {
    294       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    295       "relevance": "Supports DevEval's finding that LLMs struggle with long contexts; cited as evidence for why models fail on heterogeneous code"
    296     },
    297     {
    298       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    299       "relevance": "Repository-level code task for issue resolution; closely related to DevEval's repository-level code generation approach"
    300     },
    301     {
    302       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    303       "relevance": "Repository-level code completion benchmark; related work comparing approaches to evaluate repository-aware code tasks"
    304     },
    305     {
    306       "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems",
    307       "relevance": "Repository-level code completion; related benchmark for evaluating contextual code generation"
    308     }
    309   ],
    310   "engagement_factors": {
    311     "practical_relevance": {
    312       "score": 3,
    313       "justification": "Directly addresses practitioners' need to evaluate LLMs for real-world code generation. Provides benchmark to select LLMs and understand their real-world coding ability—immediate practical value."
    314     },
    315     "surprise_contrarian": {
    316       "score": 2,
    317       "justification": "Challenges narrative that LLMs are good at code (80% HumanEval → 53% DevEval), but this finding is somewhat expected given focus on realism. Moderately contrarian to optimistic benchmark narratives."
    318     },
    319     "fear_safety": {
    320       "score": 0,
    321       "justification": "No AI safety, alignment, or risk considerations. Purely a methodological benchmark paper with no safety angle."
    322     },
    323     "drama_conflict": {
    324       "score": 1,
    325       "justification": "Minor implicit conflict: existing benchmarks are misleading (HumanEval overstates ability). Not a major controversy or active conflict in the field."
    326     },
    327     "demo_ability": {
    328       "score": 2,
    329       "justification": "Benchmark released on GitHub, others can evaluate their models. Not instantly demo-able (requires evaluating on 1,874 samples), but reproducible and usable."
    330     },
    331     "brand_recognition": {
    332       "score": 2,
    333       "justification": "Peking University and Alibaba are well-known institutions with strong CS reputation. Not top-tier (OpenAI, DeepMind, FAIR) but reputable."
    334     }
    335   },
    336   "hn_data": {
    337     "threads": [
    338       {
    339         "hn_id": "40281516",
    340         "title": "Kan: Kolmogorov-Arnold Networks",
    341         "points": 28,
    342         "comments": 4,
    343         "url": "https://news.ycombinator.com/item?id=40281516"
    344       },
    345       {
    346         "hn_id": "41522319",
    347         "title": "Show HN: Ask LLMs to predict anything based on news",
    348         "points": 25,
    349         "comments": 9,
    350         "url": "https://news.ycombinator.com/item?id=41522319"
    351       },
    352       {
    353         "hn_id": "44511458",
    354         "title": "Large Language Models as Autonomous Spacecraft Operators in Kerbal Space Program",
    355         "points": 6,
    356         "comments": 0,
    357         "url": "https://news.ycombinator.com/item?id=44511458"
    358       },
    359       {
    360         "hn_id": "40234345",
    361         "title": "Kan: Kolmogorov-Arnold Networks",
    362         "points": 4,
    363         "comments": 0,
    364         "url": "https://news.ycombinator.com/item?id=40234345"
    365       },
    366       {
    367         "hn_id": "40226580",
    368         "title": "Kan: Kolmogorov-Arnold Networks",
    369         "points": 3,
    370         "comments": 1,
    371         "url": "https://news.ycombinator.com/item?id=40226580"
    372       },
    373       {
    374         "hn_id": "40272252",
    375         "title": "Kan: Kolmogorov–Arnold Networks",
    376         "points": 3,
    377         "comments": 0,
    378         "url": "https://news.ycombinator.com/item?id=40272252"
    379       },
    380       {
    381         "hn_id": "40261190",
    382         "title": "Kan: Kolmogorov-Arnold Networks",
    383         "points": 3,
    384         "comments": 0,
    385         "url": "https://news.ycombinator.com/item?id=40261190"
    386       },
    387       {
    388         "hn_id": "39933799",
    389         "title": "Approaching Human-Level Forecasting with Language Models",
    390         "points": 3,
    391         "comments": 0,
    392         "url": "https://news.ycombinator.com/item?id=39933799"
    393       },
    394       {
    395         "hn_id": "40607770",
    396         "title": "Potential Field Based Deep Metric Learning",
    397         "points": 2,
    398         "comments": 0,
    399         "url": "https://news.ycombinator.com/item?id=40607770"
    400       },
    401       {
    402         "hn_id": "39948335",
    403         "title": "Towards a Brazilian History Knowledge Graph",
    404         "points": 1,
    405         "comments": 0,
    406         "url": "https://news.ycombinator.com/item?id=39948335"
    407       }
    408     ],
    409     "top_points": 28,
    410     "total_points": 78,
    411     "total_comments": 14
    412   }
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs