scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21935B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
      6     "authors": [
      7       "Jia Li",
      8       "Ge Li",
      9       "Yunfei Zhao",
     10       "Yongmin Li",
     11       "Huanyu Liu",
     12       "Hao Zhu",
     13       "Lecheng Wang",
     14       "Kaibo Liu",
     15       "Zheng Fang",
     16       "Lanshen Wang",
     17       "Jiazheng Ding",
     18       "Xuanming Zhang",
     19       "Yuqi Zhu",
     20       "Yihong Dong",
     21       "Zhi Jin",
     22       "Binhua Li",
     23       "Fei Huang",
     24       "Yongbin Li"
     25     ],
     26     "year": 2024,
     27     "venue": "Annual Meeting of the Association for Computational Linguistics",
     28     "arxiv_id": "2405.19856",
     29     "doi": "10.48550/arXiv.2405.19856"
     30   },
     31   "checklist": {
     32     "claims_and_evidence": {
     33       "abstract_claims_supported": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The abstract claims gpt-4-turbo's highest Pass@1 is 53.04%, which matches Table 5 (Local File Infilling). Claims about benchmark alignment are supported by Tables 2-3.",
     37         "source": "opus"
     38       },
     39       "causal_claims_justified": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The main causal claim is that code context improves LLM performance. The three experimental settings (no context vs completion vs infilling) form a controlled comparison that adequately supports this. Other causal language is modest.",
     43         "source": "opus"
     44       },
     45       "generalization_bounded": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The title claims alignment with 'Real-World Code Repositories' broadly, but the benchmark is Python-only and English-only. While the Limitations section (Section 9) acknowledges this is monolingual, the title and main framing overgeneralize beyond what was tested.",
     49         "source": "opus"
     50       },
     51       "alternative_explanations_discussed": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 4.4 discusses two reasons LLMs can generate dependencies without context (reasoning from requirements, guessing from naming conventions). It also explains why gpt-family vs open-source models behave differently (instruction tuning). Error analysis considers context length and heterogeneity as alternative failure explanations.",
     55         "source": "opus"
     56       },
     57       "proxy_outcome_distinction": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Pass@k explicitly measures functional correctness via test execution, and Recall@k measures dependency recall. The paper is precise about what these metrics capture and introduces Recall@k specifically to address a gap not captured by Pass@k alone. Claims match measurement granularity.",
     61         "source": "opus"
     62       }
     63     },
     64     "limitations_and_scope": {
     65       "limitations_section_present": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 9 'Limitations' provides substantive discussion of three specific limitations: monolingual scope, Recall@k bias from static analysis, and limited context settings.",
     69         "source": "opus"
     70       },
     71       "threats_to_validity_specific": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Specific threats are discussed: monolingual limitation (Python/English only), Recall@k parser bias quantified at 0.16 from a 50-program sample comparison with human annotators, and the limitation of only using local file context.",
     75         "source": "opus"
     76       },
     77       "scope_boundaries_stated": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper explicitly states it is Python-only, English-only, uses only local file contexts (not imported or sibling files), and that Recall@k has slight bias. Future work lists specific extensions planned (multilingual, more projects, more test cases).",
     81         "source": "opus"
     82       }
     83     },
     84     "conflicts_of_interest": {
     85       "funding_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 8 states: 'This research was supported by the National Natural Science Foundation of China (Nos. 62192731, 62152730).'",
     89         "source": "opus"
     90       },
     91       "affiliations_disclosed": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Author affiliations are clearly stated: Peking University (School of Computer Science, Key Laboratory) and Alibaba Group.",
     95         "source": "opus"
     96       },
     97       "funder_independent_of_outcome": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The National Natural Science Foundation of China is a government funding agency with no commercial stake in benchmark outcomes. While Alibaba authors are involved, Alibaba's own models are not being evaluated.",
    101         "source": "opus"
    102       },
    103       "financial_interests_declared": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No competing interests or financial interest statement is included in the paper.",
    107         "source": "opus"
    108       }
    109     },
    110     "scope_and_framing": {
    111       "key_terms_defined": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Key terms defined precisely: 'standalone' vs 'non-standalone' (Figure 1 with examples), dependency types 'intra-class/intra-file/cross-file' (Figure 2, Table 3), 'repository-level code generation' (Section 2.2), Pass@k and Recall@k (equations in Section 2.3).",
    115         "source": "haiku"
    116       },
    117       "intended_contribution_clear": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Explicit contributions stated: (1) four features a benchmark should have, (2) DevEval benchmark dataset (released), (3) repository-level code generation task, (4) evaluation of 8 LLMs with analysis. Contribution type (dataset + task + empirical study) is unambiguous.",
    121         "source": "haiku"
    122       },
    123       "engagement_with_prior_work": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 6 systematically reviews LLMs for code generation (Codex, ChatGPT, CodeLLaMa, DeepSeek) and code benchmarks (HumanEval, MBPP, APPS, ClassEval, CoderEval). Table 1 provides feature-by-feature comparison. Distinguishes from recent work (CrossCodeEval, RepoBench, RepoEval, SWE-bench) with detailed rationale.",
    127         "source": "haiku"
    128       }
    129     }
    130   },
    131   "type_checklist": {
    132     "benchmark-creation": {
    133       "construct_design": {
    134         "construct_validity_argued": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "Argument: because DevEval uses real repository code with realistic distributions and dependencies (Table 2 alignment to 500 repos), it measures real-world coding ability better than hand-crafted benchmarks. The claim is: alignment with real repos = valid measurement of real-world skill.",
    138           "source": "haiku"
    139         },
    140         "difficulty_distribution_characterized": {
    141           "applies": true,
    142           "answer": true,
    143           "justification": "Difficulty variation evident from results: gpt-4 Pass@1 ranges from 17.4% (no context) to 53% (with context), and Figure 5 shows performance varies by program type (standalone ~40-45% vs non-standalone ~53%). Table 2 reports avg requirement length (91.5 tokens) and repo scale variation. Implicitly characterized.",
    144           "source": "haiku"
    145         },
    146         "ceiling_floor_effects_checked": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Table 5 shows performance range for gpt-4: 17.4% (without context) to 60.65% (Pass@10 with context), avoiding extreme floors (<10%) or ceilings (>90%). Variation across models (12.54% to 53.04%) shows discrimination. No explicit ceiling/floor analysis but results demonstrate both.",
    150           "source": "haiku"
    151         },
    152         "human_baseline_included": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Benchmark uses human-written reference code and test cases, but no human developers were asked to generate code to solve the requirements. Human baseline performance is absent, limiting calibration validation.",
    156           "source": "haiku"
    157         },
    158         "scoring_rubric_justified": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Pass@k justified by citation to prior work (Chen et al. 2021, Austin et al. 2021). Recall@k introduced to measure dependency generation as separate dimension from functional correctness. Equation 2 explicitly specifies metric. Parser bias (0.16) is acknowledged and quantified in Section 5.",
    162           "source": "haiku"
    163         }
    164       },
    165       "robustness": {
    166         "contamination_resistance_designed": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Section 5 addresses contamination risk. Mitigations: (1) manually-written requirements never seen during training, (2) empirical test showing 0.36 Pass@1 difference between seen/unseen repos (minimal), (3) future guidance to exclude repos from training. Moderate resistance—relies on new requirements rather than fundamental design.",
    170           "source": "haiku"
    171         },
    172         "temporal_robustness_discussed": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Paper acknowledges that future LLMs may be trained on released repos and recommends exclusion in future work. Future work section states plan to update DevEval. Temporal concerns discussed but not deeply analyzed—no discussion of gaming strategies or benchmark decay rate.",
    176           "source": "haiku"
    177         },
    178         "failure_modes_discussed": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Section 4.4 error analysis discusses LLM failure modes: struggling with long contexts (citing Liu et al. 2023a), heterogeneous context fragments (citing Shi et al. 2023), and hallucinating non-existent dependencies (Figure 4). Discussion of why LLMs fail on the benchmark is provided.",
    182           "source": "haiku"
    183         },
    184         "baseline_implementations_provided": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Paper states 'DevEval, prompts, and LLMs' predictions have been released' with GitHub link (https://github.com/seketeam/DevEval). Results in Table 5 cover 8 models, implying reproducible experimental setup is available.",
    188           "source": "haiku"
    189         }
    190       },
    191       "documentation": {
    192         "dataset_documentation_complete": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Comprehensive documentation provided: 5-stage collection pipeline detailed in Section 3 and Figure 3, data statistics in Tables 2-3, composition breakdown (27% standalone/73% non-standalone), annotation process (13 developers, dual-check), test construction methodology. Preprocessing and quality criteria specified.",
    196           "source": "haiku"
    197         },
    198         "licensing_and_access_clear": {
    199           "applies": true,
    200           "answer": false,
    201           "justification": "Paper states benchmark 'has been released' but does not explicitly specify: (1) DevEval's license, (2) terms of use, (3) access restrictions. Repository selection mentions 'open-source licenses' but DevEval's own license is unstated.",
    202           "source": "haiku"
    203         },
    204         "intended_use_specified": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Intended use stated in Section 2.4: 'DevEval... can serve as an arena to compare approaches ranging from retrieval and long-context models to decision-making agents.' Section 1: 'practitioners can pick up superior LLMs and facilitate the application of code generation techniques in real-world repositories.' Intent is evaluating code generation on repository-level tasks.",
    208           "source": "haiku"
    209         }
    210       }
    211     }
    212   },
    213   "claims": [
    214     {
    215       "claim": "Existing code generation benchmarks are poorly aligned with real-world code repositories in code distribution and dependency patterns.",
    216       "evidence": "Table 2 and Table 3 show HumanEval, MBPP, and others have 100% standalone code with 0 dependencies, while 500 real repos have 27% standalone/73% non-standalone and 3M dependencies across all types.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "DevEval achieves alignment with real-world repositories across code distribution, dependency distribution, and repository scale.",
    221       "evidence": "Table 2 shows DevEval has 27% standalone (vs 27% in 500 repos), 73% non-standalone (vs 73%), 3.41 avg dependencies per sample (vs 3.22), and 243 avg files per repo (vs ~238).",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "LLM performance on repository-level code generation is dramatically lower than on function-level benchmarks.",
    226       "evidence": "gpt-4-turbo achieves 80% Pass@1 on HumanEval but only 17.4% Pass@1 on DevEval without context and 53.04% with local file context (infilling).",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "Context from local files improves LLM code generation performance by 2-3x.",
    231       "evidence": "Table 5 shows gpt-4 improves from 17.4% (no context) to 53.04% (infilling context), a 205% relative improvement; gpt-3.5 goes from 13.98% to 44.50%, a 218% improvement.",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "LLMs struggle to understand long and heterogeneous code contexts, leading to hallucinated dependencies and missed relevant code.",
    236       "evidence": "Section 4.4 error analysis: Figure 4 shows gpt-3.5 inventing non-existent 'create_connection' function despite available 'connect' function in context. Cross-file dependency Recall@1 is only 16.85% without context.",
    237       "supported": "strong"
    238     },
    239     {
    240       "claim": "Manually-written requirements help LLMs understand repository context and code generation objectives better than original code comments.",
    241       "evidence": "Section 4.4 successful case analysis attributes improvements to 'synergy of contexts and requirements.' DevEval requirement avg length 91.5 tokens vs CoderEval 41.5 tokens; Section 2.4 notes original comments are 'vague.'",
    242       "supported": "moderate"
    243     }
    244   ],
    245   "methodology_tags": [
    246     "benchmark-eval",
    247     "observational"
    248   ],
    249   "key_findings": "DevEval is a benchmark of 1,874 code generation samples from 117 real-world Python repositories with manually-written requirements and comprehensive dependency annotations, aligned with real-world code distributions (27% standalone, 73% non-standalone). Evaluation of 8 popular LLMs shows that repository-level code generation is substantially harder than function-level generation: gpt-4-turbo drops from 80% Pass@1 on HumanEval to 53.04% on DevEval even with local file context. Context from local files improves performance by 2-3x, yet LLMs still fail to consistently generate correct dependencies and often hallucinate non-existent functions, which the error analysis attributes to difficulty understanding long and heterogeneous code contexts.",
    250   "red_flags": [
    251     {
    252       "flag": "Missing human baseline",
    253       "detail": "No human developers were evaluated on DevEval, preventing validation that the task is appropriately calibrated (non-trivial but not impossible for humans)."
    254     },
    255     {
    256       "flag": "No inter-annotator agreement metrics",
    257       "detail": "13 developers annotated requirements with dual-check process (mentioned in Section 4), but no Cohen's kappa or agreement percentages reported despite 674 person-hours of annotation effort."
    258     },
    259     {
    260       "flag": "Weak contamination analysis",
    261       "detail": "Risk that 117 repos appear in LLM training data is acknowledged. Mitigation evidence (0.36 Pass@1 difference between seen/unseen repos) is weak and within expected variance; no strong prevention measures like canary strings."
    262     },
    263     {
    264       "flag": "Parser bias in Recall@k metric",
    265       "detail": "Static analysis parser misses runtime-determined dependencies in Python. While bias is quantified at 0.16, this introduces systematic error that could favor LLMs that generate fewer dependencies."
    266     },
    267     {
    268       "flag": "Limited scope reduces generalizability",
    269       "detail": "Monolingual (English + Python only), local-file-only context, only 10 PyPI domains, only generation (not debugging/fixing). Findings may not generalize to other languages, multi-file contexts, or real development workflows."
    270     },
    271     {
    272       "flag": "Tautological construct validity",
    273       "detail": "Argument that 'real repository code = valid measurement of real-world skill' is somewhat circular. The benchmark measures real code but doesn't establish that real code inherently measures the skills practitioners need."
    274     }
    275   ],
    276   "cited_papers": [
    277     {
    278       "title": "Evaluating Large Language Models Trained on Code",
    279       "relevance": "HumanEval benchmark, foundational function-level code generation evaluation."
    280     },
    281     {
    282       "title": "Program Synthesis with Large Language Models",
    283       "relevance": "MBPP benchmark; prior function-level code generation benchmark."
    284     },
    285     {
    286       "title": "Measuring Coding Challenge Competence with APPS",
    287       "relevance": "APPS benchmark for competition-style code problems; prior benchmark."
    288     },
    289     {
    290       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation",
    291       "relevance": "Recent class-level code generation benchmark; direct comparison baseline."
    292     },
    293     {
    294       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    295       "relevance": "CoderEval with non-standalone code and dependencies; most similar prior work."
    296     },
    297     {
    298       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    299       "relevance": "Repository-level code completion benchmark; related task, different from generation."
    300     },
    301     {
    302       "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems",
    303       "relevance": "Repository-level completion benchmark; similar scope, different task."
    304     },
    305     {
    306       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    307       "relevance": "Explains why LLMs struggle with middle of long contexts; cited in DevEval error analysis."
    308     },
    309     {
    310       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    311       "relevance": "Repository-level software engineering benchmark; different task (bug fixing vs generation)."
    312     }
    313   ],
    314   "engagement_factors": {
    315     "practical_relevance": {
    316       "score": 2,
    317       "justification": "Benchmark is publicly released and directly usable for evaluating code generation LLMs on realistic tasks."
    318     },
    319     "surprise_contrarian": {
    320       "score": 1,
    321       "justification": "Shows LLMs perform much worse on real-world code than HumanEval, which is somewhat expected but well-quantified."
    322     },
    323     "fear_safety": {
    324       "score": 0,
    325       "justification": "No safety, security, or AI risk concerns are raised."
    326     },
    327     "drama_conflict": {
    328       "score": 1,
    329       "justification": "Implicitly challenges HumanEval and similar benchmarks as insufficient, but framed constructively rather than controversially."
    330     },
    331     "demo_ability": {
    332       "score": 2,
    333       "justification": "Benchmark, prompts, and model predictions released on GitHub for immediate use and reproduction."
    334     },
    335     "brand_recognition": {
    336       "score": 1,
    337       "justification": "Peking University and Alibaba Group are moderately well-known in the AI research community."
    338     }
    339   },
    340   "hn_data": {
    341     "threads": [
    342       {
    343         "hn_id": "40281516",
    344         "title": "Kan: Kolmogorov-Arnold Networks",
    345         "points": 28,
    346         "comments": 4,
    347         "url": "https://news.ycombinator.com/item?id=40281516"
    348       },
    349       {
    350         "hn_id": "41522319",
    351         "title": "Show HN: Ask LLMs to predict anything based on news",
    352         "points": 25,
    353         "comments": 9,
    354         "url": "https://news.ycombinator.com/item?id=41522319"
    355       },
    356       {
    357         "hn_id": "44511458",
    358         "title": "Large Language Models as Autonomous Spacecraft Operators in Kerbal Space Program",
    359         "points": 6,
    360         "comments": 0,
    361         "url": "https://news.ycombinator.com/item?id=44511458"
    362       },
    363       {
    364         "hn_id": "40234345",
    365         "title": "Kan: Kolmogorov-Arnold Networks",
    366         "points": 4,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=40234345"
    369       },
    370       {
    371         "hn_id": "40226580",
    372         "title": "Kan: Kolmogorov-Arnold Networks",
    373         "points": 3,
    374         "comments": 1,
    375         "url": "https://news.ycombinator.com/item?id=40226580"
    376       },
    377       {
    378         "hn_id": "40272252",
    379         "title": "Kan: Kolmogorov–Arnold Networks",
    380         "points": 3,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=40272252"
    383       },
    384       {
    385         "hn_id": "40261190",
    386         "title": "Kan: Kolmogorov-Arnold Networks",
    387         "points": 3,
    388         "comments": 0,
    389         "url": "https://news.ycombinator.com/item?id=40261190"
    390       },
    391       {
    392         "hn_id": "39933799",
    393         "title": "Approaching Human-Level Forecasting with Language Models",
    394         "points": 3,
    395         "comments": 0,
    396         "url": "https://news.ycombinator.com/item?id=39933799"
    397       },
    398       {
    399         "hn_id": "40607770",
    400         "title": "Potential Field Based Deep Metric Learning",
    401         "points": 2,
    402         "comments": 0,
    403         "url": "https://news.ycombinator.com/item?id=40607770"
    404       },
    405       {
    406         "hn_id": "39948335",
    407         "title": "Towards a Brazilian History Knowledge Graph",
    408         "points": 1,
    409         "comments": 0,
    410         "url": "https://news.ycombinator.com/item?id=39948335"
    411       }
    412     ],
    413     "top_points": 28,
    414     "total_points": 78,
    415     "total_comments": 14
    416   }
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs