scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21732B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation",
      6     "authors": [
      7       "Wei Li",
      8       "Xin Zhang",
      9       "Zhongxin Guo",
     10       "Shaoguang Mao",
     11       "Wen Luo"
     12     ],
     13     "year": 2025,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2503.06680",
     16     "doi": "10.48550/arXiv.2503.06680"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims 'LLMs perform significantly worse in the FEA-Bench' and 'the best-performing LLM, DeepSeek-R1, successfully resolves only about 10%' are directly supported by Table 2 (9.92% resolved ratio).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal-like claims (e.g., 'the format of code edits is a critical factor limiting performance') are supported by controlled comparisons varying one factor at a time (same model, different formats in Table 4; same model, different context lengths in Table 3).",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The title scopes to 'repository-level code generation for feature implementation.' The Limitations section explicitly states 'our benchmark includes only Python repositories' and acknowledges 'certain scenario limitations.'",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "While the paper explores different experimental factors (retrieval, format, context length), it does not substantively discuss alternative explanations for the main finding of low performance — e.g., whether benchmark construction choices, test quality, or task formulation artificially inflate difficulty.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures unit test pass rate and frames claims tightly around 'resolved ratio' of task instances. Claims match the granularity of measurement — they do not extrapolate from test-passing to broader software quality claims.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section is present between Section 7 (Conclusion) and the Ethics Statement, with substantive discussion across multiple paragraphs.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The Limitations section discusses specific threats: Python-only scope, scarcity of high-quality new-feature PRs, early-stage repo development not captured, single-round generation bias, and resource constraints limiting DeepSeek experiments.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states Python-only scope, acknowledges 'certain scenario limitations' from the PR-based construction, and notes that 'high-quality and usable pull requests for new feature development are relatively scarce.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments state: 'This work was supported by National Science and Technology Major Project (No. 2022ZD0116308) and National Natural Science Foundation of China (62036001).'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Peking University and Microsoft Research Asia. The paper header marks these with symbols (♡ and ♠).",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funding is from Chinese government agencies (NSFC, national project) which have no direct financial stake in the benchmark results or the relative performance of any evaluated model.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interest statement is present. Authors from Microsoft Research Asia could have interests related to AI code generation products, but no declaration is made.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Repository-level incremental code development' is explicitly defined as implementing new features by adding new components (functions/classes) to existing repositories; 'new components' are defined as newly added classes and functions identified by parsing patches.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit bullet-point contributions in the introduction state: (1) introducing the task, (2) constructing the first benchmark with execution-based evaluation, and (3) scaling to 83 repositories with public data release.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages substantively with SWE-bench, HumanEval, MBPP, DevEval, EvoCodeBench, and repository-level completion benchmarks; Table 1 provides quantitative comparison with SWE-bench statistics showing how FEA-Bench differs structurally.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that feature implementation 'requires LLMs to simultaneously possess code completion capabilities for new components and code editing abilities for other relevant parts,' distinguishing this from code completion and bug-fix tasks with quantitative statistics in Table 1.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Figure 5 shows resolved ratios decrease with more added functions, and a lite subset is defined with lower difficulty criteria, but there is no formal difficulty tier characterization (easy/medium/hard) or difficulty score assigned to individual instances.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper notes that best performance is 9.92% (suggesting no ceiling concern), but CodeLlama 13B achieves 0.14%—near floor—without any discussion of whether the benchmark discriminates among weak models or whether floor effects distort comparisons.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline is included; the paper mentions a future plan to 'collaborate with professional software engineers to annotate a verified subset' (Appendix A.3) but this has not been done.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "The all-or-nothing unit-test-pass criterion is adopted from SWE-bench without formal justification; partial credit, test suite quality, or edge cases in scoring (e.g., tests that pass trivially) are not discussed.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No temporal splits, canary strings, or dynamic generation are implemented; popular repositories like scikit-learn, django, and sympy are almost certainly in the training data of evaluated models, and the paper does not address this.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper notes that releasing the collection pipeline makes continuous updates possible, but it does not discuss how future training data will inevitably contaminate static benchmark instances, nor does it offer any plan to detect or mitigate this.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Python-only scope and single-round generation are mentioned as limitations, but failure modes of the benchmark mechanism itself—such as insufficient unit test suites, GPT-4o intent classification errors, or the 25% new-component threshold arbitrariness—are not discussed.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Agentless and Agentless-lite are evaluated as agent-framework baselines (Table 5); Oracle and BM25 retrieval baselines are provided; evaluation code adapted from SWE-bench is described with sufficient detail for reproduction, and will be publicly released.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 6 lists all 83 repositories with licenses, topics, and categories; Table 7 provides instance counts at each filtering stage; Appendix A.2 describes the full construction pipeline including fast validation, PR filtering, intent classification, and unit test verification.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Source repository licenses are documented in Table 6, but the benchmark's own license is never stated; availability rests solely on a statement that the benchmark 'will soon be publicly available,' with no release date given, making current usability uncertain.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The benchmark's purpose is described (evaluating LLM feature implementation capability), and Docker evaluation is recommended in the Ethics Statement; however, what should NOT be concluded from results (e.g., no generalization to non-Python, no multi-round capability claims) is not specified.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "FEA-Bench is the first benchmark dedicated to evaluating repository-level feature implementation (incremental code development)",
    203       "evidence": "Comparison with SWE-bench (bug fixes) and code completion benchmarks in Section 2; Table 1 quantitatively shows FEA-Bench has 8× more new-component lines than SWE-bench",
    204       "supported": "moderate"
    205     },
    206     {
    207       "claim": "Current LLMs perform significantly worse on FEA-Bench than on bug-fix benchmarks; best model (DeepSeek-R1) resolves only ~10% of instances",
    208       "evidence": "Table 2: DeepSeek-R1 achieves 9.92% in Oracle/Detailed setting; SWE-bench resolution rates for comparable models are substantially higher",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Natural output format significantly outperforms Patch format across all tested models",
    213       "evidence": "Table 4: GPT-4o resolves 6.14% in Natural vs 1.86% in Patch mode; the success rate of git apply is also substantially higher in Natural mode",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "More complex feature requests (more added functions) lead to lower resolved ratios",
    218       "evidence": "Figure 5: resolved ratio drops from 18.96% (1 added function) to 8.24% (2) to 5.47% (3+) for DeepSeek-R1",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Increasing context length (27K→40K tokens) does not improve and slightly hurts model performance despite marginally higher recall",
    223       "evidence": "Table 3: GPT-4 and GPT-4o performance is equal or lower at 40K vs 27K despite recall improving from 76.04% to 77.14%",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Detailed new component hints generally lead to better performance than brief hints",
    228       "evidence": "Table 2: Detailed outperforms Brief in most settings for the full benchmark, though the pattern reverses in FEA-Bench lite for several models",
    229       "supported": "moderate"
    230     },
    231     {
    232       "claim": "Agentless improves resolved ratio over BM25 retrieval for most models due to better code edit format adherence",
    233       "evidence": "Table 5: GPT-4o improves from 4.0% (BM25) to 9.0% (Agentless); improvement correlates with increased %Apply success rate",
    234       "supported": "moderate"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "benchmark-eval",
    239     "case-study"
    240   ],
    241   "key_findings": "FEA-Bench introduces 1,401 task instances from 83 Python repositories targeting new feature implementation via merged GitHub PRs, filtered by rule-based and GPT-4o intent classification. The best model (DeepSeek-R1) resolves only 9.92% of instances, substantially below SWE-bench performance levels, confirming that feature implementation is harder than bug fixing. Output format (Natural vs Patch) is a critical performance bottleneck—Natural format yields 3–5× higher success rates than Patch format, whose stricter formatting requirements models frequently fail to satisfy. Task difficulty scales sharply with number of added functions (18.96% → 5.47% as functions increase from 1 to 3+), and increasing context length beyond 27K tokens does not improve performance despite marginally better file recall.",
    242   "red_flags": [
    243     {
    244       "flag": "No human baseline",
    245       "detail": "The benchmark provides no human performance data, making it impossible to calibrate whether 9.92% represents near-zero capability or a reasonable fraction of human performance; a 'verified subset' with human annotations is only planned for future work."
    246     },
    247     {
    248       "flag": "GPT-4o used for construction and evaluation",
    249       "detail": "GPT-4o is used in the data collection pipeline for intent classification (filtering PRs as 'new feature'), and is also one of the primary models evaluated on the resulting benchmark—creating circularity in the evaluation."
    250     },
    251     {
    252       "flag": "No contamination analysis",
    253       "detail": "Highly popular repositories (scikit-learn, django, sympy, matplotlib) are almost certainly in the training data of all evaluated models; no temporal splits or contamination detection are implemented, and the paper does not address this threat."
    254     },
    255     {
    256       "flag": "Missing results in main table",
    257       "detail": "Table 2 has numerous missing entries marked '×' for API-constrained models, and the paper attributes this to 'scarcity of API resources'; this leaves the main comparison incomplete for smaller open-source models against BM25 retrieval."
    258     },
    259     {
    260       "flag": "Arbitrary 25% threshold for new components",
    261       "detail": "The filtering criterion requiring new components to constitute >25% of edited lines is described as 'set relatively low' but is not empirically validated or sensitivity-tested."
    262     },
    263     {
    264       "flag": "MSRA affiliation conflict undisclosed",
    265       "detail": "Multiple authors are from Microsoft Research Asia, and the benchmark evaluates GPT-4 and GPT-4o (Microsoft-affiliated OpenAI models); no competing interests statement is provided."
    266     }
    267   ],
    268   "cited_papers": [
    269     {
    270       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    271       "relevance": "Direct predecessor benchmark for repository-level code generation (bug fixes); FEA-Bench explicitly positions itself as complementary, covering feature implementation rather than bug fixing"
    272     },
    273     {
    274       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    275       "relevance": "Foundational standalone code generation benchmark used as baseline comparison for what FEA-Bench advances beyond"
    276     },
    277     {
    278       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    279       "relevance": "Best-performing model on FEA-Bench (9.92% resolved); chain-of-thought reasoning models are a key evaluation subject"
    280     },
    281     {
    282       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    283       "relevance": "Agent framework evaluated on FEA-Bench lite (Table 5); represents state-of-the-art software engineering agent methodology"
    284     },
    285     {
    286       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    287       "relevance": "Referenced as an example of contamination-aware benchmark design that FEA-Bench does not replicate"
    288     },
    289     {
    290       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    291       "relevance": "Repository-level code completion benchmark representing the category FEA-Bench distinguishes itself from"
    292     },
    293     {
    294       "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories",
    295       "relevance": "Evolving benchmark design relevant to FEA-Bench's stated goal of continuous updates"
    296     },
    297     {
    298       "title": "Towards More Realistic Evaluation of LLM-Based Code Generation (future context leakage study)",
    299       "relevance": "Cited as motivation for why code completion benchmarks suffer from future context leakage—a problem FEA-Bench avoids by using PR-based task construction"
    300     }
    301   ],
    302   "engagement_factors": {
    303     "practical_relevance": {
    304       "score": 2,
    305       "justification": "The benchmark is directly useful for researchers evaluating code LLMs on feature implementation, though not immediately actionable for practitioners."
    306     },
    307     "surprise_contrarian": {
    308       "score": 1,
    309       "justification": "Finding that LLMs struggle with complex repository-level tasks is expected; the specific quantification (~10% max) adds some informational value but is not surprising."
    310     },
    311     "fear_safety": {
    312       "score": 0,
    313       "justification": "No safety or security concerns are raised by this benchmark paper."
    314     },
    315     "drama_conflict": {
    316       "score": 1,
    317       "justification": "Mild drama in showing that even the best models fail 90%+ of feature implementation tasks, but no controversy or adversarial framing."
    318     },
    319     "demo_ability": {
    320       "score": 0,
    321       "justification": "Code and data are not yet publicly available — only a promise of future release at a GitHub URL."
    322     },
    323     "brand_recognition": {
    324       "score": 2,
    325       "justification": "From Microsoft Research Asia and Peking University; evaluates well-known models (GPT-4, DeepSeek-R1) and positions against the prominent SWE-bench."
    326     }
    327   },
    328   "hn_data": {
    329     "threads": [
    330       {
    331         "hn_id": "43021849",
    332         "title": "Competitive Programming with Large Reasoning Models",
    333         "points": 16,
    334         "comments": 1,
    335         "url": "https://news.ycombinator.com/item?id=43021849"
    336       },
    337       {
    338         "hn_id": "9262882",
    339         "title": "Exploring Non-Homogeneity and Dynamicity of High Scale Cloud [pdf]",
    340         "points": 9,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=9262882"
    343       },
    344       {
    345         "hn_id": "43025479",
    346         "title": "Competitive Programming with Large Reasoning Models",
    347         "points": 6,
    348         "comments": 0,
    349         "url": "https://news.ycombinator.com/item?id=43025479"
    350       },
    351       {
    352         "hn_id": "43072941",
    353         "title": "OpenAI: Competitive Programming with Large Reasoning Models",
    354         "points": 2,
    355         "comments": 1,
    356         "url": "https://news.ycombinator.com/item?id=43072941"
    357       },
    358       {
    359         "hn_id": "43022224",
    360         "title": "OpenAI o3 just scored 99.8% on CodeForces using brute-force",
    361         "points": 2,
    362         "comments": 1,
    363         "url": "https://news.ycombinator.com/item?id=43022224"
    364       },
    365       {
    366         "hn_id": "43030525",
    367         "title": "Competitive programming with large language models",
    368         "points": 2,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=43030525"
    371       },
    372       {
    373         "hn_id": "30685387",
    374         "title": "Infinite Wordle",
    375         "points": 2,
    376         "comments": 0,
    377         "url": "https://news.ycombinator.com/item?id=30685387"
    378       },
    379       {
    380         "hn_id": "43055820",
    381         "title": "Competitive Programming with Large Reasoning Models",
    382         "points": 1,
    383         "comments": 0,
    384         "url": "https://news.ycombinator.com/item?id=43055820"
    385       },
    386       {
    387         "hn_id": "42705257",
    388         "title": "What Hawking Radiation Looks Like as You Fall into a Black Hole",
    389         "points": 1,
    390         "comments": 0,
    391         "url": "https://news.ycombinator.com/item?id=42705257"
    392       }
    393     ],
    394     "top_points": 16,
    395     "total_points": 41,
    396     "total_comments": 3
    397   }
    398 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs