scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22604B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation",
      6     "authors": [
      7       "Wei Li",
      8       "Xin Zhang",
      9       "Zhongxin Guo",
     10       "Shaoguang Mao",
     11       "Wen Luo",
     12       "Guangyue Peng",
     13       "Yangyu Huang",
     14       "Houfeng Wang",
     15       "Scarlett Li"
     16     ],
     17     "year": 2025,
     18     "venue": "Annual Meeting of the Association for Computational Linguistics",
     19     "arxiv_id": "2503.06680",
     20     "doi": "10.48550/arXiv.2503.06680"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "All abstract claims are backed by the paper: the benchmark uses PRs from 83 GitHub repos (Section 3.2), includes unit tests for verification (Section 3.3), and LLMs performing poorly is substantiated by Table 2 (best model ~10%).",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Claims like 'detailed hints lead to better performance' and 'increasing context beyond 27K reduces performance' are backed by direct ablation comparisons in Tables 2 and 3 with controlled prompt settings; the study design supports these comparisons.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper explicitly bounds generalization to Python repositories and single-round generation in its Limitations section, and the benchmark is framed as measuring a specific task (incremental feature development), not general coding ability.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper discusses why brief hints outperform detailed hints on the lite subset (lack of structured presentation in the prompt, Figure 6), and why BM25 sometimes matches Oracle (files containing new components are always included as known conditions).",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper uses execution-based unit test pass rates as the metric and explicitly frames this as directly measuring whether the code change works correctly, not as a proxy — test pass/fail directly evaluates the implemented feature.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "There is a dedicated 'Limitations' section in the paper (before the Ethics Statement) discussing language coverage, data scarcity, and single-round evaluation.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Specific threats are named: Python-only scope limits cross-language applicability; single-round generation akin to Pass@1 'may introduce a certain level of bias'; API scarcity caused missing results for some model/setting combinations.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The Limitations section explicitly states the benchmark covers only Python repositories and only single-round generation, and the paper's framing throughout restricts claims to incremental feature development (not bug fixing or standalone generation).",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Funding is disclosed in Acknowledgments: National Science and Technology Major Project (No. 2022ZD0116308) and National Natural Science Foundation of China (62036001).",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are clearly listed in the paper header: Peking University and Microsoft Research Asia.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The named funders (Chinese national science foundations) are independent of benchmark outcomes; however, several authors are from Microsoft Research Asia, and while Microsoft does not appear to have a competing product being advantaged, the paper does not declare this affiliation as a potential competing interest.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "There is no competing interests statement, no declaration of patents, equity, or consulting relationships; the paper only discloses funding and affiliations.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms are defined: 'repository-level incremental code development' is defined in the introduction, 'Oracle' and 'BM25' retrieval settings are explained in Section 4.2, 'resolved ratio' is defined, and 'new components' are defined as newly added functions and classes.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Three contributions are explicitly bulleted in the introduction: introducing the task, constructing the first benchmark for it, and providing a scalable automated data collection pipeline with public release.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 2 thoroughly covers prior code LLMs and benchmarks, explicitly contrasting FEA-Bench against SWE-bench (bug fixing vs. feature implementation), code completion benchmarks (localized vs. repository-wide changes), and standalone benchmarks (HumanEval, MBPP).",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "benchmark-creation": {
    124       "construct_design": {
    125         "construct_validity_argued": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The paper argues that GitHub pull requests classified as 'new feature' by GPT-4o, verified by unit tests before and after patch application, measure incremental feature implementation capability — contrasting with SWE-bench (bug fixes) and completion benchmarks (localized edits).",
    129           "source": "haiku"
    130         },
    131         "difficulty_distribution_characterized": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Table 1 provides statistics on lines edited, files edited, and added functions; Figure 5 shows that resolved ratio decreases as the number of added functions increases (18.96% for 1 function down to 5.47% for 3+), characterizing difficulty through complexity metrics.",
    135           "source": "haiku"
    136         },
    137         "ceiling_floor_effects_checked": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The best model resolves only ~10% of tasks — a near-floor result — but the paper does not explicitly discuss floor or ceiling effects as a design concern, nor does it assess whether the benchmark discriminates appropriately across model capability levels.",
    141           "source": "haiku"
    142         },
    143         "human_baseline_included": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No human baseline is included in the evaluation; the paper only evaluates LLMs, leaving the question of human-level performance on these tasks unanswered.",
    147           "source": "haiku"
    148         },
    149         "scoring_rubric_justified": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The binary 'resolved' metric (all unit tests pass) is used without justifying it against alternatives such as partial credit or function-level pass rate; edge cases like vacuously passing tests are not discussed.",
    153           "source": "haiku"
    154         }
    155       },
    156       "robustness": {
    157         "contamination_resistance_designed": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The benchmark uses publicly available GitHub pull requests that may be in LLM training corpora; there is no temporal split, canary strings, or other contamination-mitigation mechanism, and the paper does not discuss this risk.",
    161           "source": "haiku"
    162         },
    163         "temporal_robustness_discussed": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The paper briefly notes intent to allow 'continuous updates and the creation of new versions of FEA-Bench' but does not discuss how the benchmark will be kept relevant as model capabilities improve or how gaming will be prevented.",
    167           "source": "haiku"
    168         },
    169         "failure_modes_discussed": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The paper extensively discusses failure modes of LLMs on the benchmark (format adherence, context length, retrieval), but does not discuss failure modes of the benchmark itself — e.g., tests that are too weak, task instances that are ambiguous, or scenarios where the gold patch is not the only valid solution.",
    173           "source": "haiku"
    174         },
    175         "baseline_implementations_provided": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Comprehensive baseline results are provided in Table 2 across 12 models and multiple settings, and evaluation code is released at https://github.com/microsoft/FEA-Bench, enabling reproduction of reported numbers.",
    179           "source": "haiku"
    180         }
    181       },
    182       "documentation": {
    183         "dataset_documentation_complete": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Appendix A provides detailed collection methodology, filtering criteria, and statistics for all 83 repositories (Tables 6 and 7); the pipeline stages are illustrated in Figure 3 with explicit quantification at each filtering step.",
    187           "source": "haiku"
    188         },
    189         "licensing_and_access_clear": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "All 83 source repositories have licenses listed in Table 6; the paper states 'The dataset and code for our proposed method will be made publicly available for academic research' at the Microsoft GitHub repository.",
    193           "source": "haiku"
    194         },
    195         "intended_use_specified": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The benchmark is explicitly scoped for evaluating LLMs on repository-level incremental feature development; the lite subset is designated for computationally expensive multi-round systems, and the Ethics Statement notes that Docker-based evaluation is recommended to prevent harm from generated code.",
    199           "source": "haiku"
    200         }
    201       }
    202     }
    203   },
    204   "claims": [
    205     {
    206       "claim": "Current LLMs perform significantly worse on FEA-Bench than other benchmarks; the best model (DeepSeek-R1) resolves only ~10% of task instances.",
    207       "evidence": "Table 2 shows DeepSeek-R1 at 9.92% resolved ratio under Oracle+Detailed settings; the paper compares this unfavorably with LLM performance on HumanEval and SWE-bench.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "FEA-Bench tasks involve substantially more new code generation than SWE-bench, with new components averaging 87.1 lines (8x more) and constituting 67.8% of edits.",
    212       "evidence": "Table 1 directly compares FEA-Bench vs. SWE-bench statistics: average lines of added components (87.1 vs. 10.9), percentage of new component lines (67.8% vs. 28.9%).",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "Detailed new component hints generally improve model performance over brief hints.",
    217       "evidence": "Table 2 shows detailed hints outperform brief in most model/setting combinations on the full benchmark, though the lite version shows the opposite trend, which the paper attributes to presentation issues.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Increasing context length from 27K to 40K tokens does not improve and slightly decreases model performance despite marginally better recall.",
    222       "evidence": "Table 3 shows GPT-4 and GPT-4o performance unchanged or slightly reduced at 40K vs. 27K, despite recall improving from 76.04% to 77.14%.",
    223       "supported": "strong"
    224     },
    225     {
    226       "claim": "Natural-format code edit generation significantly outperforms direct patch generation due to higher git apply success rates.",
    227       "evidence": "Table 4 shows GPT-4o Natural format resolves 6.14% vs. 1.86% for Patch, and apply success rates of 66.38% vs. 19.49% respectively.",
    228       "supported": "strong"
    229     },
    230     {
    231       "claim": "Task difficulty increases with the number of added functions; resolved ratio drops from 18.96% (1 function) to 5.47% (3+ functions).",
    232       "evidence": "Figure 5 shows the distribution of resolved vs. all instances by number of added functions with specific percentages reported in Section 6.5.",
    233       "supported": "strong"
    234     },
    235     {
    236       "claim": "The Agentless framework's improvement over BM25 retrieval is primarily attributable to better adherence to code editing format, not better retrieval.",
    237       "evidence": "Table 5 shows Agentless improvement correlates with higher %Apply success rates; the paper explicitly states this 'strongly correlates with the increased success rate of applying code edits.'",
    238       "supported": "moderate"
    239     }
    240   ],
    241   "methodology_tags": [
    242     "benchmark-eval",
    243     "observational"
    244   ],
    245   "key_findings": "FEA-Bench introduces 1,401 task instances from 83 Python GitHub repositories specifically targeting repository-level feature implementation, a gap between code completion and bug-fixing benchmarks. Current LLMs perform poorly, with the best model (DeepSeek-R1) resolving only ~10% of tasks, demonstrating that incremental development is a substantially harder capability than issue resolution. Code edit output format is a critical limiting factor — natural format outperforms patch format by 3-4x in resolved ratio due to higher git apply success rates. Counterintuitively, expanding the retrieved context from 27K to 40K tokens does not improve and slightly decreases performance despite marginally better recall, suggesting retrieval precision matters more than recall.",
    246   "red_flags": [
    247     {
    248       "flag": "No contamination analysis",
    249       "detail": "The benchmark uses publicly available GitHub pull requests that were likely in LLM training corpora. No temporal filtering, canary strings, or contamination testing is performed or discussed, making it impossible to assess how much LLMs are recalling versus reasoning."
    250     },
    251     {
    252       "flag": "No human baseline",
    253       "detail": "There is no human performance measurement on any subset of tasks. Without a human baseline, it is unclear whether 10% resolved ratio represents a meaningful capability gap or whether the tasks are unreasonably difficult even for humans."
    254     },
    255     {
    256       "flag": "Binary scoring not justified",
    257       "detail": "The all-or-nothing 'all unit tests must pass' metric is not justified against alternatives. Partial credit, function-level pass rates, or test coverage metrics are not considered; a single failing test disqualifies an otherwise correct implementation."
    258     },
    259     {
    260       "flag": "Single-round evaluation bias",
    261       "detail": "All experiments use single-round generation (Pass@1). The paper acknowledges this 'may introduce a certain level of bias' but does not report Pass@k or multi-round results, which would better characterize model capability on hard tasks."
    262     },
    263     {
    264       "flag": "Near-floor performance with no ceiling check",
    265       "detail": "Best model performance (~10%) is very close to floor, but the paper does not discuss whether this reflects benchmark design limitations, task impossibility, or genuine model failure. No easier subsets are fully analyzed beyond the lite version."
    266     },
    267     {
    268       "flag": "GPT-4o used in benchmark construction",
    269       "detail": "GPT-4o is used to classify PR intent as 'new feature' during dataset construction, and then GPT-4o is also evaluated as a benchmark participant. This creates a mild circularity where the model that filtered the benchmark's task distribution is then evaluated on it."
    270     }
    271   ],
    272   "cited_papers": [
    273     {
    274       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    275       "relevance": "The most directly related benchmark; FEA-Bench explicitly positions itself as complementary, covering feature implementation where SWE-bench covers bug fixing."
    276     },
    277     {
    278       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    279       "relevance": "Foundational standalone code generation benchmark used as contrast to motivate repository-level evaluation."
    280     },
    281     {
    282       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    283       "relevance": "Relevant benchmark addressing contamination concerns in code evaluation — a gap FEA-Bench does not address."
    284     },
    285     {
    286       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    287       "relevance": "Prior repository-level code completion benchmark directly compared and contrasted with FEA-Bench."
    288     },
    289     {
    290       "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories",
    291       "relevance": "Related evolving benchmark for repository-level code generation; demonstrates temporal update strategies FEA-Bench could adopt."
    292     },
    293     {
    294       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    295       "relevance": "Agent framework evaluated on FEA-Bench in Section 6.3; results show current SOTA agents have substantial room for improvement on feature implementation."
    296     },
    297     {
    298       "title": "RepoCoder: Repository-Level Code Completion through Iterative Retrieval and Generation",
    299       "relevance": "Prior work on repository-level retrieval-augmented code generation; relevant to FEA-Bench's BM25 retrieval baseline."
    300     },
    301     {
    302       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    303       "relevance": "Related complex code generation benchmark representing the expanding frontier of evaluation beyond HumanEval-style tasks."
    304     }
    305   ],
    306   "engagement_factors": {
    307     "practical_relevance": {
    308       "score": 3,
    309       "justification": "The benchmark directly addresses a real-world software engineering task (adding features to codebases), is publicly released with evaluation code, and results show concrete performance gaps in current tools like Copilot and Devin."
    310     },
    311     "surprise_contrarian": {
    312       "score": 2,
    313       "justification": "The finding that best-in-class models (DeepSeek-R1, o1) resolve only ~10% of tasks challenges optimism about LLMs for software engineering; the counter-intuitive result that more context decreases performance is also notable."
    314     },
    315     "fear_safety": {
    316       "score": 1,
    317       "justification": "The Ethics Statement warns that benchmark inference may generate code harmful to computer systems and recommends Docker isolation, a minor safety concern."
    318     },
    319     "drama_conflict": {
    320       "score": 1,
    321       "justification": "There is mild competitive framing against SWE-bench and implicit positioning of DeepSeek-R1 outperforming OpenAI's o1, but no explicit controversy."
    322     },
    323     "demo_ability": {
    324       "score": 2,
    325       "justification": "The benchmark is publicly released on GitHub with evaluation scripts, allowing practitioners to run their own models against it, though the computational cost is high."
    326     },
    327     "brand_recognition": {
    328       "score": 2,
    329       "justification": "Microsoft Research Asia and Peking University are recognized institutions; the benchmark evaluates high-profile models (GPT-4, GPT-4o, o1, DeepSeek-R1) which attract attention."
    330     }
    331   },
    332   "hn_data": {
    333     "threads": [
    334       {
    335         "hn_id": "43021849",
    336         "title": "Competitive Programming with Large Reasoning Models",
    337         "points": 16,
    338         "comments": 1,
    339         "url": "https://news.ycombinator.com/item?id=43021849"
    340       },
    341       {
    342         "hn_id": "9262882",
    343         "title": "Exploring Non-Homogeneity and Dynamicity of High Scale Cloud [pdf]",
    344         "points": 9,
    345         "comments": 0,
    346         "url": "https://news.ycombinator.com/item?id=9262882"
    347       },
    348       {
    349         "hn_id": "43025479",
    350         "title": "Competitive Programming with Large Reasoning Models",
    351         "points": 6,
    352         "comments": 0,
    353         "url": "https://news.ycombinator.com/item?id=43025479"
    354       },
    355       {
    356         "hn_id": "43072941",
    357         "title": "OpenAI: Competitive Programming with Large Reasoning Models",
    358         "points": 2,
    359         "comments": 1,
    360         "url": "https://news.ycombinator.com/item?id=43072941"
    361       },
    362       {
    363         "hn_id": "43022224",
    364         "title": "OpenAI o3 just scored 99.8% on CodeForces using brute-force",
    365         "points": 2,
    366         "comments": 1,
    367         "url": "https://news.ycombinator.com/item?id=43022224"
    368       },
    369       {
    370         "hn_id": "43030525",
    371         "title": "Competitive programming with large language models",
    372         "points": 2,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=43030525"
    375       },
    376       {
    377         "hn_id": "30685387",
    378         "title": "Infinite Wordle",
    379         "points": 2,
    380         "comments": 0,
    381         "url": "https://news.ycombinator.com/item?id=30685387"
    382       },
    383       {
    384         "hn_id": "43055820",
    385         "title": "Competitive Programming with Large Reasoning Models",
    386         "points": 1,
    387         "comments": 0,
    388         "url": "https://news.ycombinator.com/item?id=43055820"
    389       },
    390       {
    391         "hn_id": "42705257",
    392         "title": "What Hawking Radiation Looks Like as You Fall into a Black Hole",
    393         "points": 1,
    394         "comments": 0,
    395         "url": "https://news.ycombinator.com/item?id=42705257"
    396       }
    397     ],
    398     "top_points": 16,
    399     "total_points": 41,
    400     "total_comments": 3
    401   }
    402 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs