scan-v4.json (19942B)
1 { 2 "scan_version": 4, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "FeatBench: Towards More Realistic Evaluation of Feature-level Code Generation", 6 "authors": [ 7 "Haorui Chen", 8 "Chengze Li", 9 "Jia Li" 10 ], 11 "year": 2026, 12 "venue": "ACM Transactions on Software Engineering and Methodology (preprint)", 13 "arxiv_id": "2509.22237", 14 "doi": "10.1145/nnnnnnn.nnnnnnn" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims — 29.94% max resolved rate (Table 4), scope creep leading to regressions (Section 5.3, Figure 11), 157 tasks from 27 repos (Table 3) — are all directly supported by results in the paper.", 22 "source": "opus" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper makes causal claims: 'This high regression rate is driven by a behavioral pattern termed aggressive implementation' (Section 5.3). This causal link is inferred from manual case analysis of 122 failures by two author-evaluators, not from controlled experiments isolating the cause.", 28 "source": "opus" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "The title frames results broadly as 'Feature-level Code Generation' but the benchmark is Python-only with 27 repositories. The abstract does not mention the Python limitation. Only Section 6.2 acknowledges 'FeatBench focuses primarily on Python repositories' and 'our findings may not fully extrapolate to statically typed languages.'", 34 "source": "opus" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "Section 6.2 discusses three specific threats: quality of LLM-generated requirements (hallucination risk), reliability of test-based evaluation (false positives), and generalizability limitations. The temporal analysis in Section 5.2 considers data leakage as an alternative explanation for performance patterns.", 40 "source": "opus" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper measures test pass rates (Resolved%, FV%, RT%) and frames them as measuring agent performance on feature implementation tasks. The claims match the granularity of the measurements without inflating to broader constructs like 'software engineering capability.'", 46 "source": "opus" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "Section 6.2 'Threats to Validity' provides substantive discussion of three specific threats: quality of LLM-generated requirements, reliability of test-based evaluation, and generalizability/scope.", 54 "source": "opus" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Section 6.2 discusses threats specific to this study: LLM hallucination risk in requirement synthesis (mitigated by human verification showing 93.3% quality), false positive risk from sparse test suites (mitigated by dual F2P+P2P validation averaging 1694.6 tests), and Python-only scope limitation.", 60 "source": "opus" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "Section 6.2 explicitly states 'Currently, FeatBench focuses primarily on Python repositories' and 'our findings may not fully extrapolate to statically typed languages like Java or C++.' Section 4.3 notes the Agentless regression-testing stage was omitted. Future work section states plans to expand beyond Python.", 66 "source": "opus" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding or acknowledgments section is present in the paper. No grants, sponsors, or funding agencies are mentioned.", 74 "source": "opus" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations are listed: Tsinghua University, University of Electronic Science and Technology of China, and Nanjing University. The authors evaluate third-party products (GPT-5, DeepSeek, Trae-agent, Agentless) rather than their own.", 80 "source": "opus" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": false, 85 "justification": "Since no funding is disclosed, independence of funding cannot be verified. The lack of any funding statement makes this unanswerable.", 86 "source": "opus" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial disclosure statement is present in the paper.", 92 "source": "opus" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "'Feature-level code generation', 'Resolved Rate', 'F2P/P2P tests', 'aggressive implementation', and 'scope creep' are all defined precisely with examples in the paper.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Contributions are explicitly numbered: a benchmark with realistic NL-only inputs, an evolving automated pipeline, a rigorous data collection process, comprehensive test suites, and diverse domain coverage.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Table 1 systematically compares FeatBench against seven prior benchmarks across five dimensions; Section 2 explains how FeatBench differs from SWE-bench, FEA-Bench, NoCode-bench, and others in terms of specific design choices.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "benchmark-creation": { 118 "construct_design": { 119 "construct_validity_argued": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper argues that removing code hints forces agents to independently bridge user intent to code (measuring true feature implementation capability), and that F2P+P2P tests measure both functional correctness and backward compatibility—articulating why this design measures the claimed capability.", 123 "source": "haiku" 124 }, 125 "difficulty_distribution_characterized": { 126 "applies": true, 127 "answer": true, 128 "justification": "Figures 8 and 10 characterize difficulty empirically by repository complexity (files, LOC) and patch complexity, showing resolved rates ranging from 60-70% for small repos to 10-30% for large ones—difficulty is measured rather than assumed.", 129 "source": "haiku" 130 }, 131 "ceiling_floor_effects_checked": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not explicitly check for ceiling or floor effects at the task level; while the 29.94% maximum resolved rate rules out an obvious ceiling, the complexity analysis (Figs. 8-10) suggests near-zero success for many task subsets without flagging this as a floor-effect concern.", 135 "source": "haiku" 136 }, 137 "human_baseline_included": { 138 "applies": true, 139 "answer": false, 140 "justification": "No human performance baseline is provided; the human evaluation in Section 6.1 assesses requirement solvability (information completeness), not how well human developers would perform on the implementation tasks.", 141 "source": "haiku" 142 }, 143 "scoring_rubric_justified": { 144 "applies": true, 145 "answer": true, 146 "justification": "The dual F2P+P2P validation is justified: F2P verifies the new feature works, P2P ensures no regressions—this mirrors production-grade software engineering standards and the potential false-positive risk is acknowledged and partially mitigated by scale (avg. 1,694.6 test cases per task).", 147 "source": "haiku" 148 } 149 }, 150 "robustness": { 151 "contamination_resistance_designed": { 152 "applies": true, 153 "answer": true, 154 "justification": "Contamination resistance is explicitly designed in through a June 2024 cutoff for initial tasks (post-training-data) and a planned 6-month automated refresh cycle; Fig. 9 empirically validates that resolved rates are consistent across task creation periods.", 155 "source": "haiku" 156 }, 157 "temporal_robustness_discussed": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 5.2 explicitly tests temporal robustness by analyzing resolved rates across five creation periods (2308-2509) and finding no performance trend, validating the evolving benchmark strategy; the automated pipeline for 6-month updates is described.", 161 "source": "haiku" 162 }, 163 "failure_modes_discussed": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 6.2 discusses three benchmark failure modes: LLM hallucination in synthesized requirements, false positives from sparse F2P test suites, and generalizability limits to Python; the paper does not discuss active gaming strategies beyond data leakage.", 167 "source": "haiku" 168 }, 169 "baseline_implementations_provided": { 170 "applies": true, 171 "answer": true, 172 "justification": "The benchmark, automated pipeline, and all experimental results are open-sourced at https://github.com/TsinghuaISE/FeatBench; Section 4.3 details exact agent configurations (temperature=0.0, max 150 steps, specific tool sets) for reproducibility.", 173 "source": "haiku" 174 } 175 }, 176 "documentation": { 177 "dataset_documentation_complete": { 178 "applies": true, 179 "answer": true, 180 "justification": "Table 2 defines all task instance fields, Table 3 provides repository and instance statistics, Appendix A.1 lists all 27 repositories with licenses and instance counts, and Section 3.3 documents the three-stage collection pipeline in detail.", 181 "source": "haiku" 182 }, 183 "licensing_and_access_clear": { 184 "applies": true, 185 "answer": true, 186 "justification": "Appendix A.1 lists the license for all 27 source repositories (MIT, Apache-2.0, BSD-3-Clause, LGPL-3.0); the benchmark is publicly accessible at the provided GitHub URL.", 187 "source": "haiku" 188 }, 189 "intended_use_specified": { 190 "applies": true, 191 "answer": true, 192 "justification": "The intended use (evaluating LLM agents on feature-level code generation without code hints) is clearly stated; the threats section delimits what cannot be concluded (no generalization to non-Python languages), though the paper lacks explicit guidance on misuse cases.", 193 "source": "haiku" 194 } 195 } 196 } 197 }, 198 "claims": [ 199 { 200 "claim": "FeatBench poses a significant challenge: the top-performing configuration (Trae-agent + GPT-5) achieves a resolved rate of only 29.94%.", 201 "evidence": "Table 4 reports all agent-model combinations; Trae-agent + GPT-5 is the highest at 29.94%. All other combinations are lower.", 202 "supported": "strong" 203 }, 204 { 205 "claim": "Autonomous planning agents substantially outperform rigid pipeline-based agents on feature implementation tasks.", 206 "evidence": "Table 4: Trae-agent average resolved rate 22.13% vs. Agentless 10.83%; Trae-agent file localization 76.42% vs. Agentless 48.90%.", 207 "supported": "strong" 208 }, 209 { 210 "claim": "Regressive implementation is the predominant failure reason, accounting for 73.6% of analyzed failure cases, driven by 'aggressive implementation' scope creep.", 211 "evidence": "Manual inspection of 122 Trae-agent failure cases by two computer science researchers from the author team (Fig. 11); two case studies provided.", 212 "supported": "moderate" 213 }, 214 { 215 "claim": "Agent performance is strictly constrained by repository and patch complexity, with resolved rates dropping to 10-30% for repos exceeding 800 files or 300K LOC.", 216 "evidence": "Fig. 8 shows consistent inverse correlation between repo file count/LOC and resolved rate across all four LLMs; Fig. 10 shows near-zero success for patches spanning 5+ files or 50+ LOC.", 217 "supported": "strong" 218 }, 219 { 220 "claim": "Consistent performance across task creation periods confirms the absence of data leakage.", 221 "evidence": "Fig. 9 shows stable resolved rates for Trae-agent + Doubao-Seed-1.6 across five creation periods (2308-2509); only one agent-model pair tested for this claim.", 222 "supported": "moderate" 223 }, 224 { 225 "claim": "93.3% of sampled synthesized requirements are fully solvable and unambiguous.", 226 "evidence": "Human evaluation of 30 randomly sampled tasks by two author-team researchers using a 3-point Likert scale; average score 1.93/2.", 227 "supported": "moderate" 228 } 229 ], 230 "methodology_tags": [ 231 "benchmark-eval", 232 "case-study" 233 ], 234 "key_findings": "FeatBench is a 157-task benchmark for feature-level code generation that removes code hints and uses an automated evolving pipeline to prevent data contamination, sourcing tasks from 27 actively maintained Python repositories. State-of-the-art coding agents achieve a maximum resolved rate of only 29.94% (Trae-agent + GPT-5), highlighting the difficulty of realistic feature implementation. Autonomous planning agents substantially outperform pipeline-based agents but at much higher token cost. The dominant failure mode is 'regressive implementation' (73.6% of failures), where agents exhibit scope creep by implementing beyond the specified requirements and breaking existing tests—a behavior that occasionally produces superior architectural designs but more commonly introduces defects.", 235 "red_flags": [ 236 { 237 "flag": "Author-team failure analysis", 238 "detail": "The 73.6% regressive implementation finding is based on manual inspection of 122 failure cases by two researchers who are members of the author team, introducing potential categorization bias and lack of independence." 239 }, 240 { 241 "flag": "Temporal consistency tested on one configuration", 242 "detail": "The no-data-leakage claim (Fig. 9) is validated using only one agent-model pair (Trae-agent + Doubao-Seed-1.6); other combinations are not tested for temporal consistency." 243 }, 244 { 245 "flag": "Small human solvability sample", 246 "detail": "Only 30 of 157 tasks (19%) were human-evaluated for requirement quality, and the evaluators were from the author team rather than independent assessors." 247 }, 248 { 249 "flag": "No human performance baseline", 250 "detail": "The benchmark includes no measurement of how human developers would perform on these tasks, making it impossible to calibrate how far agents lag behind human-level performance." 251 }, 252 { 253 "flag": "Python-only scope with broad claims", 254 "detail": "All 157 tasks are from Python repositories, but the abstract and conclusions make general statements about 'feature-level code generation' without consistently foregrounding this constraint." 255 }, 256 { 257 "flag": "LLM-generated requirements not independently validated", 258 "detail": "The natural language requirements are reverse-engineered by an LLM from code diffs; while a human check was done by author-team members, there is no independent verification that the requirements are fully equivalent to the original developer intent." 259 } 260 ], 261 "cited_papers": [ 262 { 263 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 264 "relevance": "Primary related benchmark; FeatBench explicitly differentiates from SWE-bench's bug-fixing focus and extends to feature implementation" 265 }, 266 { 267 "title": "FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation", 268 "relevance": "Direct predecessor benchmark; FeatBench addresses its limitation of providing function signatures as code hints" 269 }, 270 { 271 "title": "NoCode-bench: A Benchmark for Evaluating Natural Language-Driven Feature Addition", 272 "relevance": "Direct predecessor; FeatBench addresses its use of documentation-update identifier hints rather than pure NL requirements" 273 }, 274 { 275 "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories", 276 "relevance": "Precedent for evolving benchmark design; FeatBench extends this concept to feature-level tasks" 277 }, 278 { 279 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 280 "relevance": "Foundational function-level code generation benchmark; establishes the baseline evaluation tradition FeatBench extends" 281 }, 282 { 283 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 284 "relevance": "One of two agent frameworks evaluated on FeatBench; represents the pipeline-based agent paradigm" 285 }, 286 { 287 "title": "Trae Agent: An LLM-based Agent for Software Engineering with Test-time Scaling", 288 "relevance": "The other agent framework evaluated; represents the autonomous planning agent paradigm and achieves best performance on FeatBench" 289 }, 290 { 291 "title": "SWE-bench Goes Live!", 292 "relevance": "Related evolving benchmark work; FeatBench's environment configuration pipeline is inspired by SWE-bench-Live's two-phase approach" 293 } 294 ], 295 "engagement_factors": { 296 "practical_relevance": { 297 "score": 2, 298 "justification": "Directly useful for evaluating coding agents on realistic tasks; the benchmark and pipeline are released, but practitioners need to set up Docker infrastructure to use it." 299 }, 300 "surprise_contrarian": { 301 "score": 1, 302 "justification": "Confirms the expectation that agents struggle with realistic tasks; the 'aggressive implementation' finding about scope creep is a moderately novel observation." 303 }, 304 "fear_safety": { 305 "score": 0, 306 "justification": "No safety or AI risk angle; the paper is about benchmark methodology for coding agents." 307 }, 308 "drama_conflict": { 309 "score": 1, 310 "justification": "Mild implicit critique of existing benchmarks (FEA-Bench, NoCode-bench) for being unrealistic, but framed diplomatically without direct confrontation." 311 }, 312 "demo_ability": { 313 "score": 2, 314 "justification": "Code and benchmark released on GitHub (https://github.com/TsinghuaISE/FeatBench); requires significant setup (Docker, API keys) but is reproducible." 315 }, 316 "brand_recognition": { 317 "score": 1, 318 "justification": "From Tsinghua University (well-known in AI research); evaluates GPT-5 and DeepSeek which have moderate brand recognition." 319 } 320 }, 321 "hn_data": { 322 "threads": [ 323 { 324 "hn_id": "44157561", 325 "title": "Yambda-5B – A Large-Scale Multi-Modal Dataset for Ranking and Retrieval", 326 "points": 3, 327 "comments": 0, 328 "url": "https://news.ycombinator.com/item?id=44157561" 329 }, 330 { 331 "hn_id": "44427694", 332 "title": "Can Large Language Models Help Students Prove Software Correctness?", 333 "points": 1, 334 "comments": 0, 335 "url": "https://news.ycombinator.com/item?id=44427694" 336 } 337 ], 338 "top_points": 3, 339 "total_points": 4, 340 "total_comments": 0 341 } 342 }