scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19144B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "FeatBench: Towards More Realistic Evaluation of Feature-level Code Generation",
      6     "authors": [
      7       "Haorui Chen",
      8       "Chengze Li",
      9       "Jia Li"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2509.22237",
     14     "doi": "10.1145/nnnnnnn.nnnnnnn"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims are verified: 29.94% top resolved rate (Table 4), aggressive implementation driving regressions (Section 5.3, Fig 11), and benchmark design rationale supported throughout.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The causal claim that autonomous planning enables superior performance is supported by direct head-to-head comparison across 4 models; the 'aggressive implementation causes regressions' finding is backed by manual inspection of 122 failure cases.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 6.2 explicitly bounds scope to Python repositories and notes findings may not extrapolate to statically typed languages; future work explicitly plans Java and Go expansion.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper attributes agent performance gaps to dynamic planning but does not discuss the confound that autonomous agents use 30x more tokens than pipeline-based agents; alternative explanations for regression failures (e.g., requirement ambiguity) are also not examined.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly distinguishes Resolved Rate (overall task completion via tests), Feature Validation Pass Rate (F2P tests only), and Regression Tests Pass Rate (P2P tests), carefully separating what each metric measures.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6.2 'Threats to Validity' is a dedicated subsection covering three specific threats with mitigation strategies.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Three specific threats are addressed: LLM hallucinations in requirement synthesis, false positives in test-based evaluation, and Python-only generalizability — each with concrete mitigations rather than generic disclaimers.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Explicit scope limits stated: Python only, modifying existing functions (not adding/deleting), 27 actively maintained repositories, feature implementation only (not bug-fixing).",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment section appears in the paper; no grants, industry support, or funding sources are mentioned anywhere.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are fully disclosed in the paper header: Tsinghua University, UESTC, and Nanjing University.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funder is identified, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement appears in the paper; no declaration of patents, equity, or consulting relationships.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: 'feature-level code generation' (Section 3.1), 'Resolved Rate' (Section 4.4), 'aggressive implementation' and 'scope creep' (Section 5.3), 'F2P' and 'P2P' tests (Section 3.3).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Contributions are explicitly enumerated at end of Section 1: a benchmark with realistic NL inputs, an evolving automated pipeline, and extensive experiments revealing current agent limitations.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 and Table 1 directly compare FeatBench against HumanEval, ClassEval, CoderEval, DevEval, EvoCodeBench, FEA-Bench, and NoCode-bench, explaining specifically how each falls short on code hints and static data.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "benchmark-creation": {
    118       "construct_design": {
    119         "construct_validity_argued": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper argues construct validity by contrast: existing benchmarks provide function signatures that bypass the core challenge of bridging user intent to code; FeatBench removes hints to measure this capability directly (Section 1 and 3.2).",
    123           "source": "haiku"
    124         },
    125         "difficulty_distribution_characterized": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "There is no upfront difficulty tiering of the 157 tasks; difficulty is only analyzed post-hoc through performance correlations with repository size and patch complexity (Figures 8 and 10), not characterized as a property of the benchmark items themselves.",
    129           "source": "haiku"
    130         },
    131         "ceiling_floor_effects_checked": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper does not explicitly check for ceiling or floor effects; while results show 7–30% resolved rates (implying no ceiling), this is not framed as a ceiling/floor analysis and no systematic check is reported.",
    135           "source": "haiku"
    136         },
    137         "human_baseline_included": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "There is no human performance baseline on the benchmark tasks; human evaluation in Section 6.1 only assesses requirement solvability (30 tasks, 2 annotators), not actual implementation performance.",
    141           "source": "haiku"
    142         },
    143         "scoring_rubric_justified": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Resolved Rate is justified by reference to SWE-bench and NoCode-bench standards; the dual F2P+P2P validation strategy is explicitly justified as preventing false positives from sparse test suites.",
    147           "source": "haiku"
    148         }
    149       },
    150       "robustness": {
    151         "contamination_resistance_designed": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Contamination resistance is a core design goal: a June 2024 cutoff, tasks from latest repository releases, automated 6-month update pipeline, and empirical validation via consistent performance across time periods (Fig 9).",
    155           "source": "haiku"
    156         },
    157         "temporal_robustness_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The paper explicitly discusses the automated pipeline for 6-month updates, plans to expand to more languages, and validates temporal robustness empirically by showing stable resolved rates across five creation-time periods.",
    161           "source": "haiku"
    162         },
    163         "failure_modes_discussed": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Section 6.2 discusses three failure modes: LLM hallucination in requirement synthesis, false positives from sparse test coverage, and limited scope (Python only); each is addressed with mitigations.",
    167           "source": "haiku"
    168         },
    169         "baseline_implementations_provided": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Results for all 4 models × 2 frameworks are fully reported (Table 4), and the benchmark, pipeline, and all experimental results are released at the GitHub URL provided.",
    173           "source": "haiku"
    174         }
    175       },
    176       "documentation": {
    177         "dataset_documentation_complete": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Detailed collection methodology covers all three pipeline stages (data curation, environment configuration, test validation), filtering criteria at repository/PR levels, and full repository list with licenses in Appendix A.1.",
    181           "source": "haiku"
    182         },
    183         "licensing_and_access_clear": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "GitHub URL is provided; Appendix A.1 lists licenses for all 27 source repositories (MIT, Apache-2.0, BSD-3-Clause, LGPL-3.0); the benchmark and pipeline are released as open source.",
    187           "source": "haiku"
    188         },
    189         "intended_use_specified": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The paper specifies the benchmark is intended for evaluating feature-level code generation agents, not bug-fixing; Python only; and explicitly states limitations on extrapolating conclusions to other languages or task types.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "The top-performing agent configuration (Trae-agent + GPT-5) achieves only 29.94% resolved rate on FeatBench.",
    201       "evidence": "Table 4 reports Trae-agent + GPT-5 at 29.94% Resolved%; all other configurations are lower.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Autonomous planning-based agents substantially outperform rigid pipeline-based agents on feature implementation.",
    206       "evidence": "Trae-agent average 22.13% vs. Agentless average 10.83% across all models (Table 4); also superior on FV% (41.72% vs 21.66%) and File% (76.42% vs 48.90%).",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Regressive implementation accounts for 73.6% of analyzed failure cases, driven by 'aggressive implementation' / scope creep.",
    211       "evidence": "Manual inspection of 122 failure cases by 2 researchers (Fig 11); qualitative case studies support the pattern (Fig 12).",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "Performance degrades sharply with repository complexity; resolved rates reach 60–70% for small repos but 10–30% for repos over 800 files or 300k LOC.",
    216       "evidence": "Fig 8 shows clear inverse correlation between repository file count/LOC and resolved rate across all 4 models.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Consistent resolved rates across task creation time periods validate the absence of data leakage.",
    221       "evidence": "Fig 9 shows stable resolved rate for Trae-agent + Doubao-Seed-1.6 across 5 time periods from 2023-08 to 2025-09.",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "93.3% of benchmark tasks have comprehensive and unambiguous synthesized requirements (human evaluation).",
    226       "evidence": "2 annotators evaluated 30 randomly sampled tasks (19% of total); 28/30 scored 2 (fully solvable), average 1.93/2.",
    227       "supported": "moderate"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "benchmark-eval",
    232     "benchmark-creation",
    233     "case-study"
    234   ],
    235   "key_findings": "FeatBench reveals that current SOTA coding agents struggle with realistic feature implementation, achieving a maximum resolved rate of only 29.94% (Trae-agent + GPT-5), with most configurations below 20%. Autonomous planning agents substantially outperform rigid pipeline-based agents but consume 30x more tokens. The dominant failure mode is regressive implementation (73.6% of failures), caused by agents exhibiting scope creep — proactively refactoring beyond the stated requirement — though this same behavior occasionally produces architecturally superior solutions. Agent performance is tightly constrained by repository and patch complexity, with near-zero success on large repositories (>800 files, >300k LOC) or multi-file patches (>5 files, >50 LOC).",
    236   "red_flags": [
    237     {
    238       "flag": "Tiny failure analysis sample",
    239       "detail": "The 73.6% regressive failure finding is based on manual inspection of only 122 cases by 2 researchers with no inter-rater reliability reported; the overall benchmark has 157 tasks, so this covers most of them, but the annotation process lacks rigor checks."
    240     },
    241     {
    242       "flag": "Human solvability validation underpowered",
    243       "detail": "Only 30 of 157 tasks (~19%) received human solvability evaluation; extrapolating 93.3% quality to the full set is tenuous, especially given the automated LLM-based requirement synthesis."
    244     },
    245     {
    246       "flag": "No human performance baseline",
    247       "detail": "Without human performance on the benchmark tasks, it is impossible to know whether 29.94% represents impressive or poor agent performance relative to what the tasks require."
    248     },
    249     {
    250       "flag": "Agentless evaluated with modified pipeline",
    251       "detail": "The regression-testing reranking stage was omitted from Agentless evaluation 'because supporting this stage requires substantial infrastructure adaptation beyond our scope,' which may systematically disadvantage Agentless and inflate the performance gap."
    252     },
    253     {
    254       "flag": "Token budget confound for agent comparison",
    255       "detail": "Trae-agent uses 1.07M–2.90M tokens vs. Agentless at ~0.06M; the paper attributes performance differences solely to 'dynamic planning capability' without controlling for token budget, leaving this alternative explanation undiscussed."
    256     },
    257     {
    258       "flag": "No funding disclosure",
    259       "detail": "No funding source or competing interests are declared even though the work evaluates proprietary models from ByteDance and Qwen/Alibaba."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    265       "relevance": "Primary baseline benchmark for software engineering agents; FeatBench is positioned as complementary with a dedicated focus on feature implementation rather than bug-fixing."
    266     },
    267     {
    268       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    269       "relevance": "Foundational function-level benchmark that FeatBench explicitly supersedes with repository-level, hint-free tasks."
    270     },
    271     {
    272       "title": "FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation",
    273       "relevance": "Direct predecessor that FeatBench addresses by removing code hints (signatures) that FEA-Bench provides."
    274     },
    275     {
    276       "title": "NoCode-bench: A Benchmark for Evaluating Natural Language-Driven Feature Addition",
    277       "relevance": "Most direct comparison; FeatBench argues NoCode-bench still uses identifier hints whereas FeatBench uses pure NL requirements."
    278     },
    279     {
    280       "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-world Code Repositories",
    281       "relevance": "Earlier evolving benchmark approach; FeatBench extends the evolving paradigm to feature-level tasks."
    282     },
    283     {
    284       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    285       "relevance": "One of the two agent frameworks evaluated on FeatBench; represents the pipeline-based paradigm."
    286     },
    287     {
    288       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    289       "relevance": "Autonomous agent paradigm representative; provides context for the two agent paradigms compared in experiments."
    290     },
    291     {
    292       "title": "SWE-bench Goes Live!",
    293       "relevance": "Live benchmark methodology that FeatBench's evolving pipeline draws on for environment configuration approach."
    294     },
    295     {
    296       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-world Code Repositories",
    297       "relevance": "Repository-level benchmark that provides function signatures (code hints); cited as example of limitations FeatBench addresses."
    298     },
    299     {
    300       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
    301       "relevance": "Intermediate-scope benchmark in the evolution from function-level to repository-level evaluation; cited in benchmark lineage."
    302     }
    303   ],
    304   "engagement_factors": {
    305     "practical_relevance": {
    306       "score": 3,
    307       "justification": "Directly addresses a gap practitioners using GitHub Copilot/Cursor face: existing benchmarks don't reflect real development workflows; released code enables immediate reuse."
    308     },
    309     "surprise_contrarian": {
    310       "score": 2,
    311       "justification": "The 'aggressive implementation' finding is genuinely surprising — agents fail not by misunderstanding but by doing too much, and this occasionally produces architecturally better code than the human patch."
    312     },
    313     "fear_safety": {
    314       "score": 0,
    315       "justification": "No AI safety or risk concerns raised; purely a software engineering evaluation benchmark."
    316     },
    317     "drama_conflict": {
    318       "score": 1,
    319       "justification": "Mild conflict angle: existing popular benchmarks (FEA-Bench, NoCode-bench) are critiqued as unrealistic and contaminated, but framing is constructive rather than confrontational."
    320     },
    321     "demo_ability": {
    322       "score": 2,
    323       "justification": "Code and pipeline released at GitHub; practitioners can run agents on the benchmark immediately, though Docker environment setup adds friction."
    324     },
    325     "brand_recognition": {
    326       "score": 1,
    327       "justification": "Tsinghua University affiliation is recognized but not a top-tier brand name; no major lab or product co-authorship."
    328     }
    329   },
    330   "hn_data": {
    331     "threads": [
    332       {
    333         "hn_id": "44157561",
    334         "title": "Yambda-5B – A Large-Scale Multi-Modal Dataset for Ranking and Retrieval",
    335         "points": 3,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=44157561"
    338       },
    339       {
    340         "hn_id": "44427694",
    341         "title": "Can Large Language Models Help Students Prove Software Correctness?",
    342         "points": 1,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=44427694"
    345       }
    346     ],
    347     "top_points": 3,
    348     "total_points": 4,
    349     "total_comments": 0
    350   }
    351 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs