scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32581B)
      1 {
      2   "paper": {
      3     "title": "FeatBench: Towards More Realistic Evaluation of Feature-level Code Generation",
      4     "authors": ["Haorui Chen", "Chengze Li", "Jia Li"],
      5     "year": 2026,
      6     "venue": "ACM TOSEM (submitted)",
      7     "arxiv_id": "2509.22237"
      8   },
      9   "scan_version": 3,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "FeatBench introduces an evolving benchmark for feature-level code generation with 157 tasks from 27 repositories, using only natural language requirements without code hints. The best agent configuration (Trae-agent + GPT-5) achieves only 29.94% resolved rate, with autonomous planning-based agents substantially outperforming pipeline-based approaches. The dominant failure mode (73.6%) is regressive implementation driven by 'scope creep,' where agents over-engineer beyond the user's explicit intent, breaking existing features.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper states 'Our code is available at https://github.com/TsinghuaISE/FeatBench' and 'We release FeatBench, our automated pipeline, and all experimental results to facilitate further community research.'"
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The benchmark dataset (157 tasks) is released along with all experimental results via the GitHub repository. The paper explicitly states all data is released."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Each benchmark task includes a Docker container (image_key field in Table 2) providing a reproducible runtime environment. Model settings are specified: temperature=0.0, top-p=1.0. The environment configuration pipeline (Section 3.3, Stage 2) is documented in detail."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper provides a GitHub link and describes the pipeline in detail, but does not contain step-by-step reproduction instructions (no 'Reproducing Results' section or specific commands to run the evaluation)."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Table 4 reports only point estimates (e.g., 29.94% resolved rate) with no confidence intervals, error bars, or uncertainty measures."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims ('Trae-agent consistently outperforms Agentless') based solely on comparing raw percentages without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper provides sufficient context for effect sizes: 'average FV% of 41.72% vs 21.66%', 'File% of 76.42% vs 48.90%', and performance breakdowns showing resolution rates of 60-70% for small repos degrading to 10-30% for large ones."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The final benchmark size of 157 tasks from 27 repositories is a result of the authors' filtering pipeline, not a justified sample size. The paper provides no power analysis or discussion of whether 157 tasks are sufficient for the conclusions drawn."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "All results appear to be single-run evaluations. No variance, standard deviation, or spread measures are reported across experimental runs. While temperature=0.0 reduces randomness, agentic systems with tool calls and environment interactions can still vary."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Two agent frameworks (Trae-agent and Agentless) are compared across four LLMs, providing cross-comparisons between frameworks and models."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "All evaluated models are very recent: GPT-5 (2025-08-07), DeepSeek V3.1, Doubao-Seed-1.6 (250615), Qwen3-Coder-Flash (2025-07-28). Both agent frameworks are recent SOTA systems."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is performed. The paper does not ablate benchmark design choices (e.g., effect of removing code hints vs. including them) or agent components. The comparison across agents and models is not an ablation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Six metrics are reported: Resolved%, Patch Apply Rate (Applied%), Regression Tests Pass Rate (RT%), Feature Validation Pass Rate (FV%), File-level Localization Success Rate (File%), and Token consumption."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 5.3 describes manual inspection of 122 failure cases by two computer science researchers from the author team, categorizing failures into three types. Section 6.1 reports a human solvability evaluation of 30 sampled tasks."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The benchmark test suites (F2P and P2P tests) are explicitly 'withheld from the agent during inference to prevent data leakage' (Section 3.1). The agents are not tuned on FeatBench; they are evaluated as-is."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by repository complexity (Figures 8, 10), patch complexity (Figure 10), creation time period (Figure 9), and per-repository statistics (Appendix A.1)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5.3 (RQ3) is a dedicated failure analysis. Figure 11 categorizes failures (73.6% regressive, 17.8% incomplete, 8.5% misunderstood). Figure 12 provides detailed case studies of both failure and success patterns."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports significant negative findings: agents struggle with realistic tasks (max 29.94%), introduce widespread regressions (73.6% of failures), and performance collapses for large repos and patches. The 'aggressive implementation' pattern is presented as both positive and negative."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims — 29.94% max resolved rate (Table 4), scope creep leading to regressions (Section 5.3, Figure 11), 157 tasks from 27 repos (Table 3) — are all directly supported by results in the paper."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims: 'This high regression rate is driven by a behavioral pattern termed aggressive implementation' (Section 5.3). This causal link is inferred from manual case analysis of 122 failures by two author-evaluators, not from controlled experiments isolating the cause."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title frames results broadly as 'Feature-level Code Generation' but the benchmark is Python-only with 27 repositories. The abstract does not mention the Python limitation. Only Section 6.2 acknowledges 'FeatBench focuses primarily on Python repositories' and 'our findings may not fully extrapolate to statically typed languages.'"
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 6.2 discusses three specific threats: quality of LLM-generated requirements (hallucination risk), reliability of test-based evaluation (false positives), and generalizability limitations. The temporal analysis in Section 5.2 considers data leakage as an alternative explanation for performance patterns."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures test pass rates (Resolved%, FV%, RT%) and frames them as measuring agent performance on feature implementation tasks. The claims match the granularity of the measurements without inflating to broader constructs like 'software engineering capability.'"
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Three of four models have specific API identifiers with dates: gpt-5-2025-08-07, doubao-seed-1-6-250615, qwen3-coder-flash-2025-07-28. However, DeepSeek V3.1 is identified only as 'deepseek-chat' — a rolling API endpoint without a snapshot date or version pin."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix A.2 provides full prompt templates for the benchmark construction pipeline (feature identification, requirement synthesis, relevant file identification, environment configuration). The evaluation uses published open-source agent frameworks (Agentless, Trae-agent) whose prompts are in their cited repositories."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.2: 'all LLMs are configured with a temperature of 0.0 and a top-p value of 1.0.' Section 4.3: 'we imposed a maximum limit of 150 steps per task' for Trae-agent and describe supplementary tools provided."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Agentless is described as 'a rigid, two-stage pipeline consisting of feature localization and patch generation' (Section 4.3). Trae-agent is described as 'an autonomous planning agent, utilizing a ReAct loop to dynamically plan execution paths and utilize tools' with 'supplementary tools, including ckg and json_edit_tool.'"
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.3 documents the full three-stage pipeline with filtering counts: 44 repos → 27 after repo-level filtering; 675 releases analyzed; 297 PRs extracted after PR-level filtering; 157 final tasks after F2P validation. Each filtering criterion is specified."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6.2 'Threats to Validity' provides substantive discussion of three specific threats: quality of LLM-generated requirements, reliability of test-based evaluation, and generalizability/scope."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6.2 discusses threats specific to this study: LLM hallucination risk in requirement synthesis (mitigated by human verification showing 93.3% quality), false positive risk from sparse test suites (mitigated by dual F2P+P2P validation averaging 1694.6 tests), and Python-only scope limitation."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6.2 explicitly states 'Currently, FeatBench focuses primarily on Python repositories' and 'our findings may not fully extrapolate to statically typed languages like Java or C++.' Section 4.3 notes the Agentless regression-testing stage was omitted. Future work section states plans to expand beyond Python."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states 'We release FeatBench, our automated pipeline, and all experimental results.' The benchmark data (task instances, test suites, Docker images) and experimental outputs are released via GitHub."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.3 describes the three-stage collection pipeline in detail: repository selection criteria (3+ releases, test suite, no tutorials, cutoff June 2024), release filtering, PR-level criteria (modifies Python files, includes tests, modifies existing functions without adding or deleting functions), and F2P validation."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Repository selection is described (44 candidate repos filtered to 27 based on 4 criteria, Section 3.3). Human evaluators for solvability are described: 'two evaluators from the author team, both computer science researchers with over 2 years of Python development experience' (Section 6.1); they were paid $7.5/hour."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Full pipeline documented: 44 repos → 27 (repo-level filtering) → 675 releases → filtered releases → 297 PRs (PR-level filtering) → 157 tasks (F2P validation). Each stage has explicit criteria and the pipeline is illustrated in Figure 3."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper. No grants, sponsors, or funding agencies are mentioned."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: Tsinghua University, University of Electronic Science and Technology of China, and Nanjing University. The authors evaluate third-party products (GPT-5, DeepSeek, Trae-agent, Agentless) rather than their own."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Since no funding is disclosed, independence of funding cannot be verified. The lack of any funding statement makes this unanswerable."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state the training data cutoff dates for any of the four evaluated models (GPT-5, DeepSeek V3.1, Doubao-Seed-1.6, Qwen3-Coder-Flash). They discuss contamination conceptually but not the specific cutoffs."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 5.2 directly analyzes whether performance varies by task creation period (Figure 9), arguing 'consistent performance across task creation times validates the absence of data leakage.' The evolving benchmark design explicitly addresses train/test overlap."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Contamination is a core focus of the paper. The evolving benchmark design (new versions every 6 months from latest repositories) specifically mitigates contamination. Section 5.2 validates this with temporal analysis showing no performance trend correlated with task age."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The paper has no human subjects study. The solvability evaluation (Section 6.1) uses two author-team members as evaluators in an internal quality check, not a human subjects experiment."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human subjects study is conducted. The evaluators in Section 6.1 are from the author team performing an internal quality assessment."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human subjects study. The paper describes evaluator qualifications (CS researchers with 2+ years Python experience) but this is for an internal quality check, not a human subjects study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human subjects study is conducted."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects experiment is conducted."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects experiment is conducted."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study is conducted."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table 4 reports token consumption per task for each configuration (ranging from 0.05M to 2.90M). Section 3.2 reports benchmark construction cost of '$0.28 per sample using DeepSeek V3.1.' Figure 7 plots token consumption vs. resolved rate."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Per-task token averages are reported but total computational budget (total API spend, total GPU hours, wall-clock time for all experiments) is not stated."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "All results appear to be from single runs. While temperature=0.0 reduces randomness, agentic systems with tool calls and environment interactions can still exhibit non-determinism. No multi-run analysis is performed."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not explicitly stated. Results are implicitly single-run evaluations with no indication of repeated trials."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search is described. The agent frameworks are used with their default settings plus specified temperature=0.0 and max_steps=150. No search budget or justification for these choices is provided."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "All 8 configurations (2 agents × 4 models) are evaluated and reported in Table 4. No cherry-picking — all results shown, including poor-performing configurations."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Multiple comparisons are made across 8 configurations and various breakdown dimensions, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors evaluate third-party agents on their own benchmark. They do not discuss potential bias in benchmark construction that could favor certain agent paradigms or the risk that their benchmark design choices systematically advantage or disadvantage particular approaches."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 7 explicitly plots token consumption vs. resolved rate for all configurations. The paper discusses the cost-performance tradeoff: 'Trae-agent exhibits a near-linear trend where achieving a higher resolved rate requires proportionally greater token consumption' (Section 5.1)."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The entire paper is framed around benchmark construct validity — arguing existing benchmarks fail to measure realistic feature implementation due to code hints and static data. Section 6.1 validates solvability of task requirements (93.3% rated fully solvable). Section 6.2 discusses validity threats."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "The paper systematically compares 4 models across 2 frameworks (Table 4), allowing isolation of scaffold effects from model effects. The framework paradigm difference (autonomous vs. pipeline) is explicitly analyzed as a variable, not a confound."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Section 5.2 analyzes performance across five creation time periods (2308 to 2509) in Figure 9, showing consistent resolved rates. The evolving benchmark design explicitly addresses temporal leakage by sourcing tasks from recent repository updates."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Eliminating feature leakage is a core contribution. Task inputs are 'strictly devoid of code hints' (Section 3.2). Test cases are 'withheld from the agent during inference' (Section 3.1). The paper contrasts this against existing benchmarks that leak through function signatures."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Tasks are drawn from 27 repositories with highly uneven distribution: conan-io/conan contributes 56/157 tasks (35.7%). The paper does not discuss whether tasks from the same repository are independent or whether repository-level clustering biases results."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 5.2 applies temporal analysis as a concrete detection method: comparing resolved rates across five task creation periods to check for performance trends that would indicate training data overlap. Figure 9 shows the analysis."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "FeatBench poses a significant challenge to SOTA agents, with the highest resolved rate reaching only 29.94% (Trae-agent + GPT-5).",
    364       "evidence": "Table 4 shows Trae-agent + GPT-5 achieves 29.94% Resolved%, the highest among all 8 configurations tested. The average across all Trae-agent configurations is 22.13%; the Agentless average is 10.83%.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Autonomous planning-based agents substantially outperform pipeline-based agents on feature implementation tasks.",
    369       "evidence": "Table 4: Trae-agent averages 22.13% resolved vs. Agentless 10.83%. FV% averages 41.72% vs 21.66%, File% averages 76.42% vs 48.90%. Consistent advantage across all four models (Section 5.1).",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Agent performance is strictly constrained by repository and patch complexity, with resolved rates degrading sharply for large repos.",
    374       "evidence": "Figures 8 and 10: Resolved rates reach 60-70% for repos with <200 files but drop to 10-30% for repos with >800 files. Performance collapses for patches spanning >5 files or >50 LOC (Section 5.2).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Consistent performance across task creation periods confirms the absence of data leakage.",
    379       "evidence": "Figure 9 shows resolved rates across five time periods (2308-2509) remain remarkably consistent for Trae-agent + Doubao-Seed-1.6, with no upward trend that would suggest memorization (Section 5.2).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Regressive implementation is the predominant failure reason, accounting for 73.6% of analyzed failure cases.",
    384       "evidence": "Manual analysis of 122 Trae-agent failure cases by two CS researchers classified 73.6% as regressive implementation, 17.8% as incomplete implementation, and 8.5% as misunderstood requirements (Section 5.3, Figure 11).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "93.3% of synthesized task requirements are comprehensive and unambiguous.",
    389       "evidence": "Human evaluation of 30 sampled tasks (19% of benchmark): 28/30 scored 2 (fully solvable), 2/30 scored 1, none scored 0. Average score 1.93/2.0 (Section 6.1).",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Severe repository distribution imbalance",
    396       "detail": "conan-io/conan contributes 56 of 157 tasks (35.7%), heavily skewing benchmark results toward a single DevOps project. Overall results may disproportionately reflect agent performance on Conan specifically rather than feature implementation broadly."
    397     },
    398     {
    399       "flag": "No statistical significance testing",
    400       "detail": "All comparative claims (Trae-agent vs. Agentless, model rankings) are based on comparing raw percentages without any significance tests. With only 157 tasks and no repeated runs, observed differences could reflect noise."
    401     },
    402     {
    403       "flag": "Author-team evaluators for human validation",
    404       "detail": "Both the solvability study (Section 6.1) and the failure analysis (Section 5.3) were conducted by researchers from the author team. No independent evaluators were used, creating potential bias in the quality assessment and failure categorization."
    405     },
    406     {
    407       "flag": "Single-run evaluation without variance",
    408       "detail": "All experimental results are from single runs. While temperature=0.0 reduces randomness, agentic systems with 150-step ReAct loops, tool calls, and environment interactions can exhibit non-deterministic behavior. No stability analysis is provided."
    409     },
    410     {
    411       "flag": "Temporal analysis on single model only",
    412       "detail": "The data leakage temporal analysis (Figure 9) is shown only for Trae-agent + Doubao-Seed-1.6, not for all 8 configurations. This selective reporting weakens the claim that data leakage is absent."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    418       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    419       "year": 2024,
    420       "relevance": "Foundational benchmark for repository-level code generation that FeatBench extends to feature implementation with stricter task formulations."
    421     },
    422     {
    423       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    424       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    425       "year": 2024,
    426       "relevance": "Pipeline-based agent framework evaluated in FeatBench, representing the rigid localization-repair paradigm for code generation."
    427     },
    428     {
    429       "title": "Trae Agent: An LLM-based Agent for Software Engineering with Test-time Scaling",
    430       "authors": ["Trae Research Team", "Pengfei Gao", "Zhao Tian"],
    431       "year": 2025,
    432       "arxiv_id": "2507.23370",
    433       "relevance": "Autonomous planning agent evaluated in FeatBench, achieving the highest resolved rate and demonstrating the ReAct-based paradigm."
    434     },
    435     {
    436       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    437       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    438       "year": 2024,
    439       "relevance": "Influential autonomous agent framework using ReAct loop for software engineering tasks."
    440     },
    441     {
    442       "title": "Evaluating Large Language Models Trained on Code",
    443       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    444       "year": 2021,
    445       "arxiv_id": "2107.03374",
    446       "relevance": "HumanEval benchmark — seminal function-level code generation benchmark that FeatBench builds upon by moving to feature-level."
    447     },
    448     {
    449       "title": "Evocodebench: An evolving code generation benchmark aligned with real-world code repositories",
    450       "authors": ["Jia Li", "Ge Li", "Xuanming Zhang", "Yihong Dong", "Zhi Jin"],
    451       "year": 2024,
    452       "relevance": "Evolving code generation benchmark concept that FeatBench extends to feature-level with an automated renewal pipeline."
    453     },
    454     {
    455       "title": "Fea-bench: A benchmark for evaluating repository-level code generation for feature implementation",
    456       "authors": ["Wei Li", "Xin Zhang", "Zhongxin Guo"],
    457       "year": 2025,
    458       "relevance": "Prior feature-level benchmark whose limitations (reliance on code hints like function signatures) FeatBench directly addresses."
    459     },
    460     {
    461       "title": "NoCode-bench: A Benchmark for Evaluating Natural Language-Driven Feature Addition",
    462       "authors": ["Le Deng", "Zhonghao Jiang", "Jialun Cao", "Michael Pradel", "Zhongxin Liu"],
    463       "year": 2025,
    464       "relevance": "Concurrent feature-level benchmark using documentation-update specifications; FeatBench argues its requirement format is more realistic."
    465     },
    466     {
    467       "title": "SWE-bench Goes Live!",
    468       "authors": ["Linghao Zhang", "Shilin He", "Chaoyun Zhang"],
    469       "year": 2025,
    470       "relevance": "Live evolution of SWE-bench that inspired FeatBench's evolving benchmark approach and environment configuration methodology."
    471     },
    472     {
    473       "title": "SWE-PolyBench: A multi-language benchmark for repository level evaluation of coding agents",
    474       "authors": ["Muhammad Shihab Rashid", "Christian Bock", "Yuan Zhuang"],
    475       "year": 2025,
    476       "relevance": "Multi-language benchmark extension that FeatBench does not yet address; FeatBench is currently Python-only."
    477     },
    478     {
    479       "title": "ReAct: Synergizing reasoning and acting in language models",
    480       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"],
    481       "year": 2022,
    482       "relevance": "Agent reasoning framework underlying Trae-agent's architecture; the autonomous planning paradigm evaluated in FeatBench."
    483     },
    484     {
    485       "title": "Classeval: A manually-crafted benchmark for evaluating llms on class-level code generation",
    486       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    487       "year": 2023,
    488       "relevance": "Class-level code generation benchmark representing the intermediate complexity step between function-level and feature-level evaluation."
    489     }
    490   ],
    491   "engagement_factors": {
    492     "practical_relevance": {
    493       "score": 2,
    494       "justification": "Directly useful for evaluating coding agents on realistic tasks; the benchmark and pipeline are released, but practitioners need to set up Docker infrastructure to use it."
    495     },
    496     "surprise_contrarian": {
    497       "score": 1,
    498       "justification": "Confirms the expectation that agents struggle with realistic tasks; the 'aggressive implementation' finding about scope creep is a moderately novel observation."
    499     },
    500     "fear_safety": {
    501       "score": 0,
    502       "justification": "No safety or AI risk angle; the paper is about benchmark methodology for coding agents."
    503     },
    504     "drama_conflict": {
    505       "score": 1,
    506       "justification": "Mild implicit critique of existing benchmarks (FEA-Bench, NoCode-bench) for being unrealistic, but framed diplomatically without direct confrontation."
    507     },
    508     "demo_ability": {
    509       "score": 2,
    510       "justification": "Code and benchmark released on GitHub (https://github.com/TsinghuaISE/FeatBench); requires significant setup (Docker, API keys) but is reproducible."
    511     },
    512     "brand_recognition": {
    513       "score": 1,
    514       "justification": "From Tsinghua University (well-known in AI research); evaluates GPT-5 and DeepSeek, which have moderate brand recognition."
    515     }
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs