ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30483B)


      1 {
      2   "paper": {
      3     "title": "FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation",
      4     "authors": [
      5       "Wei Li",
      6       "Xin Zhang",
      7       "Zhongxin Guo",
      8       "Shaoguang Mao",
      9       "Wen Luo",
     10       "Guangyue Peng",
     11       "Yangyu Huang",
     12       "Houfeng Wang",
     13       "Scarlett Li"
     14     ],
     15     "year": 2025,
     16     "venue": "Annual Meeting of the Association for Computational Linguistics",
     17     "arxiv_id": "2503.06680",
     18     "doi": "10.48550/arXiv.2503.06680"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "FEA-Bench introduces 1,401 benchmark tasks from 83 GitHub repositories for evaluating LLMs on repository-level feature implementation, requiring substantially more code generation than bug-fix-oriented SWE-bench. The best model (DeepSeek-R1) resolves only 9.92% of tasks in the oracle setting, demonstrating that feature implementation is significantly harder than bug fixing. Natural output format substantially outperforms patch format, and increasing context length from 27K to 40K tokens does not improve performance. Task difficulty scales with complexity: resolve rate drops from ~19% for single-function tasks to ~5% for 3+ function tasks.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper states 'Our code will soon be publicly available at https://github.com/microsoft/FEA-Bench' — this is a future promise, not an actual release. Per scan criteria, 'will be released' counts as NO."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper promises future release: 'We will publicly release our data collection and evaluation codebase, allowing FEA-Bench to be continuously updated.' The dataset is described but not yet available."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions vLLM framework on 8-GPU NVIDIA A100, and gives generation settings (Appendix B.2), but no requirements.txt, Dockerfile, or detailed dependency specification is provided. Mentioning hardware and a framework is insufficient to recreate the environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The paper describes the pipeline and evaluation methodology but does not include a README or script-level instructions for replicating experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 2-5 are point estimates (e.g., '9.92%') with no confidence intervals, error bars, or uncertainty measures."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper makes comparative claims (e.g., 'DeepSeek-V3 and R1 models achieve the best performance, significantly outperforming OpenAI's GPT-4 and o1 series') based solely on comparing raw numbers with no statistical significance tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results are reported as raw resolved ratios in tables. No effect sizes (Cohen's d, odds ratios, or framed percentage improvements) are explicitly computed for model comparisons."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The benchmark contains 1,401 task instances and the lite version 200, but no justification is given for why these sizes are adequate. No power analysis is discussed."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The Limitations section explicitly acknowledges: 'the experimental results are based on a single round generation, akin to Pass@1, which may introduce a certain level of bias.' No variance across runs is reported."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares 13 models across multiple settings (Table 2) and also compares direct generation against the Agentless framework (Table 5). Multiple models serve as baselines for each other."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Evaluated models include DeepSeek-R1, DeepSeek-V3, GPT-4o, o1, and Qwen2.5-Coder — all recent and competitive models at time of writing."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper systematically varies context settings (Oracle vs BM25), hint levels (Detailed vs Brief), output formats (Natural vs Patch), and context lengths (27K vs 40K), analyzing the effect of each factor independently."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper reports resolved ratio (unit test pass rate), %Apply (patch application success rate), and retrieval metrics (precision, recall) across different experiments."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Evaluation is entirely automated via unit test execution. No human evaluation of generated code quality is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "FEA-Bench is a fixed benchmark — models are pre-trained and not tuned on the benchmark data. The evaluation uses the benchmark as a held-out test set."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 4 shows resolved ratios across repository categories (Testing, Internet, AI, etc.). Section 6.5 and Figure 5 provide breakdown by number of added functions. The lite version is also separately reported."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "The paper discusses quantitative failure patterns (format issues in Section 6.2, complexity in Section 6.5) but does not show qualitative examples of specific failures — no examples of what models generated incorrectly vs the expected output."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports several negative findings: increasing context from 27K to 40K tokens does not improve performance and lowers it for GPT-4o (Table 3), patch output format performs significantly worse than natural format (Table 4), and the main finding itself — LLMs resolve at most ~10% of tasks — is a negative result."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims 'LLMs perform significantly worse in the FEA-Bench' and 'the best-performing LLM, DeepSeek-R1, successfully resolves only about 10%' are directly supported by Table 2 (9.92% resolved ratio)."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal-like claims (e.g., 'the format of code edits is a critical factor limiting performance') are supported by controlled comparisons varying one factor at a time (same model, different formats in Table 4; same model, different context lengths in Table 3)."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The title scopes to 'repository-level code generation for feature implementation.' The Limitations section explicitly states 'our benchmark includes only Python repositories' and acknowledges 'certain scenario limitations.'"
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "While the paper explores different experimental factors (retrieval, format, context length), it does not substantively discuss alternative explanations for the main finding of low performance — e.g., whether benchmark construction choices, test quality, or task formulation artificially inflate difficulty."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures unit test pass rate and frames claims tightly around 'resolved ratio' of task instances. Claims match the granularity of measurement — they do not extrapolate from test-passing to broader software quality claims."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix B.2 provides exact API versions: 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13', 'o1-2024-12-17', 'o1-mini-2024-09-12'. Open-source models specify sizes (e.g., 'Qwen2.5-Coder 14B/32B')."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Figure 6 provides the complete prompt structure with actual text for all sections (request, definitions, code, instructions). Figure 7 provides the full output instruction text for both Natural and Patch formats."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix B.2 reports: 'temperature and top-p settings are fixed at 0.2 and 0.95, respectively.' Max output tokens are stated for each model family (4096, 8192, 100K, 64K)."
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The paper uses the Agentless framework (Section 6.3) but only cites it without describing its internal pipeline (localization, repair steps, iterative refinement). A reader cannot understand the scaffold from this paper alone."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Sections 3.2-3.3 and Appendix A.2 describe the full pipeline: repository selection from Top PyPI, PR filtering criteria, new component extraction, intent-based filtering with GPT-4o, and unit test verification. Table 7 provides instance counts at each stage."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section is present between Section 7 (Conclusion) and the Ethics Statement, with substantive discussion across multiple paragraphs."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section discusses specific threats: Python-only scope, scarcity of high-quality new-feature PRs, early-stage repo development not captured, single-round generation bias, and resource constraints limiting DeepSeek experiments."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper explicitly states Python-only scope, acknowledges 'certain scenario limitations' from the PR-based construction, and notes that 'high-quality and usable pull requests for new feature development are relatively scarce.'"
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The benchmark data is not yet released. The paper promises future release but no data is currently available for independent verification."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Sections 3.2-3.3 describe data collection in detail: sourcing from Top PyPI repositories, fast validation of 20 PRs per repo, PR crawling, new component extraction via AST parsing, intent-based GPT-4o classification, and unit test verification."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Repository selection criteria are clearly described: Top PyPI packages with licenses and >1,000 PRs, plus the 18 SWE-bench repositories. The filtering process from ~8,000 packages to ~600 candidates to 119 repositories is documented."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Table 7 documents the full pipeline with exact counts at each stage for all 83 repositories: merged PRs → tasks with tests → candidates after filtering → final instances → lite instances. Appendix A.2 describes each step."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Acknowledgments state: 'This work was supported by National Science and Technology Major Project (No. 2022ZD0116308) and National Natural Science Foundation of China (62036001).'"
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Peking University and Microsoft Research Asia. The paper header marks these with symbols (♡ and ♠)."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Funding is from Chinese government agencies (NSFC, national project) which have no direct financial stake in the benchmark results or the relative performance of any evaluated model."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interest statement is present. Authors from Microsoft Research Asia could have interests related to AI code generation products, but no declaration is made."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper does not state training data cutoff dates for any of the evaluated models, despite evaluating them on a benchmark constructed from public GitHub PRs."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether the GitHub PR data used for FEA-Bench appears in the training data of the evaluated models. Given these are public PRs from popular repositories, overlap is likely."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "The paper does not discuss whether the benchmark data was available online before the models' training cutoffs. Popular repositories like scikit-learn, django, and sympy are highly likely to appear in training corpora."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. The paper evaluates LLMs on an automated benchmark."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. The study uses publicly available GitHub data and evaluates LLMs."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants involved."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants involved."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants involved."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants involved."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants involved."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No API costs, per-example costs, or total inference costs are reported. The paper mentions hardware (A100 workstation) and token limits but not actual cost figures."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper mentions using '8-GPU NVIDIA A100 workstation' but does not report total GPU hours, API spend, or total computational budget for the experiments."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Results are from single-run generation. The Limitations section acknowledges: 'experimental results are based on a single round generation, akin to Pass@1.' No seed sensitivity analysis."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper explicitly states 'LLMs perform a single generation for each task instance' (Appendix B.2) and acknowledges this as a limitation."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Temperature 0.2 and top-p 0.95 are used without justification for these choices. No hyperparameter search or sensitivity analysis is reported."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "The paper reports results across all configurations tested (Oracle/BM25, Detailed/Brief, Natural/Patch) rather than cherry-picking a single best configuration. Table 2 shows all combinations."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors construct FEA-Bench and design the evaluation, but do not discuss potential bias in benchmark construction (e.g., whether their filtering choices systematically disadvantage certain model architectures)."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Models ranging from 13B to 671B parameters are compared without normalization for compute. The paper notes 'models with larger parameter sizes demonstrate better results' but does not control for compute budget."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Section 3.4 provides substantial construct validity discussion: FEA-Bench's characteristics vs SWE-bench, the nature of feature implementation (new components + complementary changes), and statistics showing the benchmark captures what it claims (67.8% new component lines)."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "In Table 2, all models use the same prompt/retrieval framework. In Table 5, the Agentless framework is compared against direct retrieval for the same models. Model comparisons are not confounded by scaffold differences."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper does not discuss whether the GitHub PRs in FEA-Bench predate the training data of the evaluated models. Given popular repos like scikit-learn and django, temporal overlap is highly likely."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper provides new component signatures as hints but does not discuss whether this constitutes feature leakage or how it affects fair comparison. The 'Detailed' setting additionally provides non-Python file changes that could leak implementation details."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether training data of evaluated models contains code from the same repositories used in FEA-Bench. Models trained on GitHub data likely saw code from django, scikit-learn, sympy, etc."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No concrete leakage detection method (canary strings, n-gram overlap, membership inference, temporal splits) is applied."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "The best-performing LLM (DeepSeek-R1) resolves only about 10% of FEA-Bench tasks in the Oracle+Detailed setting.",
    375       "evidence": "Table 2 shows DeepSeek-R1 achieves 9.92% resolved ratio under Oracle/Detailed settings on the full FEA-Bench.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "FEA-Bench tasks require substantially more code generation than SWE-bench, with 8x more lines of new components on average.",
    380       "evidence": "Table 1 shows FEA-Bench averages 87.1 lines of added components vs 10.9 in SWE-bench, and 128.5 total edited lines vs 37.71.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Natural output format significantly outperforms Patch format for code edit generation.",
    385       "evidence": "Table 4 shows Natural format yields higher resolved ratios for every model evaluated in both formats (e.g., GPT-4: 4.71% vs 3.07%; DeepSeek-R1 reaches 9.92% with Natural format but was not tested with Patch format) and much higher patch application rates (e.g., GPT-4o: 66.38% vs 19.49%).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Increasing context length from 27K to 40K tokens does not improve and may decrease model performance.",
    390       "evidence": "Table 3 shows GPT-4 performance stays at 3.14% and GPT-4o drops from 5.28% to 4.78% when expanding from 27K to 40K BM25 retrieval, despite slight recall improvement.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Task difficulty increases with the number of new components, with resolved ratio dropping from ~19% for 1 function to ~5% for 3+ functions.",
    395       "evidence": "Section 6.5 and Figure 5 show DeepSeek-R1's resolved ratio drops from 18.96% (1 added function) to 8.24% (2 functions) to 5.47% (≥3 functions).",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Agentless framework improves performance over direct BM25 retrieval for most models.",
    400       "evidence": "Table 5 shows improvements for GPT-4o (4.0%→9.5%), o1-mini (1.0%→4.5%), and o1 (5.0%→10.0%) on the lite subset, though the improvement correlates with better patch application rates.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No statistical significance tests",
    407       "detail": "All model comparisons are based on point estimates without any statistical tests. Claims like 'significantly outperforming' are made without p-values or confidence intervals, making it impossible to assess whether observed differences are meaningful or within noise."
    408     },
    409     {
    410       "flag": "Single-run evaluation",
    411       "detail": "All results are from single-run generation (acknowledged in Limitations). With stochastic models using temperature=0.2, results could vary across runs, but no variance is reported."
    412     },
    413     {
    414       "flag": "No contamination analysis for benchmark using popular public repositories",
    415       "detail": "FEA-Bench uses PRs from highly popular repositories (scikit-learn, django, sympy) that are almost certainly in the training data of evaluated models. No contamination analysis, temporal split, or decontamination is performed."
    416     },
    417     {
    418       "flag": "Company affiliation evaluating competitor products",
    419       "detail": "Authors from Microsoft Research Asia evaluate OpenAI and DeepSeek models but notably exclude Microsoft's own models (e.g., Phi, GitHub Copilot). This asymmetry is not discussed."
    420     },
    421     {
    422       "flag": "Missing results in main experiments",
    423       "detail": "Table 2 has missing entries (× marks for CodeLlama BM25 results, missing rows for some models). The paper attributes this to 'scarcity of API resources' but this means the experimental matrix is incomplete."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    429       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    430       "year": 2024,
    431       "relevance": "Primary comparison benchmark for repository-level code tasks; FEA-Bench extends this to feature implementation rather than bug fixing."
    432     },
    433     {
    434       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    435       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    436       "year": 2024,
    437       "arxiv_id": "2407.01489",
    438       "relevance": "Agent framework evaluated on FEA-Bench; represents state-of-the-art approach for repository-level code tasks."
    439     },
    440     {
    441       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    442       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    443       "year": 2024,
    444       "arxiv_id": "2405.15793",
    445       "relevance": "Agentic approach to repository-level code tasks; demonstrates how agent scaffolding affects LLM performance on SWE-bench."
    446     },
    447     {
    448       "title": "AutoCodeRover: Autonomous Program Improvement",
    449       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    450       "year": 2024,
    451       "relevance": "Autonomous program improvement agent evaluated on SWE-bench; relevant to LLM-based repository-level code modification."
    452     },
    453     {
    454       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    455       "authors": ["Jia Li", "Ge Li", "Yunfei Zhao"],
    456       "year": 2024,
    457       "arxiv_id": "2405.19856",
    458       "relevance": "Repository-level code completion benchmark; provides context for evaluating LLMs on real-world code generation."
    459     },
    460     {
    461       "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories",
    462       "authors": ["Jia Li", "Ge Li", "Xuanming Zhang", "Yihong Dong", "Zhi Jin"],
    463       "year": 2024,
    464       "arxiv_id": "2404.00599",
    465       "relevance": "Evolving repository-level code benchmark addressing temporal contamination through continuous updates."
    466     },
    467     {
    468       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    469       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    470       "year": 2024,
    471       "arxiv_id": "2403.07974",
    472       "relevance": "Code generation benchmark designed to address contamination through temporal filtering."
    473     },
    474     {
    475       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    476       "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"],
    477       "year": 2024,
    478       "arxiv_id": "2406.15877",
    479       "relevance": "Large-scale code benchmark evaluating LLMs on complex coding scenarios with diverse function calls."
    480     },
    481     {
    482       "title": "CodePlan: Repository-Level Coding Using LLMs and Planning",
    483       "authors": ["Ramakrishna Bairi", "Atharv Sonwane", "Aditya Kanade"],
    484       "year": 2024,
    485       "relevance": "Combines LLMs with planning for repository-level coding tasks, relevant to multi-file code changes."
    486     },
    487     {
    488       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    489       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    490       "year": 2025,
    491       "arxiv_id": "2501.12948",
    492       "relevance": "Best-performing model on FEA-Bench; demonstrates impact of reasoning-enhanced LLMs on repository-level code tasks."
    493     },
    494     {
    495       "title": "Evaluating Large Language Models Trained on Code",
    496       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    497       "year": 2021,
    498       "arxiv_id": "2107.03374",
    499       "relevance": "Introduced HumanEval benchmark and Codex; foundational work for evaluating code generation capabilities of LLMs."
    500     },
    501     {
    502       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    503       "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang"],
    504       "year": 2023,
    505       "relevance": "Iterative retrieval approach for repository-level code completion; directly relevant to context retrieval strategies evaluated in FEA-Bench."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "The benchmark is directly useful for researchers evaluating code LLMs on feature implementation, though not immediately actionable for practitioners."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "Finding that LLMs struggle with complex repository-level tasks is expected; the specific quantification (~10% max) adds some informational value but is not surprising."
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No safety or security concerns are raised by this benchmark paper."
    520     },
    521     "drama_conflict": {
    522       "score": 1,
    523       "justification": "Mild drama in showing that even the best models fail 90%+ of feature implementation tasks, but no controversy or adversarial framing."
    524     },
    525     "demo_ability": {
    526       "score": 0,
    527       "justification": "Code and data are not yet publicly available — only a promise of future release at a GitHub URL."
    528     },
    529     "brand_recognition": {
    530       "score": 2,
    531       "justification": "From Microsoft Research Asia and Peking University; evaluates well-known models (GPT-4, DeepSeek-R1) and positions against the prominent SWE-bench."
    532     }
    533   }
    534 }

Impressum · Datenschutz