ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28533B)


      1 {
      2   "paper": {
      3     "title": "SWE-Effi: Re-Evaluating Software AI Agent System Effectiveness Under Resource Constraints",
      4     "authors": [
      5       "Zhiyu Fan",
      6       "Kirill Vasilevski",
      7       "Dayi Lin",
      8       "Boyuan Chen",
      9       "Yihao Chen",
     10       "Zhiqing Zhong",
     11       "Jie M. Zhang",
     12       "Pinjia He",
     13       "Ahmed E. Hassan"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv preprint",
     17     "arxiv_id": "2509.09853",
     18     "doi": "10.48550/arXiv.2509.09853"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "SWE-Effi introduces multi-dimensional effectiveness metrics (EuTB, EuITB, EuCTB, EuCB) to re-evaluate AI software engineering systems beyond resolve rate alone. The study finds that scaffold-model synergy matters more than scaffold design alone, with the same scaffold varying dramatically in effectiveness across models (e.g., SWE-Agent EuTB: 21.8% with Qwen3-32B vs 5.1% with GPT-4o-mini). Two systemic issues are identified: a 'Token Snowball' effect where naive memory accumulation causes linear context growth, and 'expensive failures' where unresolved attempts consume 4x+ more resources than successful ones. The 50-issue evaluation on SWE-bench-Verified is explicitly acknowledged as exploratory and not a definitive verdict.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper provides a URL (https://centre-for-software-excellence.github.io/SWE-Effi/) and states 'releasing our code, data, and hosting a public leaderboard' (Section 6). Code release is indicated."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper states 'Leaderboard, data, and code at [URL]' and 'We will make our subset publicly available on HuggingFace' (Section 4). The SWE-bench-Verified base data is public, and they indicate their subset and collected metrics data are released."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specifications are provided in the paper. Hardware and API provider details are mentioned, but no reproducible environment setup is given."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. They describe the experimental setup conceptually but do not provide commands or scripts to replicate the experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 1, 2, and 3 report only point estimates. No confidence intervals, error bars, or uncertainty measures are provided for any result."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper makes numerous comparative claims (e.g., 'performance drops significantly', '4x increase') without any statistical significance tests. All comparisons are raw number comparisons."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports relative effect sizes throughout, e.g., '18× the token cost' (Section 5.1), 'over 4 times more resources' (Section 5.4), 'nearly in half' (Section 5.2), with baseline context provided."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The sample of 50 issues is acknowledged as a limitation ('we limited the SWE-bench-Verified dataset down to the 50 samples from initial 500' due to cost, Section 6) but no power analysis or formal justification for 50 being sufficient is provided."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No standard deviations, variance, or spread measures are reported. Tables show means ('mean across samples') without any indication of variability across the 50 issues."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Five scaffolds are compared against each other across three LLMs, providing 15 system configurations as mutual baselines (Tables 1, 2, 3)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The authors 'selected five popular, actively maintained open-source SWE scaffolds from the top of the SWE-bench leaderboard (as of May 2025)' (Section 4). All are recent and actively maintained."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No ablation study is performed on the SWE-Effi metric framework or on any scaffold component. The study compares different systems but does not isolate the contribution of individual components."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper reports five metrics: EuTB, EuITB, EuCTB, EuCB, and Resolve Rate (Table 1), plus detailed core metrics in Table 2 (CPU time, inference time, tokens, LLM requests)."
     94       },
     95       "human_evaluation": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Human evaluation is not relevant to measuring resource efficiency of AI systems on automated benchmarks with pass/fail test suites."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are reported on a subset of SWE-bench-Verified. No system was tuned or developed using these issues — all scaffolds were run with their default configurations on the test set."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by scaffold (5 scaffolds), by model (3 models), and by resolved vs unresolved attempts (Table 3). Multiple dimensions of per-category analysis are provided."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5.4 ('Failing is Far More Expensive Than Succeeding') extensively discusses failure behavior. Section 5.3 discusses the Token Snowball effect as a failure mode. Figure 1c shows SWE-Agent/GPT-4o-mini entering unproductive loops."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports several negative results: SWE-Agent with GPT-4o-mini collapses to 5.1% EuTB (Section 5.1), Agentless-Mini with Llama-3.3-70B achieves only 4% resolve rate (Table 1), and the 'expensive failures' pattern shows systemic inefficiency."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract's claims about scaffold-model synergy, Token Snowball effect, expensive failures, and token-vs-time trade-off are all supported by corresponding observations in Sections 5.1-5.4 with data from Tables 1-3 and Figures 1-2."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section 5.2 claims 'High-Quality Reasoning Minimizes Iterations and Saves Tokens' — this is a causal claim but Qwen3-32B differs from Llama-3.3-70B in many ways beyond reasoning quality. The study is observational with respect to model properties and cannot isolate reasoning quality as the causal factor."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title ('Re-Evaluating Software AI Agent System Effectiveness') and abstract ('any AI system should be more than correct') are broader than the evidence from 5 scaffolds, 3 budget-tier models, and 50 SWE-bench issues. Section 6 hedges ('initial insights') but the framing extends well beyond the tested setting."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not systematically consider alternative explanations. For instance, the 'expensive failures' pattern could partly reflect budget caps forcing termination rather than inherent system behavior. The Qwen3-32B advantage could stem from recency or architecture rather than 'reasoning quality.' No confounds are discussed."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper precisely defines 'effectiveness' as AUC of resolve-rate vs. resource-consumption curves (Section 3.2) and is careful to note this measures efficiency on a specific benchmark subset, not general AI capability. The measurement matches the defined metric."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4 specifies 'GPT-4o-mini-2024-07-18', 'Llama-3.3-70B-Instruct' (quantized to FP8), and 'Qwen3-32B'. The GPT model includes a snapshot date; the open-source models are specific named versions."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper uses five open-source scaffolds that contain prompts, but the actual prompts used are not provided in the paper or appendix. The reader must consult each scaffold's source code to find the prompts."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper states scaffolds were configured per 'official guidelines' with 'default iteration or generation limits' and SWE-Agent had a $1 budget cap (Section 4), but temperature, top-p, max tokens, and other LLM hyperparameters are not reported."
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 2.2 and Section 4 describe each scaffold: SWE-Agent uses a ReAct loop with editing/testing tools, OpenHands adds IPython/browser/editing tools, Agentless uses hierarchical localization then patch generation, AutoCodeRover uses AST-based localization. Agentic vs procedural distinction is clearly made."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4 describes the subset selection: '50 issues randomly drawn from the well-respected SWE-bench-Verified dataset' using 'stratified sampling, preserving the original distribution of issues across different software projects.'"
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6 (Discussion and Conclusion) contains substantive discussion of limitations: 'we had to limit the scope of our evaluation down to the 15 permutations... we limited the SWE-bench-Verified dataset down to the 50 samples from initial 500.'"
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 6 discusses specific threats: the reduction from 500 issues to a 50-sample subset, cost and time constraints limiting the study to 15 of many possible configurations, and the fact that 'initial runs sometimes took upwards of two weeks to complete with several hundred dollars in API costs.'"
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 6 states 'this work is intended to provide an introduction and initial insights' and 'The findings we present are not intended to be a final, definitive verdict on these five scaffolds.' The authors explicitly aim to expand the evaluation with community help."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper indicates all data is released at the project URL. Section 3.1.1 mentions 515,041 API call logs used for the regression model. Section 6 states they are 'releasing our code, data.'"
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4 describes the data collection: scaffolds were augmented with profiling code, parallel processing was disabled to ensure accurate time measurements, fine-grained metrics were collected for CPU time, inference time, and token usage."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data source is SWE-bench-Verified, a standard public benchmark."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline is described: select scaffolds and models → configure per official guidelines → instrument with profiling → run on stratified 50-issue subset → collect core metrics → derive effectiveness scores via AUC (Sections 3-4)."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding or acknowledgments section is present in the paper. Five authors are affiliated with Huawei/Huawei Canada but no funding source is disclosed."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Huawei, Huawei Canada, CUHK Shenzhen, King's College London, Queen's University. The affiliations are prominently displayed under author names."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding is disclosed, so independence cannot be assessed. Five of nine authors are at Huawei, a company with commercial interest in AI coding tools, though the paper evaluates open-source scaffolds rather than Huawei products."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Training data cutoff dates are not stated for any of the three models (GPT-4o-mini, Llama-3.3-70B, Qwen3-32B). This is relevant because SWE-bench issues are from public GitHub repositories."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether models may have seen SWE-bench solutions or related GitHub data during training."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "SWE-bench issues are derived from public GitHub repositories. All three models were likely trained on GitHub data. This contamination risk is not discussed or addressed."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "This is the paper's core contribution. Table 2 reports average CPU time, inference time, input/output tokens, and LLM requests per system. Appendix 7.1 provides per-token dollar costs. Table 3 breaks down costs by resolved vs unresolved."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Section 6 states 'initial runs sometimes took upwards of two weeks to complete with several hundred dollars in API costs.' Section 3.1.1 notes 515,041 API call logs were collected. Per-token API costs are in Appendix 7.1."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Each scaffold-model combination appears to be run once on the 50 issues. No multiple-seed experiments or seed sensitivity analysis is reported."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper does not explicitly state how many times each experiment was run. Results appear to be from single runs with no discussion of run count."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper uses default scaffold configurations but does not report the hyperparameter details or justify why defaults are optimal. No search budget is discussed."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Section 4 explicitly states 'We configured those scaffolds based on their official guidelines and adhered to their default iteration or generation limits' — the selection of default configurations is justified as providing an 'out-of-the-box' baseline."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No formal statistical tests are performed, so the question of correction for multiple comparisons does not arise."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors do not discuss potential bias in their metric framework design, such as whether the AUC-based effectiveness scores inherently favor certain scaffold architectures (e.g., lightweight procedural approaches) over others."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "This is the paper's central contribution. Performance (resolve rate) is explicitly plotted as a function of compute budget in Figures 5-7, and effectiveness metrics (EuTB, EuCB, etc.) formalize this relationship."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper's core argument questions SWE-bench's single-metric (resolve rate) evaluation paradigm. Section 1 argues 'current benchmarks create a disconnect from the realities of practical AI deployment' and proposes multi-dimensional metrics to address this validity gap."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Observation 1 (Section 5.1) is explicitly about the scaffold-model confound: 'SWE Scaffold Performance Is Highly Model-Dependent.' The factorial design (5 scaffolds × 3 models) directly addresses this by testing all combinations."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "SWE-bench issues come from public GitHub repositories and predate the models' training. The paper does not discuss whether model training data includes these issues or their solutions."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether scaffolds or models receive information during evaluation that would not be available in real deployment scenarios."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "The 50 issues are from SWE-bench-Verified with stratified sampling by project, but no discussion of whether models' training data shares structure with these specific GitHub repositories."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention methods are employed. No canary strings, membership inference, or decontamination analysis."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "AI system effectiveness depends on scaffold-model synergy, not scaffold design alone — SWE-Agent EuTB drops from 21.8% with Qwen3-32B to 5.1% with GPT-4o-mini",
    375       "evidence": "Table 1 shows the full 15-system comparison. SWE-Agent with Qwen3-32B achieves 28% resolve rate with 35.5 API calls, vs 10% with GPT-4o-mini requiring 181 calls and 18x more tokens (Section 5.1).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Reasoning models that consume more tokens per call can resolve issues with fewer total tokens by reducing the number of iterations",
    380       "evidence": "Table 2 shows AutoCodeRover with Qwen3-32B uses 14.7 API calls and 55.5K input tokens vs Llama-3.3-70B requiring 38.3 calls and 416.1K tokens (Section 5.2).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "A 'Token Snowball' effect causes input token consumption to grow linearly with API calls due to naive memory accumulation strategies",
    385       "evidence": "Figure 1 shows the relationship between API calls and input tokens for one selected instance per scaffold. Linear growth is demonstrated (Section 5.3).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Unresolved attempts are far more expensive than resolved ones — SWE-Agent/GPT-4o-mini failures consume 4x+ more tokens and time than successes",
    390       "evidence": "Table 3 shows resolved vs unresolved resource consumption. SWE-Agent/GPT-4o-mini: 8.8M tokens and 658s for failures vs 1.8M tokens and 167s for successes. Pattern holds across most scaffold-model pairs (Section 5.4).",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Agentless leads in resolve rate (48% with Qwen3-32B) but at the cost of highest CPU time (727.9s average)",
    395       "evidence": "Tables 1 and 2 show Agentless/Qwen3-32B achieves 48% resolve rate with 83.1 API calls and 727.9s CPU time (Section 5.1).",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Tiny sample size",
    402       "detail": "Only 50 SWE-bench-Verified issues used. With 15 system configurations, some cells have very few resolved instances (e.g., Agentless-Mini/Llama achieves 4% = 2 issues). Results may be highly sensitive to the specific issues sampled."
    403     },
    404     {
    405       "flag": "Single run without variance",
    406       "detail": "Each scaffold-model configuration appears to be run once. No repeated runs, no variance reported, no error bars. With stochastic LLM outputs, results could vary substantially across runs."
    407     },
    408     {
    409       "flag": "No statistical significance tests",
    410       "detail": "All comparative claims ('drops significantly', 'outperforming') are based on raw number comparisons. No statistical tests are performed to determine whether observed differences exceed random variation."
    411     },
    412     {
    413       "flag": "Missing frontier models",
    414       "detail": "The three selected models (GPT-4o-mini, Llama-3.3-70B, Qwen3-32B) are all budget-tier. The scaffolds were designed and optimized for frontier models (GPT-4, Claude). Evaluating only on weaker models may not reflect realistic scaffold performance."
    415     },
    416     {
    417       "flag": "No contamination analysis",
    418       "detail": "SWE-bench issues come from public GitHub repositories. All three evaluated models were likely trained on GitHub data. The complete absence of contamination discussion is a significant gap for a benchmark evaluation paper."
    419     },
    420     {
    421       "flag": "Potential commercial interest",
    422       "detail": "Five of nine authors are at Huawei/Huawei Canada. While they evaluate open-source scaffolds rather than Huawei products, no funding or competing interests statement is provided."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    428       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"],
    429       "year": 2024,
    430       "relevance": "The foundational benchmark for evaluating AI systems on software engineering issue resolution, which SWE-Effi builds upon."
    431     },
    432     {
    433       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    434       "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig", "K. Lieret", "S. Yao", "K. Narasimhan", "O. Press"],
    435       "year": 2024,
    436       "relevance": "Major agentic scaffold for SWE tasks using a ReAct loop, evaluated in this paper's efficiency analysis."
    437     },
    438     {
    439       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    440       "authors": ["X. Wang", "B. Li", "Y. Song"],
    441       "year": 2025,
    442       "relevance": "Open-source agentic scaffold with extensive tooling evaluated in the SWE-Effi efficiency comparison."
    443     },
    444     {
    445       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    446       "authors": ["C. S. Xia", "Y. Deng", "S. Dunn", "L. Zhang"],
    447       "year": 2024,
    448       "arxiv_id": "2407.01489",
    449       "relevance": "Procedural (non-agentic) scaffold for SWE tasks that led in resolve rate but had highest CPU costs in the evaluation."
    450     },
    451     {
    452       "title": "AutoCodeRover: Autonomous Program Improvement",
    453       "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"],
    454       "year": 2024,
    455       "relevance": "AST-based procedural scaffold that showed consistently competitive efficiency across model pairings."
    456     },
    457     {
    458       "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
    459       "authors": ["Y. Wei", "O. Duchenne", "J. Copet"],
    460       "year": 2025,
    461       "arxiv_id": "2502.18449",
    462       "relevance": "RL-based approach to SWE tasks that uses lightweight Agentless-Mini scaffold, illustrating efficiency needs for RL training."
    463     },
    464     {
    465       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    466       "authors": ["S. Yao", "J. Zhao", "D. Yu"],
    467       "year": 2023,
    468       "relevance": "Foundational reasoning-and-acting paradigm used by agentic SWE scaffolds like SWE-Agent and OpenHands."
    469     },
    470     {
    471       "title": "SWE-Rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents",
    472       "authors": ["I. Badertdinov", "A. Golubev", "M. Nekrashevich"],
    473       "year": 2025,
    474       "arxiv_id": "2505.20411",
    475       "relevance": "Extends SWE-bench with 21,000 new tasks and decontamination, addressing benchmark limitations relevant to evaluation methodology."
    476     },
    477     {
    478       "title": "SWT-BENCH: Testing and Validating Real-World Bug-Fixes with Code Agents",
    479       "authors": ["N. Mündler", "M. Müller", "J. He", "M. Vechev"],
    480       "year": 2024,
    481       "relevance": "Evaluates LLM agents on test generation for bug reproduction, complementing SWE-bench's issue resolution focus."
    482     },
    483     {
    484       "title": "Trae Agent: An LLM-Based Agent for Software Engineering with Test-Time Scaling",
    485       "authors": ["P. Gao", "Z. Tian", "X. Meng"],
    486       "year": 2025,
    487       "arxiv_id": "2507.23370",
    488       "relevance": "SWE agent using test-time compute scaling, directly relevant to the efficiency-vs-accuracy trade-off SWE-Effi measures."
    489     }
    490   ]
    491 }

Impressum · Datenschutz