scan.json (26208B)
1 { 2 "paper": { 3 "title": "SWE-EVO: Benchmarking Coding Agents in Long-Horizon Software Evolution Scenarios", 4 "authors": ["Minh Vu Thai Pham", "Tue Le", "Dung Nguyen Manh", "Huy Nhat Phan", "Nghi D. Q. Bui"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.18470" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "SWE-EVO is a benchmark of 48 long-horizon software evolution tasks constructed from release notes of 7 open-source Python projects. Even the best model (GPT-5) resolves only 21% of SWE-EVO tasks versus 65% on SWE-Bench Verified, revealing a large capability gap for multi-file, multi-step coding tasks. The paper proposes Fix Rate as a soft metric capturing partial progress and provides trajectory-level failure analysis showing stronger models fail primarily on instruction following while weaker models fail on tool use and syntax errors.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "GitHub repository linked: https://github.com/SWE-EVO/SWE-EVO (Section 1, under title)." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The benchmark dataset (48 tasks) is released via the GitHub repository. The paper states it is constructed from public open-source repositories and is designed to be 'plug-and-play' for existing SWE agents." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper inherits execution environments from SWE-bench and SWE-gym (Section 3.1), and Table 6 shows each instance includes a Docker image field for the execution environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided in the paper itself. 
The GitHub link is given but no README or reproduction section is described in the text." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Tables 2, 3, and 4 report only point estimates (e.g., '18.75% Resolved') with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims about model performance (e.g., GPT-5 vs. others) but uses no statistical significance tests. Rankings are based solely on comparing raw percentages." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports performance differences with baseline context, e.g., 'GPT-5 achieves only 21% on SWE-EVO versus 65% on SWE-Bench Verified' (Section 1, Tables 2-3), providing magnitude of the gap." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The benchmark has only 48 instances. The Limitations section acknowledges 'The 48-instance scale, while ensuring quality, limits statistical power for fine-grained comparisons' but does not justify why 48 was the resulting size beyond the filtering pipeline." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs. Each model-scaffold combination appears to be a single run." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "SWE-Bench Verified scores are included as comparison points for all models (Tables 2-3). Multiple models serve as baselines against each other." 
68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include GPT-5, o3, DeepSeek-R1, Kimi-K2, Qwen3-Coder — all 2025-era state-of-the-art models." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper evaluates two input settings (release-note only vs. release-note + PR/issue context) as a form of ablation on input information (Tables 2-3), and analyzes performance vs. difficulty (PR count) in Section 4.3.2." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Three metrics are used: Resolved Rate, Patch Apply Rate, and Fix Rate (Section 3.2, Tables 2-4)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of agent outputs is conducted. Evaluation is entirely automated via test suites. The failure mode analysis uses LLM-as-a-judge (Section 4.3.1), not human judges." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is a benchmark paper — the entire dataset is the test set. There is no train/test split relevant here." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by agent framework (OpenHands vs. SWE-agent), by input setting (Tables 2-3), and by difficulty level (Figure 7). Failure modes are broken down per model family (Figure 6)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 4.3 provides detailed trajectory-level failure mode analysis with a taxonomy of 7 failure categories (Table 5, Figure 6), showing how different models fail differently." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The core finding is a negative result: even the best model solves only 21% of tasks. 
The paper also shows that providing PR/issue context yields only 'modest gains' (Section 4.2)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims GPT-5 achieves 21% on SWE-EVO vs. 65% on SWE-Bench Verified, which matches Table 2 (20.83% with SWE-agent). The claim about multi-step modifications spanning 21 files matches Table 1." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper claims PR count is a 'reliable proxy for instance difficulty' (Section 4.3.2) based on correlation, but this is presented as near-causal ('instances that aggregate many such changes are expected to require deeper reasoning'). The correlation could be confounded by repository-specific effects." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The benchmark covers only 7 Python repositories with 48 instances, yet the paper frames findings broadly as about 'autonomous software evolution' and 'long-horizon agent capabilities' (Section 5). The Limitations section acknowledges Python-only coverage but the title and abstract do not bound the claims." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not discuss alternative explanations for the performance gap. For example, it does not consider whether the gap is due to context window limitations, task formulation differences (release notes vs. issues), or repository-specific effects rather than fundamental capability limitations." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures test-suite pass rates and frames this as evaluating 'software evolution capability.' 
It does not discuss whether passing tests on 48 tasks is an adequate proxy for the broader claim of autonomous software evolution capability." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model versions are listed in Section 4.1: 'GPT-5-pro-10-06, GPT-5-08-07, GPT-5-mini-08-07, GPT-5-nano-08-07', 'O3-2025-04-16', 'GPT-4.1-2025-04-14', 'GPT-4o-2024-11-20', 'Deepseek-R1-0528'." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper describes problem statements at a high level and provides two examples (Appendix), but does not provide the actual system prompts or instructions given to the agent frameworks." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 4.1 states: 'For OpenHands, we set a maximum of 100 iterations per instance. For SWE-Agent, we limit the number of LLM calls to 100.' Footnote 1 specifies 'medium reasoning effort' for GPT-5 and o3 models." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper evaluates existing third-party agent frameworks (OpenHands, SWE-agent) as black boxes. It cannot be expected to describe their internal scaffolding." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3.1 documents the three-stage construction pipeline: repository selection from SWE-bench/SWE-gym, candidate selection by version tags, and execution-based filtering with specific criteria at each stage." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "A Limitations paragraph is present in Section 5 (Conclusion): 'SWE-EVO currently covers only Python projects and relies on release notes as specifications...'" 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The Limitations section identifies specific threats: Python-only coverage, reliance on release notes (missing security patches and performance optimizations), and the 48-instance scale limiting statistical power." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The Limitations section states specific things not covered: 'may not capture all evolution scenarios (e.g., security patches, performance optimizations without explicit notes)' and calls for expanding language coverage and instance count." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The benchmark is released via GitHub (https://github.com/SWE-EVO/SWE-EVO) with task instances including patches, test patches, and problem statements (Table 6)." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.1 describes three stages of data collection: sampling from SWE-bench and SWE-gym, filtering to version-tag commits, and execution-based validation." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data source is standard public repositories via SWE-bench." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 3.1 documents the full pipeline from seed pool (SWE-bench + SWE-gym) through version-tag filtering to execution-based filtering, resulting in 48 instances. 
Each stage's criteria are described." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding acknowledgment or disclosure section found in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: FPT Software AI Center and University of Melbourne." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Authors are from FPT Software AI Center, a software company. No funding disclosure is present, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement found in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state training data cutoff dates for any of the 11 models evaluated, despite this being critical for assessing whether models may have seen the benchmark repositories." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether the open-source repositories (scikit-learn, pydantic, etc.) and their release notes are in model training data. These are extremely popular repos that are very likely in training corpora." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "The benchmark is constructed from well-known open-source projects (scikit-learn, pydantic, dask, etc.) whose release notes and patches are publicly available on GitHub. The paper does not discuss contamination risk despite this being a central concern." 
240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No API costs, token counts, or wall-clock times reported for running the 11 models × 2 scaffolds × 48 tasks." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total computational budget stated. The paper evaluates 11 models across 2 frameworks on 48 tasks (1,056+ runs) but does not quantify the compute required." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multiple-seed runs reported. Each model-scaffold pair appears to produce a single set of results." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The paper does not state how many times each experiment was run. Results appear to be single-run." 
301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget reported. The paper uses default agent configurations without discussing whether alternatives were tried." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper uses 'medium' reasoning effort for GPT-5 and o3 models (footnote 1) but does not justify this choice or compare against other settings." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors evaluate their own benchmark but do not acknowledge potential author-evaluation bias in benchmark design or task selection." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "Different models have vastly different compute costs (GPT-5-pro vs GPT-5-nano, proprietary vs open-source) but performance is never compared at matched compute budgets." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "Section 3.3 discusses construct validity by comparing SWE-EVO features against SWE-Bench (Figure 4), analyzing PR count as difficulty proxy (Section 4.3.2), and showing intuitive scaling behavior that validates the benchmark measures meaningful capability." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "The paper evaluates each model across both OpenHands and SWE-agent scaffolds (Tables 2-4), allowing readers to see scaffold effects. Results are reported separately per scaffold." 
336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "The benchmark uses release notes from mature open-source projects (scikit-learn, pydantic, dask). The paper does not discuss whether these releases predate model training, despite this being critical." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the problem statement (release notes with PR/issue links) provides information that leaks the solution." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The benchmark is constructed from SWE-bench and SWE-gym seed data. No discussion of whether SWE-EVO instances overlap with SWE-bench or SWE-gym training examples used by models post-trained on such data (e.g., SWE-RL, DeepSWE)." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No concrete leakage detection or prevention method is used despite the benchmark being derived from public repositories and release notes." 
358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "GPT-5 achieves only 21% on SWE-EVO versus 65% on SWE-Bench Verified, revealing a significant capability gap for long-horizon software evolution tasks.", 364 "evidence": "Table 2 shows GPT-5-08-07 resolves 20.83% (SWE-agent) and 18.75% (OpenHands) on SWE-EVO, compared to 65% on SWE-Bench Verified.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Model performance on SWE-EVO exhibits intuitive scaling: larger models consistently outperform smaller variants.", 369 "evidence": "Tables 2-3 show GPT-5 > GPT-5-mini > GPT-5-nano consistently across both scaffolds (e.g., 20.83% > 10.42% > 4.17% with SWE-agent).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Providing PR/issue context yields measurable but modest gains over release-note-only input.", 374 "evidence": "Comparing Tables 2 and 3: GPT-5 goes from 16.67% to 20.83% (SWE-agent) with context. Most models show small improvements.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Stronger models fail primarily on instruction following while weaker models struggle with tool use and syntax errors.", 379 "evidence": "Figure 6 and Section 4.3.1 show GPT-5 failures are >60% instruction following, while GPT-5-nano has substantial tool-use and syntax errors.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "PR count is a reliable proxy for instance difficulty.", 384 "evidence": "Figure 7a shows monotonic trend: easier instances (frequently solved) average 1.67 PRs, hardest average 14.84 PRs (Section 4.3.2).", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Tiny benchmark size", 391 "detail": "Only 48 instances across 7 repositories. With such small N, individual instance variation dominates. A single instance changing outcome could shift resolved rates by ~2 percentage points. The paper acknowledges this in limitations." 
392 }, 393 { 394 "flag": "No contamination analysis", 395 "detail": "The benchmark uses well-known open-source repos (scikit-learn, pydantic, dask) whose release notes, commits, and patches are publicly available on GitHub and likely in LLM training data. Several evaluated models (DeepSWE, SWE-RL) were explicitly trained on SWE-bench data, from which SWE-EVO is derived. No contamination analysis is performed." 396 }, 397 { 398 "flag": "LLM-as-a-judge for failure analysis", 399 "detail": "Section 4.3.1 uses GPT-5-mini as an automated judge for failure mode classification. No validation of judge accuracy against human labels is provided." 400 }, 401 { 402 "flag": "No variance or repeat runs", 403 "detail": "All results appear to be single-run. With stochastic LLM outputs and only 48 instances, single-run results are unreliable. A different random seed could change which instances are solved." 404 }, 405 { 406 "flag": "Company affiliation without disclosure", 407 "detail": "Authors are from FPT Software AI Center. No funding, competing interests, or financial interests declarations are provided." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 413 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 414 "year": 2023, 415 "arxiv_id": "2310.06770", 416 "relevance": "Foundational benchmark for evaluating coding agents on real-world GitHub issues; SWE-EVO extends this to multi-step evolution tasks." 417 }, 418 { 419 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 420 "authors": ["Xingyao Wang"], 421 "year": 2024, 422 "arxiv_id": "2407.16741", 423 "relevance": "One of two agent frameworks used for evaluation in SWE-EVO experiments." 424 }, 425 { 426 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 427 "authors": ["John Yang", "Carlos E. 
Jimenez", "Alexander Wettig"], 428 "year": 2024, 429 "arxiv_id": "2405.15793", 430 "relevance": "Second agent framework used for evaluation; emphasizes importance of agent-computer interfaces." 431 }, 432 { 433 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 434 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 435 "year": 2024, 436 "arxiv_id": "2407.01489", 437 "relevance": "Simpler localization-repair approach that challenges complex agent scaffolding, relevant to agent architecture evaluation." 438 }, 439 { 440 "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution", 441 "authors": ["Yuxiang Wei"], 442 "year": 2025, 443 "arxiv_id": "2502.18449", 444 "relevance": "RL-based post-training for software engineering; raises contamination concerns when evaluated on SWE-EVO." 445 }, 446 { 447 "title": "AgileCoder: Dynamic Collaborative Agents for Software Development Based on Agile Methodology", 448 "authors": ["Minh Huynh Nguyen", "Thang Phan Chau"], 449 "year": 2025, 450 "relevance": "Multi-agent framework incorporating agile methodology for software development." 451 }, 452 { 453 "title": "AutoCodeRover: Autonomous Program Improvement", 454 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 455 "year": 2024, 456 "relevance": "Autonomous coding agent combining LLMs with AST-based code search for program repair." 457 }, 458 { 459 "title": "ChatDev: Communicative Agents for Software Development", 460 "authors": ["Chen Qian"], 461 "year": 2023, 462 "relevance": "Multi-agent framework for software development through role-based communication." 463 }, 464 { 465 "title": "SWE-bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?", 466 "authors": ["Scale AI"], 467 "year": 2025, 468 "arxiv_id": "2509.16941", 469 "relevance": "Concurrent benchmark targeting enterprise-level complexity; directly related to long-horizon evaluation." 
470 }, 471 { 472 "title": "Evaluating Large Language Models Trained on Code", 473 "authors": ["Mark Chen"], 474 "year": 2021, 475 "arxiv_id": "2107.03374", 476 "relevance": "HumanEval benchmark paper; foundational code generation evaluation that SWE-EVO aims to extend beyond." 477 }, 478 { 479 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 480 "authors": ["Terry Yue Zhuo"], 481 "year": 2024, 482 "arxiv_id": "2406.15877", 483 "relevance": "Code generation benchmark with complex instructions, relevant to benchmark design methodology." 484 }, 485 { 486 "title": "SWE-Synth: Synthesizing Verifiable Bug-Fix Data to Enable Large Language Models in Resolving Real-World Bugs", 487 "authors": ["Minh VT Pham", "Huy N Phan"], 488 "year": 2025, 489 "arxiv_id": "2504.14757", 490 "relevance": "Synthetic bug-fix data generation for training coding agents; by same research group as SWE-EVO." 491 } 492 ] 493 }