scan.json (26942B)
1 { 2 "paper": { 3 "title": "SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?", 4 "authors": [ 5 "Xiang Deng", 6 "Jeff Da", 7 "Edwin Pan", 8 "Yannis Yiming He", 9 "Charles Ide", 10 "Kanak Garg", 11 "Niklas Lauffer", 12 "Andrew Park", 13 "Nitin Pasari", 14 "Chetan Rane", 15 "Karmini Sampath", 16 "Maya Krishnan", 17 "Srivatsa Kundurthy", 18 "Sean Hendryx", 19 "Zifan Wang", 20 "Vijay Bharadwaj", 21 "Jeff Holm", 22 "Raja Aluri", 23 "Chen Bo Calvin Zhang", 24 "Noah Jacobson", 25 "Bing Liu", 26 "Brad Kenstler" 27 ], 28 "year": 2025, 29 "venue": "arXiv", 30 "arxiv_id": "2509.16941" 31 }, 32 "scan_version": 2, 33 "active_modules": ["experimental_rigor", "data_leakage"], 34 "methodology_tags": ["benchmark-eval"], 35 "key_findings": "SWE-Bench Pro introduces 1,865 human-verified benchmark problems from 41 repositories spanning multiple languages, with tasks requiring multi-file modifications averaging 107.4 lines across 4.1 files. Top models (Claude Sonnet 4.5, GPT-5) achieve only ~43% on the public set and under 18% on the commercial set, compared to 70%+ on SWE-Bench Verified. The benchmark uses GPL-licensed and proprietary codebases to resist contamination. Failure analysis reveals that even frontier models struggle with semantic understanding and multi-file coordination.", 36 "checklist": { 37 "artifacts": { 38 "code_released": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper provides a GitHub repository URL: https://github.com/scaleapi/SWE-bench_Pro-os and a HuggingFace dataset link: https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro." 42 }, 43 "data_released": { 44 "applies": true, 45 "answer": true, 46 "justification": "The public set of 731 instances is released on HuggingFace. The held-out (858) and commercial (276) sets are kept private by design." 
47 }, 48 "environment_specified": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 3.2 and 4.3 describe Docker-based containerized environments for each task. 'All environments will be released as pre-built docker images to ensure that they are fully reproducible.'" 52 }, 53 "reproduction_instructions": { 54 "applies": true, 55 "answer": false, 56 "justification": "While code and data are released, the paper does not include step-by-step reproduction instructions for replicating the evaluation results (e.g., specific commands to run)." 57 } 58 }, 59 "statistical_methodology": { 60 "confidence_intervals_or_error_bars": { 61 "applies": true, 62 "answer": false, 63 "justification": "Tables 1, 2, and 5 report only point estimates (e.g., '43.6%') with no confidence intervals or error bars." 64 }, 65 "significance_tests": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper compares model performance (e.g., 'Claude Sonnet 4.5 achieves 43.6% vs GPT-5 at 41.8%') without any statistical significance tests." 69 }, 70 "effect_sizes_reported": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper reports absolute resolve rates and contextualizes them against SWE-Bench Verified (e.g., '23% on SWE-Bench Pro compared to over 70% on SWE-Bench Verified'), providing magnitude context." 74 }, 75 "sample_size_justified": { 76 "applies": true, 77 "answer": false, 78 "justification": "The benchmark contains 1,865 problems total (731 public), but there is no justification for why this size is sufficient or any power analysis." 79 }, 80 "variance_reported": { 81 "applies": true, 82 "answer": false, 83 "justification": "Results are reported as single-run Pass@1 scores without any variance, standard deviation, or multi-run statistics." 
84 } 85 }, 86 "evaluation_design": { 87 "baselines_included": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple models are compared including frontier (Claude Sonnet 4.5, GPT-5, Claude Opus 4.1), mid-tier (Claude Haiku 4.5, Gemini 2.5), and open-source (Qwen3 32B, GPT-OSS 120B) models." 91 }, 92 "baselines_contemporary": { 93 "applies": true, 94 "answer": true, 95 "justification": "Models evaluated include Claude Sonnet 4.5, GPT-5, and other frontier models as of September 2025." 96 }, 97 "ablation_study": { 98 "applies": true, 99 "answer": true, 100 "justification": "Table 3 presents an ablation removing human augmentations (requirements and interface), showing performance drops from 25.9% to 8.4% for GPT-5 and 22.7% to 8.2% for Opus 4.1." 101 }, 102 "multiple_metrics": { 103 "applies": true, 104 "answer": false, 105 "justification": "Only Pass@1 (resolve rate) is reported. No other metrics such as partial solve rates, code quality, or cost-efficiency are measured." 106 }, 107 "human_evaluation": { 108 "applies": true, 109 "answer": false, 110 "justification": "The benchmark construction involves human verification, but evaluation of model outputs is entirely automated via test suites. No human evaluation of generated patches is performed." 111 }, 112 "held_out_test_set": { 113 "applies": true, 114 "answer": true, 115 "justification": "The benchmark has a held-out set of 858 problems 'to test for overfitting in the future,' and the commercial set is also private. Results are reported on the public and commercial sets." 116 }, 117 "per_category_breakdown": { 118 "applies": true, 119 "answer": true, 120 "justification": "Figure 3 provides breakdowns by programming language, repository, file count, and lines of code. Table 4 provides detailed failure mode breakdowns per model." 
121 }, 122 "failure_cases_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 6.3 provides detailed trajectory failure mode analysis with categories (wrong solution, tool-use errors, syntax errors, etc.) quantified per model in Table 4." 126 }, 127 "negative_results_reported": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper reports that Agentless scaffold 'has difficulty in multi-file editing, thus, produces low evaluation scores.' The overall low performance of all models (<45%) is itself a negative result." 131 } 132 }, 133 "claims_and_evidence": { 134 "abstract_claims_supported": { 135 "applies": true, 136 "answer": true, 137 "justification": "The abstract claims performance 'below 45% (Pass@1)' which matches Table 1 (highest is 43.6%). The contamination-resistant design claim is supported by the GPL/commercial strategy described in Sections 3-4." 138 }, 139 "causal_claims_justified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The ablation in Table 3 makes a causal claim about human augmentations improving performance, supported by a controlled single-variable manipulation (with vs. without augmentations)." 143 }, 144 "generalization_bounded": { 145 "applies": true, 146 "answer": false, 147 "justification": "The title asks about 'Long-Horizon Software Engineering Tasks' broadly, but the benchmark covers only Python, JavaScript, TypeScript, and Go. Section 7.1 acknowledges limited language coverage but the title and abstract still frame results broadly." 148 }, 149 "alternative_explanations_discussed": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper does not discuss alternative explanations for why models fail (e.g., could the test suites be overly strict? could the human augmentations favor certain solution styles?). The failure mode analysis categorizes errors but doesn't consider confounds." 
153 }, 154 "proxy_outcome_distinction": { 155 "applies": true, 156 "answer": false, 157 "justification": "The paper measures pass/fail on test suites as a proxy for 'solving software engineering tasks,' but does not discuss the gap between test-suite passing and actual issue resolution quality. Section 7.1 briefly notes 'real software engineering tasks may have a variety of correct solutions' but doesn't discuss this as a proxy limitation." 158 } 159 }, 160 "setup_transparency": { 161 "model_versions_specified": { 162 "applies": true, 163 "answer": false, 164 "justification": "Models are identified by marketing names (e.g., 'Claude Sonnet 4.5', 'OpenAI GPT-5 (High)', 'Kimi K2 Instruct') without specific version IDs or snapshot dates. The paper states 'latest versions as of September 18th, 2025' but does not provide API version strings." 165 }, 166 "prompts_provided": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper says 'We use the same prompt for all models, which is the default prompt from [16]' but does not include the actual prompt text. The failure mode analysis prompt is provided in Appendix C but the evaluation prompt is not." 170 }, 171 "hyperparameters_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for the model evaluations. The only settings mentioned are a 'maximum of 50 turns' and a cost limit of $2." 175 }, 176 "scaffolding_described": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper states 'We use the SWE-Agent scaffold' and references [16] but does not describe the scaffold's internals (tool descriptions, retry logic, context management). The tools are listed in Appendix C as part of the LLM-as-judge prompt, not the evaluation scaffold." 
180 }, 181 "data_preprocessing_documented": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 4 describes the full data creation pipeline: commit pair identification, test patch extraction, human augmentation process, environment construction, and test verification with clear filtering criteria." 185 } 186 }, 187 "limitations_and_scope": { 188 "limitations_section_present": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 7.1 is titled 'Limitations' and discusses limited language coverage and dependency on test suites." 192 }, 193 "threats_to_validity_specific": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 7.1 identifies specific threats: limited language coverage ('Java, C++, and Rust are underrepresented') and that test suites may reject valid alternative solutions." 197 }, 198 "scope_boundaries_stated": { 199 "applies": true, 200 "answer": false, 201 "justification": "While limitations are mentioned, the paper does not explicitly state what the results do NOT show. For example, it does not state that the benchmark cannot assess code quality, maintainability, or real-world development workflow integration." 202 } 203 }, 204 "data_integrity": { 205 "raw_data_available": { 206 "applies": true, 207 "answer": true, 208 "justification": "The public set of 731 instances is released on HuggingFace, including problem statements, test patches, and gold patches." 209 }, 210 "data_collection_described": { 211 "applies": true, 212 "answer": true, 213 "justification": "Section 4 describes the full data collection procedure: identifying commit pairs, extracting test patches, the human augmentation process, environment construction, and verification." 
214 }, 215 "recruitment_methods_described": { 216 "applies": true, 217 "answer": false, 218 "justification": "The paper does not describe how the professional software engineers who created environments and augmented tasks were recruited, nor how the startup companies for commercial repositories were selected." 219 }, 220 "data_pipeline_documented": { 221 "applies": true, 222 "answer": true, 223 "justification": "Sections 4.1-4.3 document the pipeline from commit scraping → task description creation → environment construction → environment verification → test verification, with filtering criteria at each stage." 224 } 225 }, 226 "conflicts_of_interest": { 227 "funding_disclosed": { 228 "applies": true, 229 "answer": false, 230 "justification": "No funding source is disclosed. All authors are from Scale AI, a commercial entity, but no funding acknowledgments section exists." 231 }, 232 "affiliations_disclosed": { 233 "applies": true, 234 "answer": true, 235 "justification": "All authors are clearly listed as affiliated with Scale AI." 236 }, 237 "funder_independent_of_outcome": { 238 "applies": true, 239 "answer": false, 240 "justification": "Scale AI is a commercial company that sells AI evaluation and data services. They have a direct financial interest in benchmarks being adopted and in demonstrating limitations of AI models that their services could help address." 241 }, 242 "financial_interests_declared": { 243 "applies": true, 244 "answer": false, 245 "justification": "No competing interests statement is provided. Scale AI is a for-profit company that sells benchmark evaluation as a service, creating a potential financial interest in benchmark adoption." 246 } 247 }, 248 "contamination": { 249 "training_cutoff_stated": { 250 "applies": true, 251 "answer": false, 252 "justification": "No training data cutoff dates are stated for any of the evaluated models. 
The paper only notes 'latest versions as of September 18th, 2025.'" 253 }, 254 "train_test_overlap_discussed": { 255 "applies": true, 256 "answer": true, 257 "justification": "The paper extensively discusses contamination risk and designs the benchmark to mitigate it through GPL licenses and commercial codebases (Section 1, Section 3.1), though no empirical overlap analysis is performed." 258 }, 259 "benchmark_contamination_addressed": { 260 "applies": true, 261 "answer": true, 262 "justification": "Contamination resistance is a core contribution. The paper uses GPL-licensed repos (legal barrier to training inclusion) and proprietary commercial repos to reduce contamination risk. Section 3.1 discusses this extensively." 263 } 264 }, 265 "human_studies": { 266 "pre_registered": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this benchmark evaluation study." 270 }, 271 "irb_or_ethics_approval": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this benchmark evaluation study." 275 }, 276 "demographics_reported": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this benchmark evaluation study." 280 }, 281 "inclusion_exclusion_criteria": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this benchmark evaluation study." 285 }, 286 "randomization_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants in this benchmark evaluation study." 290 }, 291 "blinding_described": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants in this benchmark evaluation study." 295 }, 296 "attrition_reported": { 297 "applies": false, 298 "answer": false, 299 "justification": "No human participants in this benchmark evaluation study." 
300 } 301 }, 302 "cost_and_practicality": { 303 "inference_cost_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "A cost cap of $2 per instance is mentioned for the analysis runs (Table 5), but actual inference costs per model are not reported." 307 }, 308 "compute_budget_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 5 states 'Models are hosted on a single node, with 8 H100 Nvidia GPUs' for open-source models. A cost limit of $2 per instance and 50-turn limit are specified." 312 } 313 }, 314 "experimental_rigor": { 315 "seed_sensitivity_reported": { 316 "applies": true, 317 "answer": false, 318 "justification": "Results appear to be single-run Pass@1 scores with no mention of multiple seeds or seed sensitivity." 319 }, 320 "number_of_runs_stated": { 321 "applies": true, 322 "answer": false, 323 "justification": "The paper does not state the number of runs. Results appear to be single runs given they report Pass@1 without variance." 324 }, 325 "hyperparameter_search_budget": { 326 "applies": true, 327 "answer": false, 328 "justification": "No hyperparameter search is described. The paper uses default SWE-Agent settings without discussing whether alternatives were explored." 329 }, 330 "best_config_selection_justified": { 331 "applies": true, 332 "answer": false, 333 "justification": "The paper uses default SWE-Agent configuration without justifying why this is optimal or comparing alternative configurations." 334 }, 335 "multiple_comparison_correction": { 336 "applies": false, 337 "answer": false, 338 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 339 }, 340 "self_comparison_bias_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "Scale AI created the benchmark and evaluated models on it. No discussion of author-evaluation bias or potential for task design favoring certain model behaviors." 
344 }, 345 "compute_budget_vs_performance": { 346 "applies": true, 347 "answer": false, 348 "justification": "Table 5 uses a cost cap of $2, but no performance-vs-compute curves are shown. Table 1 vs Table 5 show different results under different cost constraints but this is not explicitly analyzed as a compute budget comparison." 349 }, 350 "benchmark_construct_validity": { 351 "applies": true, 352 "answer": false, 353 "justification": "The paper does not discuss whether pass/fail on test suites actually measures 'software engineering capability.' Section 7.1 notes test-suite dependency but does not analyze construct validity." 354 }, 355 "scaffold_confound_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "All models are evaluated using the same SWE-Agent scaffold (the paper mentions using 'a unified scaffold'), which is good practice. However, scaffold confounds are not systematically evaluated: beyond noting that Agentless performs poorly, the paper does not discuss how scaffold choice affects results." 359 } 360 }, 361 "data_leakage": { 362 "temporal_leakage_addressed": { 363 "applies": true, 364 "answer": true, 365 "justification": "The paper addresses temporal leakage through its contamination-resistant design: using GPL-licensed repos (legal barrier), commercial proprietary repos, and a held-out set. Section 3.1 discusses this explicitly." 366 }, 367 "feature_leakage_addressed": { 368 "applies": true, 369 "answer": false, 370 "justification": "The paper does not discuss whether the human augmentations (requirements, interface) provide information that would not be available in real-world usage, potentially inflating results." 371 }, 372 "non_independence_addressed": { 373 "applies": true, 374 "answer": false, 375 "justification": "No discussion of whether problems from the same repository share structural similarities that could inflate aggregate performance metrics." 
376 }, 377 "leakage_detection_method": { 378 "applies": true, 379 "answer": true, 380 "justification": "The benchmark uses GPL licensing as a legal contamination prevention method and proprietary commercial repos as a structural prevention method. A held-out set is reserved for future overfitting detection." 381 } 382 } 383 }, 384 "claims": [ 385 { 386 "claim": "Top models achieve below 45% Pass@1 on SWE-Bench Pro public set", 387 "evidence": "Table 1: Claude Sonnet 4.5 at 43.6%, Claude Sonnet 4 at 42.7%, GPT-5 at 41.8%", 388 "supported": "strong" 389 }, 390 { 391 "claim": "SWE-Bench Pro is significantly harder than SWE-Bench Verified, with top models scoring 23% vs 70%+", 392 "evidence": "Section 8 states '23% success rate on SWE-Bench Pro compared to over 70% on benchmarks like SWE-Bench Verified'", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Human augmentations (requirements, interface) substantially improve model performance and reduce false negatives", 397 "evidence": "Table 3: GPT-5 drops from 25.9% to 8.4%, Opus 4.1 drops from 22.7% to 8.2% without augmentations", 398 "supported": "strong" 399 }, 400 { 401 "claim": "GPL licensing and commercial repositories make the benchmark contamination-resistant", 402 "evidence": "Section 3.1 argues GPL licenses create legal barriers to training inclusion, and commercial repos are private. 
No empirical contamination analysis is provided.", 403 "supported": "moderate" 404 }, 405 { 406 "claim": "Performance degrades sharply with increasing file count, and frontier models maintain performance better than smaller models on complex tasks", 407 "evidence": "Figure 3 shows performance-vs-file-count curves with widening gaps between frontier and open-source models beyond 3 files", 408 "supported": "moderate" 409 }, 410 { 411 "claim": "Commercial set is significantly harder than public set, with best models scoring under 18%", 412 "evidence": "Table 2: Claude Opus 4.1 at 17.8% on commercial vs Table 5 showing 22.7% on public (with cost cap). Different models/settings make comparison imprecise.", 413 "supported": "weak" 414 } 415 ], 416 "red_flags": [ 417 { 418 "flag": "Company evaluating its own benchmark", 419 "detail": "Scale AI created SWE-Bench Pro and evaluates all models on it. As a company selling AI evaluation services, Scale has a financial interest in benchmarks being widely adopted. No independent evaluation or self-comparison bias acknowledgment." 420 }, 421 { 422 "flag": "No variance or multi-run statistics", 423 "detail": "All results appear to be single-run Pass@1 with no error bars, confidence intervals, or multi-seed evaluation. LLM outputs are stochastic, so single-run results may not be stable." 424 }, 425 { 426 "flag": "Inconsistent comparison across tables", 427 "detail": "Table 1 and Table 5 report different results for the same models on the same public set due to different cost/turn constraints ($2 cap vs uncapped), making it unclear which is the authoritative result. The conclusion cites 23% (Table 5) while the abstract claims below 45% (Table 1)." 428 }, 429 { 430 "flag": "LLM-as-judge for failure analysis", 431 "detail": "Failure mode analysis uses GPT-5 as judge with only 87% alignment with human categorization cited from prior work. No validation of the judge's accuracy on SWE-Bench Pro's specific failure patterns." 
432 }, 433 { 434 "flag": "GPL contamination argument is weak", 435 "detail": "The claim that GPL licensing prevents training inclusion is a legal argument, not an empirical one. GPL code is widely available on GitHub and could be scraped regardless of license terms. No empirical contamination testing was performed." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 441 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"], 442 "year": 2024, 443 "relevance": "Original SWE-bench benchmark that SWE-Bench Pro builds upon." 444 }, 445 { 446 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 447 "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig", "K. Lieret", "S. Yao", "K. Narasimhan", "O. Press"], 448 "year": 2024, 449 "relevance": "SWE-Agent scaffold used for all evaluations in this paper." 450 }, 451 { 452 "title": "Agentless: Demystifying LLM-Based Software Engineering Agents", 453 "authors": ["C. S. Xia", "Y. Deng", "S. Dunn", "L. Zhang"], 454 "year": 2024, 455 "arxiv_id": "2407.01489", 456 "relevance": "Alternative scaffold evaluated but found to perform poorly on multi-file tasks." 457 }, 458 { 459 "title": "Evaluating Large Language Models Trained on Code", 460 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 461 "year": 2021, 462 "arxiv_id": "2107.03374", 463 "relevance": "HumanEval benchmark, foundational code generation evaluation." 464 }, 465 { 466 "title": "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving", 467 "authors": ["D. Zan", "Z. Huang", "W. Liu"], 468 "year": 2025, 469 "arxiv_id": "2504.02605", 470 "relevance": "Extended SWE-bench to multiple programming languages." 471 }, 472 { 473 "title": "AutoCodeRover: Autonomous Program Improvement", 474 "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. 
Roychoudhury"], 475 "year": 2024, 476 "relevance": "SWE agent using AST-based code search, evaluated on SWE-bench." 477 }, 478 { 479 "title": "SWE-bench+: Enhanced Coding Benchmark for LLMs", 480 "authors": ["R. Aleithan"], 481 "year": 2024, 482 "arxiv_id": "2410.06992", 483 "relevance": "Enhanced version of SWE-bench addressing known limitations." 484 }, 485 { 486 "title": "SWE-bench Goes Live!", 487 "authors": ["C. Zhang"], 488 "year": 2025, 489 "arxiv_id": "2505.23419", 490 "relevance": "Live version of SWE-bench addressing contamination through temporal freshness." 491 }, 492 { 493 "title": "Agent-RLVR: Training Software Engineering Agents via Guidance and Environment Rewards", 494 "authors": ["J. Da", "C. J. Wang", "X. Deng"], 495 "year": 2025, 496 "arxiv_id": "2506.11425", 497 "relevance": "RL training approach for SWE agents using benchmark instances." 498 }, 499 { 500 "title": "A Careful Examination of Large Language Model Performance on Grade School Arithmetic", 501 "authors": ["H. Zhang", "J. Da", "D. Lee"], 502 "year": 2024, 503 "arxiv_id": "2405.00332", 504 "relevance": "Demonstrates contamination risks in LLM benchmarks." 505 }, 506 { 507 "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models", 508 "authors": ["C. Deng", "Y. Zhao", "X. Tang", "M. Gerstein", "A. Cohan"], 509 "year": 2024, 510 "relevance": "Study on data contamination in LLM benchmarks." 511 }, 512 { 513 "title": "Program Synthesis with Large Language Models", 514 "authors": ["J. Austin", "A. Odena", "M. Nye"], 515 "year": 2021, 516 "arxiv_id": "2108.07732", 517 "relevance": "MBPP benchmark for code generation evaluation." 518 } 519 ] 520 }