scan.json (26135B)
1 { 2 "paper": { 3 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 4 "authors": [ 5 "Carlos E. Jimenez", 6 "John Yang", 7 "Alexander Wettig", 8 "Shunyu Yao", 9 "Kexin Pei", 10 "Ofir Press", 11 "Karthik Narasimhan" 12 ], 13 "year": 2023, 14 "venue": "ICLR 2024", 15 "arxiv_id": "2310.06770" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "SWE-bench introduces a benchmark of 2,294 real GitHub issues across 12 Python repositories for evaluating LMs on software engineering tasks. The best model (Claude 2) resolves only 1.96% of issues with BM25 retrieval, and 4.8% with oracle retrieval. Performance degrades with longer contexts, and models generate simpler, shorter edits than gold solutions. Fine-tuned SWE-Llama models are competitive with proprietary models in oracle settings but degrade significantly with BM25 retrieval due to context distribution shift.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper states 'Data, code, and leaderboard at swebench.com' and the reproducibility statement mentions uploading source code and planning open-source release." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper releases the 2,294 SWE-bench task instances, the 19,000-instance training set (SWE-bench-train), and SWE-Llama model weights. 'We include the full set of 2294 SWE-bench task instances' (Reproducibility Statement)." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper describes executable contexts created per release version with Python versions, dependencies, and installation commands (Appendix A.3). Conda environments are specified per repository version." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": true, 41 "justification": "The reproducibility statement (Section 9) describes organized source code with inline documentation, and separate directories for dataset collection, evaluation, inference, and training. The evaluation pipeline is described in detail in Appendix A.4." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "Results are reported as point estimates (e.g., '1.96% of issues') with no confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "Model comparisons are made by comparing raw percentages without any statistical significance tests." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Performance is reported as percentage resolved with baseline context (e.g., 'Claude 2 from 4.8% to 5.9%' with oracle-collapsed), providing enough context to judge magnitude." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The benchmark size of 2,294 instances is not explicitly justified. No power analysis or justification for why this number is sufficient. GPT-4 was evaluated on a 25% random subset due to budget constraints without justifying adequacy." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Single-run results only. The paper uses greedy decoding (Appendix D.2) and reports single-run numbers with no variance across runs." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Multiple models compared (ChatGPT-3.5, GPT-4, Claude 2, SWE-Llama 7b/13b) across multiple retrieval settings (BM25, oracle, oracle-collapsed)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Claude 2, GPT-4, and ChatGPT-3.5 were state-of-the-art models at submission time (2023). CodeLlama was the best open model for long contexts." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper ablates retrieval method (BM25 vs oracle vs oracle-collapsed), context length, and output format (patch vs whole file generation). Table 6 shows oracle-collapsed results." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Reports both % Resolved and % Apply (patch application rate). Appendix C.5 provides detailed F2P/P2P analysis with six outcome categories." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Section 5.1 and Appendix F provide qualitative analysis of 11 generations from SWE-Llama and Claude 2 with in-depth manual inspection of patches." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Training data for SWE-Llama uses 37 repositories disjoint from the 12 evaluation repositories. A development set of 225 instances from 6 separate repositories is also provided (Appendix A.6)." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Figure 4 and Table 19 show per-repository breakdowns. Table 7/21 show temporal breakdowns. Table 22/23 provide detailed outcome categorization." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Extensive failure analysis in Section 5.1 and Appendix F with 11 qualitative case studies. Discussion of models writing primitive code, greedy problem-solving, and struggling with multi-file changes." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that generating whole files performs worse than patches (2.2% vs 4.8% for Claude 2), SWE-Llama degrades with BM25 retrieval, and longer contexts hurt performance." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims are well-supported: 2,294 instances from 12 repos (Table 10), Claude 2 at 1.96% (Table 5), and claims about difficulty are backed by results." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper is cautious with causal language. Claims like 'performance drops considerably' with longer context are supported by controlled comparisons (same model, varying context). Ablation of retrieval method and output format use controlled single-variable manipulation." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title asks 'Can Language Models Resolve Real-World GitHub Issues?' but the benchmark is Python-only from 12 repositories. The paper acknowledges this in Section 7 ('SWE-bench task instances are all in Python') but the framing is broader than the evidence." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper discusses multiple factors: context length effects, retrieval quality, distribution shift for fine-tuned models, image content in issues (Section 5), and the role of code style vs. functional correctness in qualitative analyses." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper's metric (% of issues resolved via test-passing) is clearly defined and the paper discusses its limitations: 'relying solely on this method is insufficient to guarantee reliable performance' and notes that LM solutions are 'less comprehensive, efficient, or readable compared to human-written solutions' (Section 7)." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific model versions stated: 'gpt-3.5-turbo-16k-0613', 'gpt-4-32k-0613', 'Claude 2', and 'CodeLlama-Python 7b/13b' (Section 4.3)." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full prompt template provided in Appendix D.3 with complete text including instructions, issue formatting, and example patch format." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Greedy decoding stated for all models (Appendix D.2). SWE-Llama training uses LoRA r=16, α=16, dropout=0.05, learning rate 6e-4, batch size 32, max 4 epochs (Appendix B.1)." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. Models are given input and generate a single patch output with no tools, feedback loops, or multi-step interaction." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "The three-stage pipeline is described in detail (Section 2.1, Appendix A.1-A.3): repo selection, attribute filtering, execution-based filtering, with counts at each stage (Table 10: 93,139 PRs → 11,407 candidates → 2,294 final)." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 7 'Discussion' contains 'Limitations and future directions' paragraph discussing Python-only scope, baseline simplicity, and evaluation limitations." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific limitations discussed: Python-only (Section 7), reliance on test-passing as sole evaluation metric, popular-repository bias, models' inability to process images in issues (Section 5), GPT-4 evaluated on 25% subset due to budget (Table 5)." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 7 explicitly states: benchmark is Python-only, experiments establish baselines of 'simplest and most straight-forward approaches', and test-based evaluation is 'insufficient to guarantee reliable performance.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "Full task instances released as JSON, repository mirrors created for reproducibility, and ground truth test results cached. 'We save all finalized task instances to a single .json file that is open sourced' (Appendix A.3)." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Detailed three-stage collection pipeline in Section 2.1 and Appendix A.1-A.3: source repository selection from top 5,000 PyPI packages, PR scraping via GitHub API, attribute and execution-based filtering." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data sourced from public GitHub repositories using a standard public benchmark construction methodology." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Table 10 shows exact counts at each pipeline stage for all 12 repositories: 93,139 total PRs → 11,407 post-conversion → 2,294 post-validation. Appendix A.2-A.3 detail each transformation step." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Section 10 Acknowledgements: 'We acknowledge support from the National Science Foundation under Grant No. 2239363 and an Oracle Collaborative Research award.'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All authors list Princeton University and Princeton Language and Intelligence, except Kexin Pei at University of Chicago. No conflict with evaluated products." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "NSF and Oracle Collaborative Research award have no direct financial interest in the performance of Claude 2, GPT-4, or other evaluated models." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is included in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No explicit training data cutoff dates stated for Claude 2, GPT-4, or ChatGPT-3.5. The paper does not specify when these models' training data ends." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": true, 242 "justification": "Table 7 analyzes performance on instances before vs. after 2023, finding 'little difference in performance' and concluding models are 'unlikely to cheat to address issues simply by generating a more recent version of the repository.' SWE-Llama training uses disjoint repositories." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": true, 247 "justification": "The paper addresses contamination: temporal analysis in Table 7/21, disjoint repo splits for SWE-Llama, and the benchmark's 'continually updatable' nature allows evaluation on post-training instances. The paper argues temporal analysis shows models don't benefit from memorization." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. Ethics statement (Section 8) confirms 'Our contributions do not involve any human subject participation.'" 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No API costs or per-instance inference costs reported. GPT-4 was limited to a 25% subset 'due to budget constraints' but the actual cost is not quantified." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": true, 296 "justification": "SWE-Llama 7b training: 20 hours on 4 NVIDIA A100s. SWE-Llama 13b: 47 hours on 8 NVIDIA A100s (Appendix B.1). However, inference compute for API models is not stated." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "Greedy decoding used (single deterministic run). No seed sensitivity analysis performed." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": true, 308 "justification": "The paper explicitly states greedy decoding with a single generation per instance (Appendix D.2: 'we only generate a single patch file per instance')." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "SWE-Llama training uses a specific LoRA configuration but no search budget is reported. Prompt experiments are mentioned ('slightly more or fewer lines of instructions or examples seemed to not affect overall performance substantially') but not quantified." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "SWE-Llama checkpoint selected based on validation loss on 100 held-out instances (Appendix B.1). For BM25, best context length is selected transparently (Table 2 shows all three)." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors introduce the benchmark and evaluate models on it. While they don't implement baselines of others' systems (they use API models directly), they don't discuss potential biases in benchmark design favoring certain model types." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Models have vastly different context windows and compute costs. GPT-4 was evaluated on only 25% of instances. These compute differences are not systematically compared to performance." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": true, 338 "justification": "Section 2.3 extensively discusses what SWE-bench measures vs. prior benchmarks. Section 7 acknowledges that test-passing alone is 'insufficient to guarantee reliable performance.' Appendix C.7 explores software engineering metrics as complementary evaluation." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is used. All models receive the same retrieval-based input format without agentic components." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": true, 350 "justification": "Table 7 and Table 21 analyze performance by temporal partition (before/after 2023, and by year). The paper discusses that models may have seen older codebases in training and finds 'little difference in performance before or after this date.'" 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": true, 355 "justification": "The oracle retrieval setting is explicitly acknowledged as unrealistic. The paper discusses how issue text may contain hints, and the hints_text field separates PR comments created before the initial commit to avoid solution leakage (Appendix A.2)." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether task instances from the same repository are independent. Multiple instances from the same codebase may share structural similarities. Django contributes 850/2,294 instances (37%), creating strong non-independence." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": true, 365 "justification": "Temporal analysis (Table 7) serves as an empirical leakage detection method. SWE-Llama uses disjoint repository splits. The continually-updatable design allows post-training-cutoff evaluation." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Claude 2 resolves only 1.96% of SWE-bench issues with BM25 retrieval, making it the best-performing model in this setting.", 372 "evidence": "Table 5 shows Claude 2 at 1.96% resolved with BM25, compared to ChatGPT-3.5 (0.17%), GPT-4-turbo (1.31%), and SWE-Llama 13b (0.70%).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Performance degrades as total context length increases.", 377 "evidence": "Figure 5 shows Claude 2 performance drops with longer inputs. Table 2 shows performance decreasing from 1.96% (13k) to 1.22% (50k) for Claude 2.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Oracle retrieval improves performance: Claude 2 achieves 4.80% with oracle files.", 382 "evidence": "Table 18 shows Claude 2 at 4.80% with oracle retrieval vs. 1.96% with BM25 (Table 5).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "SWE-Llama models are competitive with Claude 2 in oracle settings but degrade significantly with BM25 retrieval.", 387 "evidence": "Table 18 shows SWE-Llama 13b at 3.97% vs Claude 2 at 4.80% (oracle). Table 5 shows SWE-Llama 13b at 0.70% vs Claude 2 at 1.96% (BM25). Section 5 attributes this to context distribution shift.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Model performance does not correlate with issue resolution date, suggesting models are not 'cheating' by memorizing solutions.", 392 "evidence": "Table 7 shows similar performance before and after 2023 for most models, with extended analysis in Table 21 across 6 temporal partitions.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Models generate shorter, simpler edits than gold patches.", 397 "evidence": "Table 8 shows model patches average 30.1 total lines vs 74.5 for gold patches, edit fewer files (1.0 vs 1.7), and fewer functions.", 398 "supported": "strong" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "GPT-4 evaluated on 25% subset only", 404 "detail": "Due to budget constraints, GPT-4 was evaluated on only 574 of 2,294 instances in BM25 27k and oracle settings. This may affect reliability of GPT-4 comparisons, though Table 20 suggests the subset is representative." 405 }, 406 { 407 "flag": "No error bars or variance measures", 408 "detail": "All results are single-run with greedy decoding. While greedy decoding is deterministic, no uncertainty quantification is provided for the aggregate metrics over the dataset." 409 }, 410 { 411 "flag": "Heavy repository imbalance", 412 "detail": "Django contributes 850/2,294 (37%) of all task instances, potentially biasing aggregate results toward Django-style issues. Performance varies significantly across repos (Figure 4)." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "Evaluating large language models trained on code", 418 "authors": ["Mark Chen", "Jerry Tworek"], 419 "year": 2021, 420 "relevance": "HumanEval benchmark — the dominant code generation benchmark that SWE-bench is designed to move beyond." 421 }, 422 { 423 "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models", 424 "authors": ["Aarohi Srivastava"], 425 "year": 2023, 426 "relevance": "BIG-bench — large-scale LM evaluation framework that SWE-bench positions against as a more focused benchmark." 427 }, 428 { 429 "title": "Code llama: Open foundation models for code", 430 "authors": ["Baptiste Rozière"], 431 "year": 2023, 432 "relevance": "Base model for SWE-Llama fine-tuning; key open-source code model." 433 }, 434 { 435 "title": "AgentBench: Evaluating LLMs as agents", 436 "authors": ["Xiao Liu"], 437 "year": 2023, 438 "relevance": "Agent evaluation benchmark that SWE-bench relates to for agent-based code editing." 439 }, 440 { 441 "title": "An analysis of the automatic bug fixing performance of chatgpt", 442 "authors": ["Dominik Sobania"], 443 "year": 2023, 444 "relevance": "LLM-based automated program repair evaluation, directly related to SWE-bench's task." 445 }, 446 { 447 "title": "Conversational automated program repair", 448 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 449 "year": 2023, 450 "relevance": "Conversational approach to LLM-based program repair — related methodology to iterative SWE-bench solving." 451 }, 452 { 453 "title": "Lost in the middle: How language models use long contexts", 454 "authors": ["Nelson F. Liu"], 455 "year": 2023, 456 "arxiv_id": "2307.03172", 457 "relevance": "Context utilization study that corroborates SWE-bench finding that models struggle with long-context code localization." 458 }, 459 { 460 "title": "Holistic evaluation of language models", 461 "authors": ["Percy Liang"], 462 "year": 2022, 463 "relevance": "HELM benchmark framework — multi-task LM evaluation that SWE-bench positions as alternative to potpourri-style evaluation." 464 }, 465 { 466 "title": "AI safety subproblems for software engineering researchers", 467 "authors": ["David Gros"], 468 "year": 2023, 469 "relevance": "Discusses safety implications of AI-driven software engineering, referenced in SWE-bench societal impact discussion." 470 }, 471 { 472 "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation", 473 "authors": ["Federico Cassano"], 474 "year": 2022, 475 "relevance": "Multi-language code generation benchmark that extends HumanEval; SWE-bench positions against as more realistic." 476 }, 477 { 478 "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation", 479 "authors": ["Xueying Du"], 480 "year": 2023, 481 "relevance": "Class-level code generation benchmark representing more complex coding tasks than HumanEval." 482 }, 483 { 484 "title": "WebArena: A realistic web environment for building autonomous agents", 485 "authors": ["Shuyan Zhou"], 486 "year": 2023, 487 "relevance": "Realistic agent evaluation environment; parallel to SWE-bench's approach of grounding evaluation in real-world tasks." 488 } 489 ] 490 }