scan.json (24754B)
1 { 2 "paper": { 3 "title": "SWE-Bench+: Enhanced Coding Benchmark for LLMs", 4 "authors": ["Reem Aleithan", "Haoran Xue", "Mohammad Mahdi Mohajer", "Elijah Nnorom", "Gias Uddin", "Song Wang"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2410.06992" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": true, 16 "justification": "The SWE-bench+ dataset is released on Zenodo (https://zenodo.org/records/13879453), referenced in the 'Artifacts' paragraph of Section 3." 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The SWE-bench+ dataset of 548 task instances is released on Zenodo for replication and extension." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "No environment specifications, requirements files, or dependency details are provided in the paper." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper describes the methodology at a high level ('we followed the same data collection methodology outlined in the SWE-Bench study') but provides no step-by-step reproduction instructions or scripts." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": false, 38 "justification": "All resolution rates are reported as point estimates (e.g., 12.47%, 3.97%, 0.55%) with no confidence intervals or error bars." 39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper claims significant drops in resolution rates (e.g., 12.47% to 3.97%) but uses no statistical tests to determine whether differences are significant." 
44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "Resolution rate drops are reported with baseline context: e.g., 'drops from 12.47% to 3.97%' and 'from 18.83% to 3.83% for AutoCodeRover', providing magnitude of change from explicit baselines." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "The 251 resolved patches and 548 SWE-bench+ instances are used without justification for whether these sample sizes are adequate for the claims made." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. Single-run numbers only." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "Four models are compared: SWE-Agent+GPT-4, SWE-RAG+GPT-4, SWE-RAG+GPT-3.5, and AutoCodeRover+GPT-4o. Original SWE-bench leaderboard numbers serve as baselines." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper acknowledges that 'other top approaches (e.g., Honeycomb, Amazon Q Developer Agent, and Factory Code Droid) were either closed-sourced commercial tools or not verified.' The evaluated models were at the top during the study period but are not the most competitive approaches available." 71 }, 72 "ablation_study": { 73 "applies": false, 74 "answer": false, 75 "justification": "This is a benchmark quality analysis, not a system with components to ablate." 76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper reports resolution rate, effectiveness-aware cost per issue fixed, average cost per instance, and average time per instance (Table 4)." 
81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": true, 85 "justification": "Three authors independently performed manual patch validation comparing gold patches to model-generated patches. Disagreements were resolved through discussion (Section 2)." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "SWE-bench+ is constructed from issues created after model training cutoffs (post October 2023), serving as a temporally held-out test set." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "Table 1 breaks down the 251 patches into 6 patterns. Table 2 breaks down patterns for Lite and Verified. Table 3 breaks down per model on SWE-bench+. Figure 7 shows per-project distribution." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 2.1.1 discusses four failure patterns with specific examples (Figures 3-6): solution leak, incorrect fixes, different files changed, and incomplete fixes." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "The main finding is negative: resolution rates drop dramatically when problematic instances are filtered. This is inherently a negative-results paper about SWE-bench quality." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Abstract claims of 32.67% solution leakage, 31.08% weak tests, and the drop from 12.47% to 3.97% are all supported by Table 1 and Figure 1b. The 0.55% SWE-bench+ rate is supported by Section 4." 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper makes causal claims (solution leakage causes inflated resolution rates) supported by a reasonable design: manually classifying patches and showing the rate drops after filtering. 
The causal mechanism (models copy leaked solutions) is demonstrated with concrete examples." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": false, 122 "justification": "The title frames this as evaluating 'LLMs' generally, but only 4 model configurations (GPT-3.5, GPT-4, GPT-4o with specific scaffolds) are tested. Claims about 'LLMs' are not bounded to the tested models." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper does not discuss alternative explanations for the performance drops. For example, SWE-bench+ uses different repositories/time periods which could introduce difficulty confounds beyond just removing leakage." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper directly addresses the gap between what SWE-bench measures (test pass rate) and what it claims (issue resolution capability). The core contribution is showing this proxy is unreliable due to weak tests and leaked solutions." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Specific model versions are provided: 'GPT-3.5 (turbo-16k-0613)', 'GPT-4 (1106)', 'GPT-4o (2024-05-13)', and 'AutoCodeRover (v20240620)'." 140 }, 141 "prompts_provided": { 142 "applies": false, 143 "answer": false, 144 "justification": "The paper evaluates existing tools (SWE-Agent, SWE-RAG, AutoCodeRover) following their published instructions. The authors did not design custom prompts." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the models used." 
150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "The paper evaluates third-party tools (SWE-Agent, AutoCodeRover) as black boxes, following their published instructions. It cannot be expected to describe their internal scaffolding." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 3 describes the data collection pipeline: same 12 projects minus Django, temporal filter (post 2023-11-01), attribute filtering, execution filter, manual screening for solution leakage. Steps with counts are provided (548 final instances)." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": false, 166 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions 'weak test cases continue to pose challenges' but this is a finding, not a limitation of the study itself." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "No specific threats to validity are discussed. The paper does not address potential biases in the manual patch classification or limitations of using only 4 model configurations." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper does not explicitly state what its results do NOT show. No discussion of scope limitations regarding the models tested, the Python-only focus, or the generalizability of findings to other benchmarks." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": true, 183 "justification": "The SWE-bench+ dataset is released on Zenodo (https://zenodo.org/records/13879453), enabling independent verification." 
184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 3 describes the collection: same 12 projects as SWE-bench (minus Django), issues from 2023-11-01 to 2024-08-22, same filtering methodology as original SWE-bench, manual screening for solution leakage." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants. Data source is GitHub issues from a standard set of repositories." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 4 describes the 4-step evaluation pipeline: Step 1 store patches, Step 2 evaluate with SWE-bench scripts, Step 3 filter resolved instances, Step 4 manual patch validation. The collection pipeline in Section 3 also includes filtering stages." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding source or acknowledgments section is present in the paper." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "All authors are from York University's Lassonde School of Engineering, clearly stated in the header." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding information is disclosed, so independence cannot be assessed." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement is present in the paper." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": true, 227 "justification": "Training cutoffs are explicitly stated: GPT-3.5 September 2021, GPT-4 April 2023, GPT-4o October 2023 (Section 3)." 
228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": true, 232 "justification": "The paper's core contribution addresses this: '94% of the instances in SWE-bench and their pull requests were created prior to the training cut-off dates of the LLMs' (Section 1). SWE-bench+ uses temporal splits to address this." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": true, 237 "justification": "The paper directly addresses contamination by constructing SWE-bench+ with issues from after October 2023, ensuring temporal separation from all model training data." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants. This is a benchmark quality analysis study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Table 4 reports average cost per instance and effectiveness-aware cost per issue for all four models (e.g., SWE-Agent+GPT-4: $0.24/instance, $655/fix)." 
282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Table 4 reports average time per instance. Total compute is derivable: SWE-Agent+GPT-4 ~37 hours, AutoCodeRover ~41 hours for all 548 instances." 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No mention of multiple runs or seed sensitivity. All results appear to be single-run." 294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The number of experimental runs is not stated. Results appear to be from a single execution of each model." 299 }, 300 "hyperparameter_search_budget": { 301 "applies": false, 302 "answer": false, 303 "justification": "The paper evaluates existing tools with their default configurations; no hyperparameter search is performed." 304 }, 305 "best_config_selection_justified": { 306 "applies": false, 307 "answer": false, 308 "justification": "No configuration selection is performed. Models are run with their published default settings." 309 }, 310 "multiple_comparison_correction": { 311 "applies": false, 312 "answer": false, 313 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": false, 318 "justification": "The authors build SWE-bench+ and then evaluate models on it. They do not discuss potential bias from being both benchmark creators and evaluators." 319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": true, 323 "justification": "Section 5 and Table 4 explicitly compare cost vs performance across models, noting trade-offs (e.g., SWE-Agent most expensive but similar performance to cheaper RAG+GPT-4)." 
324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": true, 328 "justification": "The entire paper is about benchmark construct validity — whether SWE-bench actually measures what it claims (issue resolution capability vs. test pass rate with leaked solutions)." 329 }, 330 "scaffold_confound_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "The paper compares SWE-Agent, SWE-RAG, and AutoCodeRover — different scaffolds paired with different models — without addressing the scaffolding confound. Performance differences are attributed to model+scaffold bundles without disentangling them." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "This is the paper's primary contribution. They show 94% of SWE-bench issues predate model training cutoffs and construct SWE-bench+ with post-cutoff issues." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": true, 345 "justification": "The paper identifies and addresses solution leakage (32.67% of resolved instances had solutions in issue descriptions/comments), which is a form of feature leakage." 346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether issues within the same repository or by the same contributors might be non-independent or share structural patterns." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": true, 355 "justification": "Manual inspection by three authors comparing gold patches to issue descriptions/comments to detect solution leakage. Temporal filtering used to prevent data leakage." 
356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "32.67% of SWE-Agent+GPT-4 successful patches involve solution leakage where solutions were directly provided in issue reports or comments.", 362 "evidence": "Table 1: 82 of 251 patches classified as solution leak through manual review by three authors (Section 2.1.1).", 363 "supported": "strong" 364 }, 365 { 366 "claim": "31.08% of passed patches are suspicious due to weak test cases that fail to catch incorrect, incomplete, or wrong-file patches.", 367 "evidence": "Table 1: 32 incorrect fixes + 9 different files + 37 incomplete fixes = 78 of 251 patches (Section 2.1.1).", 368 "supported": "strong" 369 }, 370 { 371 "claim": "After filtering problematic instances, SWE-Agent+GPT-4 resolution rate drops from 12.47% to 3.97%.", 372 "evidence": "Figure 1b and Section 2.2. Only 91 of 251 patches classified as correct fixes (different from gold: 76, more comprehensive: 15).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "SWE-bench Lite and Verified also suffer from solution leakage (33.33% and 33.04% respectively).", 377 "evidence": "Table 2, Section 2.3. Manual review of all issue reports in both variants.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "On SWE-bench+, SWE-Agent+GPT-4 resolution rate drops to 0.55%.", 382 "evidence": "Section 4, Table 3: only 3 correct fixes out of 548 instances.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "AutoCodeRover+GPT-4o achieves the highest resolution rate on SWE-bench+ at 3.83% but drops from 18.83% on original leaderboard.", 387 "evidence": "Section 4, Table 3: 21 correct patches out of 548 instances.", 388 "supported": "moderate" 389 } 390 ], 391 "methodology_tags": ["benchmark-eval", "observational"], 392 "key_findings": "SWE-bench has critical quality problems: 32.67% of successfully resolved issues contain leaked solutions in issue descriptions, and 31.08% pass despite incorrect/incomplete patches due to weak test cases. 
After filtering these problems, SWE-Agent+GPT-4's resolution rate drops from 12.47% to 3.97%. On the new SWE-bench+ dataset (post-training-cutoff issues with no solution leakage), resolution rates drop further to 0.55% for SWE-Agent+GPT-4, raising serious questions about current LLM coding benchmark validity.", 393 "red_flags": [ 394 { 395 "flag": "No limitations section", 396 "detail": "The paper has no dedicated limitations or threats-to-validity section despite making strong claims about benchmark quality. Potential biases in manual classification (three annotators, no inter-rater reliability reported beyond 'disagreements resolved through discussion') are not discussed." 397 }, 398 { 399 "flag": "Confounded comparison", 400 "detail": "SWE-bench+ differs from SWE-bench in multiple ways simultaneously: different time period, different repositories (Django excluded), different issue difficulty distribution. The performance drop cannot be attributed solely to removing leakage/weak tests vs. inherently harder recent issues." 401 }, 402 { 403 "flag": "No statistical testing", 404 "detail": "Claims of 'significant' drops in resolution rates are made without any statistical significance tests. With small absolute numbers (e.g., 3 correct fixes out of 548), sampling variance could be substantial." 405 }, 406 { 407 "flag": "Single-run results", 408 "detail": "All model evaluations appear to be single runs with no assessment of variance. LLM outputs are stochastic and results could differ across runs." 409 }, 410 { 411 "flag": "Inter-rater reliability not quantified", 412 "detail": "Three authors classified 251 patches into patterns but no inter-rater agreement metric (Cohen's kappa, Fleiss' kappa) is reported. Only 'disagreements were resolved through discussion.'" 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 418 "authors": ["Carlos E. 
Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 419 "year": 2024, 420 "arxiv_id": "2310.06770", 421 "relevance": "The original SWE-bench benchmark that this paper critiques and extends." 422 }, 423 { 424 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 425 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 426 "year": 2024, 427 "arxiv_id": "2405.15793", 428 "relevance": "Primary evaluated tool; top SWE-bench performer at time of study." 429 }, 430 { 431 "title": "AutoCodeRover: Autonomous program improvement", 432 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 433 "year": 2024, 434 "arxiv_id": "2404.05427", 435 "relevance": "Evaluated as one of the top open-source SWE-bench models." 436 }, 437 { 438 "title": "Agentless: Demystifying LLM-based software engineering agents", 439 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 440 "year": 2024, 441 "arxiv_id": "2407.01489", 442 "relevance": "Alternative approach to LLM-based issue resolution evaluated on SWE-bench." 443 }, 444 { 445 "title": "Evaluating large language models trained on code", 446 "authors": ["Mark Chen"], 447 "year": 2021, 448 "arxiv_id": "2107.03374", 449 "relevance": "Introduced HumanEval benchmark for code generation; foundational code evaluation work." 450 }, 451 { 452 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 453 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 454 "year": 2024, 455 "relevance": "EvalPlus framework showing overestimation of LLM performance due to insufficient test cases — directly relevant to benchmark quality." 
456 }, 457 { 458 "title": "CODER: Issue resolving with multi-agent and task graphs", 459 "authors": ["Dong Chen"], 460 "year": 2024, 461 "arxiv_id": "2406.01304", 462 "relevance": "Multi-agent approach to SWE-bench issue resolution." 463 }, 464 { 465 "title": "Diversity empowers intelligence: Integrating expertise of software engineering agents", 466 "authors": ["Kexun Zhang"], 467 "year": 2024, 468 "arxiv_id": "2408.07060", 469 "relevance": "Multi-agent framework for code generation showing impact of data diversity on LLM performance." 470 }, 471 { 472 "title": "SWE-bench-java: A GitHub issue resolving benchmark for Java", 473 "authors": ["Daoguang Zan"], 474 "year": 2024, 475 "arxiv_id": "2408.14354", 476 "relevance": "Extension of SWE-bench to Java; addresses language diversity in benchmark evaluation." 477 }, 478 { 479 "title": "MAGIS: LLM-based multi-agent framework for GitHub issue resolution", 480 "authors": ["Wei Tao"], 481 "year": 2024, 482 "arxiv_id": "2403.17927", 483 "relevance": "Multi-agent approach evaluated on SWE-bench." 484 } 485 ] 486 }