scan.json (26581B)
1 { 2 "paper": { 3 "title": "The SWE-Bench Illusion: When State-of-the-Art LLMs Remember Instead of Reason", 4 "authors": ["Shanchao Liang", "Spandan Garg", "Roshanak Zilouchian Moghaddam"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2506.12286" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Models achieve up to 76% accuracy identifying buggy file paths from SWE-Bench Verified issue descriptions alone without repository access, but only up to 53% on outside-repo tasks, suggesting memorization rather than reasoning. Function reproduction shows up to 35% consecutive 5-gram overlap on SWE-Bench Verified vs. only 18% on external benchmarks. Prefix completion reveals 12-32% of SWE-Bench Verified instances produce verbatim matches across models, with Claude 4 Opus reaching 31.6%.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, code archive, or supplementary materials link is provided in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "SWE-Bench Verified and Full are public, but the authors' custom benchmarks (SWE-Bench C# described as 'internal benchmark', SWE-Bench Extra, Outside-Repo Tasks) are not released." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, dependency lists, or setup instructions are provided." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No reproduction instructions are included. The prompt templates are shown but no runnable pipeline is provided." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Figures 8-11 and Tables 2-3 are point estimates with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims performance differences across benchmarks (e.g., 76% vs 53%) without any statistical tests. Differences are asserted by visual comparison of bar charts." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Percentage accuracies and 5-gram overlap ratios are reported with baselines, enabling readers to compute effect sizes (e.g., 76% on Verified vs. 53% on Outside-Repo is a 23 percentage point gap)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "Sample sizes vary across benchmarks (500, 200, 217, 245, 39, 75) with no justification for why these sizes were chosen. The 39-task RefactorBench subset is particularly small with no explanation for why only 39 of 100 tasks were used." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "All results appear to be single-run with 'default sampling settings.' No variance, standard deviation, or spread measures are reported." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The cross-benchmark comparison design uses external benchmarks (RefactorBench, Outside-Repo Tasks, SWE-Bench C#) as implicit baselines against SWE-Bench Verified performance." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Ten contemporary models evaluated including Claude 4 Opus/Sonnet (May 2025), GPT-4.1, o3, o4-mini — all state-of-the-art at time of writing." 
73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "The paper presents three independent diagnostic tasks, not a single system with ablatable components." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Four metrics used: accuracy, filtered accuracy, 5-gram consecutive overlap, and instance-level verbatim match percentage (Sections 3.3, Table 3)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of model outputs. All evaluation is automated (exact match, n-gram overlap). Manual inspection of whether identified 'memorization' cases truly reflect memorization vs. legitimate reasoning could strengthen the claims." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The cross-benchmark design uses SWE-Bench Extra (post-cutoff issues) and Outside-Repo Tasks as held-out comparisons not subject to the memorization hypothesis." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results broken down by benchmark (6 sets), by model (10 models), and by filtered/unfiltered instances (Figures 8-11, Table 2)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No qualitative analysis of individual cases where models succeeded or failed. No examples of what memorized vs. reasoned responses look like." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 4.5 reports that on SWE-Bench Extra, most models produce negative Δ5 (closer to buggy than fixed code), supporting the claim that freshness removes memorization advantage." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims of 76% file-path accuracy (Figure 8, o3), 53% on outside repos (Figure 8), and 35%/18% 5-gram overlap (Figure 10) are all supported by the experimental results." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper claims performance gaps are 'driven by memorization' (abstract) and that models 'remember instead of reason' (title), but this is inferred from observational performance differences across benchmarks. Confounds like task difficulty, description length (Table 1 shows RefactorBench has 14.6 token descriptions vs. 451+ for others), and language differences are acknowledged but not controlled." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "Claims are specifically about SWE-Bench and the tested models. The title refers to 'State-of-the-Art LLMs' which matches the 10 models tested. Claims focus on SWE-Bench ecosystem rather than all coding benchmarks." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper briefly acknowledges confounds (Table 1 shows different repo sizes and description lengths) but does not substantively discuss alternative explanations such as: task difficulty differences, structural differences between bug-fix and refactoring tasks, language-specific performance gaps for C#, or whether repo familiarity (vs. memorization) explains results." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper uses file-path identification accuracy and n-gram overlap as proxies for 'memorization' but does not discuss the gap between these measurements and the conclusion. 
High file-path accuracy could reflect general architectural knowledge of popular repos rather than instance-specific memorization. High n-gram overlap could reflect common coding patterns rather than verbatim recall." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section 3.4 specifies exact model versions with snapshot dates: 'gpt-4o-2024-08', 'gpt-4o-2024-05', 'gpt-4.1-2025-04-14', 'o3-2025-04-16', 'Claude 3.5 Sonnet (June 20, 2024)', 'Claude 4.0 Sonnet...May 22, 2025', etc." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt implementations are shown in Figures 2, 6, and 7, including the complete code for prompt construction. The function reproduction prompt template is in Figure 7." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "Section 3.4 states max generation length (2048/4096 tokens) and 'default sampling settings' but does not specify temperature, top-p, or other sampling parameters. 'Default settings' is insufficient since defaults differ across providers and may change." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. Models are queried directly via API with single-turn prompts." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Sections 3.1-3.2 describe how each benchmark was constructed: SWE-Bench C# follows SWE-Bench pipeline (75 tasks, 11 repos), Outside-Repo Tasks collected from 7 named repos (245 instances), SWE-Bench Extra from recent issues (217 tasks). Section 2 describes how diagnostic tasks are derived from benchmark instances." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "No dedicated limitations or threats-to-validity section exists in the paper. The conclusion mentions the need for better benchmarks but does not discuss limitations of the study itself." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No specific threats to validity are discussed. The paper does not address whether its own methodology could produce false positives for memorization detection." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit scope boundaries stated. The paper does not clarify what its evidence does NOT show (e.g., it cannot quantify how much of SWE-Bench performance is memorization vs. reasoning)." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw data (model outputs, per-instance results) is made available for independent verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Sections 3.1-3.2 describe data collection: SWE-Bench Extra from recent issues in SWE-Bench repos post-cutoff, Outside-Repo Tasks from 7 named popular repos, SWE-Bench C# from 11 C# repos following SWE-Bench pipeline." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data sources are standard and custom benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The paper describes task construction at a high level but omits pipeline details: no filtering counts for SWE-Bench Extra (how many issues were collected vs. 
kept), no criteria for the 39-task RefactorBench subset selection, no description of quality checks on Outside-Repo Tasks." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding acknowledgment appears in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations clearly listed: Purdue University and Microsoft." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Two of three authors are Microsoft employees. Microsoft competes with OpenAI and Anthropic in AI. The paper evaluates products from these competitors for memorization/contamination, creating a potential conflict that is not disclosed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial disclosure statement in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state training data cutoff dates for any of the 10 models evaluated, despite this being central to the memorization argument." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": true, 234 "justification": "This is the paper's central topic. Section 1 discusses how GitHub repos in training corpora overlap with SWE-Bench. The cross-benchmark design is specifically intended to detect this overlap." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": true, 239 "justification": "The entire paper is about detecting and demonstrating benchmark contamination in SWE-Bench. The three diagnostic tasks are specifically designed to measure contamination effects." 
240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference costs reported despite evaluating 10 models across 6 benchmarks on 3 tasks. Section 4.3 mentions 'limited resources' as reason for not running o3 on function reproduction but gives no cost figures." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total compute budget or API costs stated." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Results are from single runs with 'default sampling settings' (Section 3.4). No seed sensitivity analysis despite using stochastic models." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "Number of runs per experiment is not stated. Results appear to be single-run." 
301 }, 302 "hyperparameter_search_budget": { 303 "applies": false, 304 "answer": false, 305 "justification": "No hyperparameter tuning was performed; default API settings were used." 306 }, 307 "best_config_selection_justified": { 308 "applies": false, 309 "answer": false, 310 "justification": "No configuration selection — default settings used throughout." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper makes dozens of implicit comparisons across 10 models × 6 benchmarks without any statistical tests, let alone correction for multiple comparisons." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": false, 319 "answer": false, 320 "justification": "The paper evaluates existing third-party models rather than proposing its own system." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": false, 324 "answer": false, 325 "justification": "The paper compares models on diagnostic tasks, not competing methods with different compute requirements." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "The entire paper is about the construct validity of SWE-Bench — whether it measures reasoning ability or memorization. This is the core contribution." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding involved. Models are queried directly via single-turn API calls." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "SWE-Bench Extra (Section 3.2.2) uses recent issues from the SWE-Bench repositories — issues 'mostly created after the original SWE-Bench dataset cutoff date' — as a temporal control." 
343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "The diagnostic tasks deliberately remove context (repo structure, function signatures) to test whether models succeed via memorization vs. reasoning. Filtered accuracy metric (Section 3.3) additionally removes instances where file paths appear in issue descriptions." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "The cross-benchmark design explicitly compares same-repo (SWE-Bench variants) vs. different-repo (Outside-Repo Tasks) performance to test whether train/test dependence explains results." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": true, 357 "justification": "Three concrete detection methods applied: cross-benchmark performance analysis, 5-gram consecutive overlap measurement, and prefix completion verbatim matching." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "State-of-the-art models achieve up to 76% accuracy in identifying buggy file paths using only issue descriptions without repository access on SWE-Bench Verified.", 364 "evidence": "Figure 8 shows OpenAI o3 achieves 76% on SWE-Bench Verified. Performance drops across benchmarks: Full-SWE-Bench (69%), SWE-Bench Extra (59%), Outside-Repo Tasks (53%).", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Performance on outside-repo tasks is substantially lower (up to 53%) than on SWE-Bench Verified, indicating repository-bias memorization.", 369 "evidence": "Figure 8 shows all 10 models perform worse on Outside-Repo Tasks than SWE-Bench Verified, with drops of 7-23 percentage points. 
However, confounds exist: task construction differs, and the paper acknowledges varying repo sizes (Table 1).", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Function reproduction shows much higher verbatim similarity on SWE-Bench Verified (up to 35% 5-gram overlap) than external benchmarks (up to 18%).", 374 "evidence": "Figure 10 shows Claude 4 Opus achieves 34.9% on SWE-Bench Verified vs. 13.9% on Outside-Repo Tasks and 18.1% on RefactorBench.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Up to 31.6% of SWE-Bench Verified instances show verbatim matches in prefix completion, indicating direct memorization.", 379 "evidence": "Table 3 shows Claude 4 Opus at 31.6% instance-level verbatim match. However, this is only tested on SWE-Bench Verified with no cross-benchmark control for this particular task.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Memorization patterns are consistent across model families and vendors, indicating systematic training data exposure.", 384 "evidence": "Section 4.1.3 shows all models follow the same performance hierarchy across benchmarks regardless of vendor (Figure 8).", 385 "supported": "strong" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Internal proprietary benchmark", 391 "detail": "SWE-Bench C# is described as 'an internal benchmark' (Section 3.1.2) and is not publicly available, making these results unverifiable by other researchers." 392 }, 393 { 394 "flag": "No statistical tests for core claims", 395 "detail": "All memorization claims rest on comparing point estimates across benchmarks (e.g., 76% vs 53%) without any significance tests. With stochastic models and varying sample sizes (39-500), some differences may not be statistically significant." 396 }, 397 { 398 "flag": "Uncontrolled confounds", 399 "detail": "RefactorBench has 14.6 token average issue descriptions vs. 
451+ for SWE-Bench (Table 1), making performance drops partly attributable to information availability rather than memorization. SWE-Bench C# involves a different programming language. These confounds weaken the memorization interpretation." 400 }, 401 { 402 "flag": "Competitor evaluation by Microsoft authors", 403 "detail": "Two of three authors are Microsoft employees. The paper demonstrates flaws in benchmarks used to evaluate competing AI products (OpenAI, Anthropic). This potential conflict of interest is not disclosed." 404 }, 405 { 406 "flag": "No limitations section", 407 "detail": "The paper lacks any discussion of its own methodological limitations, threats to validity, or what its evidence does not show." 408 }, 409 { 410 "flag": "Single-run results with stochastic models", 411 "detail": "All experiments use 'default sampling settings' with no repeated runs. LLM outputs are stochastic; results could vary across runs, especially for the smaller benchmarks." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 417 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 418 "year": 2024, 419 "relevance": "The primary benchmark under investigation for memorization/contamination effects." 420 }, 421 { 422 "title": "Evaluating Large Language Models Trained on Code", 423 "authors": ["Mark Chen"], 424 "year": 2021, 425 "arxiv_id": "2107.03374", 426 "relevance": "Introduced HumanEval and Codex; foundational code generation benchmark also subject to contamination concerns." 427 }, 428 { 429 "title": "Top Leaderboard Ranking = Top Coding Proficiency, Always? EvoEval: Evolving Coding Benchmarks via LLM", 430 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Lingming Zhang"], 431 "year": 2024, 432 "relevance": "Task mutation approach to contamination detection in coding benchmarks." 
433 }, 434 { 435 "title": "Are Large Language Models Memorizing Bug Benchmarks?", 436 "authors": ["Daniel Ramos", "Claudia Mamede", "Kush Jain", "Paulo Canelas", "Catarina Gamboa", "Claire Le Goues"], 437 "year": 2025, 438 "arxiv_id": "2411.13323", 439 "relevance": "Directly related work on memorization detection in coding benchmarks using metric-based probing." 440 }, 441 { 442 "title": "RefactorBench: Evaluating Stateful Reasoning in Language Agents Through Code", 443 "authors": ["Dhruv Gautam", "Spandan Garg", "Jinu Jang", "Neel Sundaresan", "Roshanak Zilouchian Moghaddam"], 444 "year": 2025, 445 "arxiv_id": "2503.07832", 446 "relevance": "Alternative coding benchmark used as cross-benchmark comparison for memorization detection." 447 }, 448 { 449 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 450 "authors": ["Terry Yue Zhuo"], 451 "year": 2024, 452 "relevance": "Major code generation benchmark relevant to evaluating LLM coding capabilities." 453 }, 454 { 455 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 456 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 457 "year": 2024, 458 "arxiv_id": "2405.15793", 459 "relevance": "Major SWE-Bench agent system whose performance gains may be partially attributed to memorization." 460 }, 461 { 462 "title": "Don't Make Your LLM an Evaluation Benchmark Cheater", 463 "authors": ["Kun Zhou"], 464 "year": 2023, 465 "arxiv_id": "2311.01964", 466 "relevance": "Foundational work on benchmark contamination and data leakage in LLM evaluation." 
467 }, 468 { 469 "title": "TaskEval: Assessing Difficulty of Code Generation Tasks for Large Language Models", 470 "authors": ["Florian Tambon", "Amin Nikanjam", "Cyrine Zid", "Foutse Khomh", "Giuliano Antoniol"], 471 "year": 2025, 472 "arxiv_id": "2407.21227", 473 "relevance": "Measures prompt sensitivity as contamination signal in code generation benchmarks." 474 }, 475 { 476 "title": "Memorize or Generalize? Evaluating LLM Code Generation with Evolved Questions", 477 "authors": ["Wentao Chen", "Lizhe Zhang", "Li Zhong", "Letian Peng", "Zilong Wang", "Jingbo Shang"], 478 "year": 2025, 479 "arxiv_id": "2503.02296", 480 "relevance": "AST-level mutation approach to detecting memorization vs. generalization in code generation." 481 }, 482 { 483 "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents", 484 "authors": ["Xingyao Wang"], 485 "year": 2024, 486 "arxiv_id": "2407.16741", 487 "relevance": "Open-source SWE-Bench agent platform whose performance may be affected by memorization." 488 }, 489 { 490 "title": "Training Software Engineering Agents and Verifiers with SWE-Gym", 491 "authors": ["Jiayi Pan", "Xingyao Wang", "Graham Neubig"], 492 "year": 2024, 493 "arxiv_id": "2412.21139", 494 "relevance": "Training framework for SWE agents, relevant to understanding how training on SWE-Bench-like data affects performance." 495 } 496 ] 497 }