scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24754B)
      1 {
      2   "paper": {
      3     "title": "SWE-Bench+: Enhanced Coding Benchmark for LLMs",
      4     "authors": ["Reem Aleithan", "Haoran Xue", "Mohammad Mahdi Mohajer", "Elijah Nnorom", "Gias Uddin", "Song Wang"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2410.06992"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "The SWE-bench+ dataset is released on Zenodo (https://zenodo.org/records/13879453), referenced in the 'Artifacts' paragraph of Section 3."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The SWE-bench+ dataset of 548 task instances is released on Zenodo for replication and extension."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No environment specifications, requirements files, or dependency details are provided in the paper."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper describes the methodology at a high level ('we followed the same data collection methodology outlined in the SWE-Bench study') but provides no step-by-step reproduction instructions or scripts."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "All resolution rates are reported as point estimates (e.g., 12.47%, 3.97%, 0.55%) with no confidence intervals or error bars."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper claims significant drops in resolution rates (e.g., 12.47% to 3.97%) but uses no statistical tests to determine whether differences are significant."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Resolution rate drops are reported with baseline context: e.g., 'drops from 12.47% to 3.97%' and 'from 18.83% to 3.83% for AutoCodeRover', providing magnitude of change from explicit baselines."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The 251 resolved patches and 548 SWE-bench+ instances are used without justification for whether these sample sizes are adequate for the claims made."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. Single-run numbers only."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Four models are compared: SWE-Agent+GPT-4, SWE-RAG+GPT-4, SWE-RAG+GPT-3.5, and AutoCodeRover+GPT-4o. Original SWE-bench leaderboard numbers serve as baselines."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper acknowledges that 'other top approaches (e.g., Honeycomb, Amazon Q Developer Agent, and Factory Code Droid) were either closed-sourced commercial tools or not verified.' The evaluated models were at the top during the study period but are not the most competitive approaches available."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is a benchmark quality analysis, not a system with components to ablate."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper reports resolution rate, effectiveness-aware cost per issue fixed, average cost per instance, and average time per instance (Table 4)."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Three authors independently performed manual patch validation comparing gold patches to model-generated patches. Disagreements were resolved through discussion (Section 2)."
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "SWE-bench+ is constructed from issues created after model training cutoffs (post October 2023), serving as a temporally held-out test set."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Table 1 breaks down the 251 patches into 6 patterns. Table 2 breaks down patterns for Lite and Verified. Table 3 breaks down per model on SWE-bench+. Figure 7 shows per-project distribution."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 2.1.1 discusses four failure patterns with specific examples (Figures 3-6): solution leak, incorrect fixes, different files changed, and incomplete fixes."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The main finding is negative: resolution rates drop dramatically when problematic instances are filtered. This is inherently a negative-results paper about SWE-bench quality."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims of 32.67% solution leakage, 31.08% weak tests, and the drop from 12.47% to 3.97% are all supported by Table 1 and Figure 1b. The 0.55% SWE-bench+ rate is supported by Section 4."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper makes causal claims (solution leakage causes inflated resolution rates) supported by a reasonable design: manually classifying patches and showing the rate drops after filtering. The causal mechanism (models copy leaked solutions) is demonstrated with concrete examples."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title frames this as evaluating 'LLMs' generally, but only 4 model configurations (GPT-3.5, GPT-4, GPT-4o with specific scaffolds) are tested. Claims about 'LLMs' are not bounded to the tested models."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper does not discuss alternative explanations for the performance drops. For example, SWE-bench+ uses different repositories/time periods which could introduce difficulty confounds beyond just removing leakage."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper directly addresses the gap between what SWE-bench measures (test pass rate) and what it claims (issue resolution capability). The core contribution is showing this proxy is unreliable due to weak tests and leaked solutions."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Specific model versions are provided: 'GPT-3.5 (turbo-16k-0613)', 'GPT-4 (1106)', 'GPT-4o (2024-05-13)', and 'AutoCodeRover (v20240620)'."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "The paper evaluates existing tools (SWE-Agent, SWE-RAG, AutoCodeRover) following their published instructions. The authors did not design custom prompts."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the models used."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "The paper evaluates third-party tools (SWE-Agent, AutoCodeRover) as black boxes, following their published instructions. It cannot be expected to describe their internal scaffolding."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3 describes the data collection pipeline: same 12 projects minus Django, temporal filter (post 2023-11-01), attribute filtering, execution filter, manual screening for solution leakage. Steps with counts are provided (548 final instances)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions 'weak test cases continue to pose challenges' but this is a finding, not a limitation of the study itself."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "No specific threats to validity are discussed. The paper does not address potential biases in the manual patch classification or limitations of using only 4 model configurations."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what its results do NOT show. No discussion of scope limitations regarding the models tested, the Python-only focus, or the generalizability of findings to other benchmarks."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The SWE-bench+ dataset is released on Zenodo (https://zenodo.org/records/13879453), enabling independent verification."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3 describes the collection: same 12 projects as SWE-bench (minus Django), issues from 2023-11-01 to 2024-08-22, same filtering methodology as original SWE-bench, manual screening for solution leakage."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants. Data source is GitHub issues from a standard set of repositories."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4 describes the 4-step evaluation pipeline: Step 1 store patches, Step 2 evaluate with SWE-bench scripts, Step 3 filter resolved instances, Step 4 manual patch validation. The collection pipeline in Section 3 also includes filtering stages."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding source or acknowledgments section is present in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All authors are from York University's Lassonde School of Engineering, clearly stated in the header."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding information is disclosed, so independence cannot be assessed."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Training cutoffs are explicitly stated: GPT-3.5 September 2021, GPT-4 April 2023, GPT-4o October 2023 (Section 3)."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "The paper's core contribution addresses this: '94% of the instances in SWE-bench and their pull requests were created prior to the training cut-off dates of the LLMs' (Section 1). SWE-bench+ uses temporal splits to address this."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The paper directly addresses contamination by constructing SWE-bench+ with issues from after October 2023, ensuring temporal separation from all model training data."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants. This is a benchmark quality analysis study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Table 4 reports average cost per instance and effectiveness-aware cost per issue for all four models (e.g., SWE-Agent+GPT-4: $0.24/instance, $655/fix)."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Table 4 reports average time per instance. Total compute is derivable: SWE-Agent+GPT-4 ~37 hours, AutoCodeRover ~41 hours for all 548 instances."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No mention of multiple runs or seed sensitivity. All results appear to be single-run."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The number of experimental runs is not stated. Results appear to be from a single execution of each model."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": false,
    302         "answer": false,
    303         "justification": "The paper evaluates existing tools with their default configurations; no hyperparameter search is performed."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "No configuration selection is performed. Models are run with their published default settings."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": false,
    312         "answer": false,
    313         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The authors build SWE-bench+ and then evaluate models on it. They do not discuss potential bias from being both benchmark creators and evaluators."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Section 5 and Table 4 explicitly compare cost vs performance across models, noting trade-offs (e.g., SWE-Agent most expensive but similar performance to cheaper RAG+GPT-4)."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": true,
    328         "justification": "The entire paper is about benchmark construct validity — whether SWE-bench actually measures what it claims (issue resolution capability vs. test pass rate with leaked solutions)."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The paper compares SWE-Agent, SWE-RAG, and AutoCodeRover — different scaffolds paired with different models — without addressing the scaffolding confound. Performance differences are attributed to model+scaffold bundles without disentangling them."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "This is the paper's primary contribution. They show 94% of SWE-bench issues predate model training cutoffs and construct SWE-bench+ with post-cutoff issues."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "The paper identifies and addresses solution leakage (32.67% of resolved instances had solutions in issue descriptions/comments), which is a form of feature leakage."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether issues within the same repository or by the same contributors might be non-independent or share structural patterns."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Manual inspection by three authors comparing gold patches to issue descriptions/comments to detect solution leakage. Temporal filtering used to prevent data leakage."
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "32.67% of SWE-Agent+GPT-4 successful patches involve solution leakage where solutions were directly provided in issue reports or comments.",
    362       "evidence": "Table 1: 82 of 251 patches classified as solution leak through manual review by three authors (Section 2.1.1).",
    363       "supported": "strong"
    364     },
    365     {
    366       "claim": "31.08% of passed patches are suspicious due to weak test cases that fail to catch incorrect, incomplete, or wrong-file patches.",
    367       "evidence": "Table 1: 32 incorrect fixes + 9 different files + 37 incomplete fixes = 78 of 251 patches (Section 2.1.1).",
    368       "supported": "strong"
    369     },
    370     {
    371       "claim": "After filtering problematic instances, SWE-Agent+GPT-4 resolution rate drops from 12.47% to 3.97%.",
    372       "evidence": "Figure 1b and Section 2.2. Only 91 of 251 patches classified as correct fixes (different from gold: 76, more comprehensive: 15).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "SWE-bench Lite and Verified also suffer from solution leakage (33.33% and 33.04% respectively).",
    377       "evidence": "Table 2, Section 2.3. Manual review of all issue reports in both variants.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "On SWE-bench+, SWE-Agent+GPT-4 resolution rate drops to 0.55%.",
    382       "evidence": "Section 4, Table 3: only 3 correct fixes out of 548 instances.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "AutoCodeRover+GPT-4o achieves the highest resolution rate on SWE-bench+ at 3.83% but drops from 18.83% on original leaderboard.",
    387       "evidence": "Section 4, Table 3: 21 correct patches out of 548 instances.",
    388       "supported": "moderate"
    389     }
    390   ],
    391   "methodology_tags": ["benchmark-eval", "observational"],
    392   "key_findings": "SWE-bench has critical quality problems: 32.67% of successfully resolved issues contain leaked solutions in issue descriptions, and 31.08% pass despite incorrect/incomplete patches due to weak test cases. After filtering these problems, SWE-Agent+GPT-4's resolution rate drops from 12.47% to 3.97%. On the new SWE-bench+ dataset (post-training-cutoff issues with no solution leakage), resolution rates drop further to 0.55% for SWE-Agent+GPT-4, raising serious questions about current LLM coding benchmark validity.",
    393   "red_flags": [
    394     {
    395       "flag": "No limitations section",
    396       "detail": "The paper has no dedicated limitations or threats-to-validity section despite making strong claims about benchmark quality. Potential biases in manual classification (three annotators, no inter-rater reliability reported beyond 'disagreements resolved through discussion') are not discussed."
    397     },
    398     {
    399       "flag": "Confounded comparison",
    400       "detail": "SWE-bench+ differs from SWE-bench in multiple ways simultaneously: different time period, different repositories (Django excluded), different issue difficulty distribution. The performance drop cannot be attributed solely to removing leakage/weak tests vs. inherently harder recent issues."
    401     },
    402     {
    403       "flag": "No statistical testing",
    404       "detail": "Claims of 'significant' drops in resolution rates are made without any statistical significance tests. With small absolute numbers (e.g., 3 correct fixes out of 548), sampling variance could be substantial."
    405     },
    406     {
    407       "flag": "Single-run results",
    408       "detail": "All model evaluations appear to be single runs with no assessment of variance. LLM outputs are stochastic and results could differ across runs."
    409     },
    410     {
    411       "flag": "Inter-rater reliability not quantified",
    412       "detail": "Three authors classified 251 patches into patterns but no inter-rater agreement metric (Cohen's kappa, Fleiss' kappa) is reported. The paper says only that 'disagreements were resolved through discussion.'"
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    418       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    419       "year": 2024,
    420       "arxiv_id": "2310.06770",
    421       "relevance": "The original SWE-bench benchmark that this paper critiques and extends."
    422     },
    423     {
    424       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    425       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    426       "year": 2024,
    427       "arxiv_id": "2405.15793",
    428       "relevance": "Primary evaluated tool; top SWE-bench performer at time of study."
    429     },
    430     {
    431       "title": "AutoCodeRover: Autonomous program improvement",
    432       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    433       "year": 2024,
    434       "arxiv_id": "2404.05427",
    435       "relevance": "Evaluated as one of the top open-source SWE-bench models."
    436     },
    437     {
    438       "title": "Agentless: Demystifying LLM-based software engineering agents",
    439       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    440       "year": 2024,
    441       "arxiv_id": "2407.01489",
    442       "relevance": "Alternative approach to LLM-based issue resolution evaluated on SWE-bench."
    443     },
    444     {
    445       "title": "Evaluating large language models trained on code",
    446       "authors": ["Mark Chen"],
    447       "year": 2021,
    448       "arxiv_id": "2107.03374",
    449       "relevance": "Introduced HumanEval benchmark for code generation; foundational code evaluation work."
    450     },
    451     {
    452       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    453       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    454       "year": 2024,
    455       "relevance": "EvalPlus framework showing overestimation of LLM performance due to insufficient test cases — directly relevant to benchmark quality."
    456     },
    457     {
    458       "title": "CODER: Issue resolving with multi-agent and task graphs",
    459       "authors": ["Dong Chen"],
    460       "year": 2024,
    461       "arxiv_id": "2406.01304",
    462       "relevance": "Multi-agent approach to SWE-bench issue resolution."
    463     },
    464     {
    465       "title": "Diversity empowers intelligence: Integrating expertise of software engineering agents",
    466       "authors": ["Kexun Zhang"],
    467       "year": 2024,
    468       "arxiv_id": "2408.07060",
    469       "relevance": "Multi-agent framework that integrates the expertise of diverse software engineering agents."
    470     },
    471     {
    472       "title": "SWE-bench-java: A GitHub issue resolving benchmark for Java",
    473       "authors": ["Daoguang Zan"],
    474       "year": 2024,
    475       "arxiv_id": "2408.14354",
    476       "relevance": "Extension of SWE-bench to Java; addresses language diversity in benchmark evaluation."
    477     },
    478     {
    479       "title": "MAGIS: LLM-based multi-agent framework for GitHub issue resolution",
    480       "authors": ["Wei Tao"],
    481       "year": 2024,
    482       "arxiv_id": "2403.17927",
    483       "relevance": "Multi-agent approach evaluated on SWE-bench."
    484     }
    485   ]
    486 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs