ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24006B)


      1 {
      2   "paper": {
      3     "title": "Dissecting the SWE-Bench Leaderboards: Profiling Submitters and Architectures of LLM- and Agent-Based Repair Systems",
      4     "authors": [
      5       "Matias Martinez",
      6       "Xavier Franch"
      7     ],
      8     "year": 2026,
      9     "venue": "arXiv (Manuscript submitted to ACM)",
     10     "arxiv_id": "2506.17208"
     11   },
     12   "scan_version": 3,
     13   "active_modules": [
     14     "survey_methodology"
     15   ],
     16   "methodology_tags": [
     17     "meta-analysis",
     18     "observational",
     19     "qualitative"
     20   ],
     21   "key_findings": "The study analyzes 178 entries (80 unique approaches) across SWE-Bench Lite and Verified leaderboards. Industry submitters dominate (58% of distinct submitters), with small companies being the largest category. Proprietary LLMs, especially Claude 3.5/4 Sonnet, consistently achieve highest performance. No single architecture (agentic vs non-agentic, single vs multi-agent) consistently outperforms others, though submissions without agents (G1) tend to show lower precision.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The appendix data is available at https://github.com/UPCBarcelonaTech/DissectingSWEBench (referenced as [5])."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The complete list of approaches and submissions is provided in the appendix [5] at a GitHub repository, and the study is based on publicly available SWE-Bench leaderboard data."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No environment specifications or dependency requirements are mentioned for reproducing the analysis."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided for the data collection and coding process."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper reports medians and maximums but no confidence intervals or error bars on its aggregate statistics."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper uses Kruskal-Wallis tests and Dunn's post-hoc tests to compare precision across submitter types (Section 3.1.2) and architecture groups (Section 3.2.9), reporting H-statistics and p-values."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports absolute percentage differences in precision across groups (e.g., median 46.9% vs 31.5% for Verified vs Lite) providing baseline context for interpreting differences."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The study analyzes all entries on the leaderboards (79 + 99 = 178), which is a census rather than a sample, but no discussion of statistical power for the subgroup comparisons is provided."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Only medians and maximums are reported for % Resolved across groups. No standard deviations, IQR, or other spread measures are provided."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "This is the first study of its kind on SWE-Bench leaderboards, and no comparison against prior surveys or analyses is made."
     77       },
     78       "baselines_contemporary": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No prior comprehensive study of SWE-Bench leaderboards exists to serve as a baseline."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "This is a survey/observational study with no system components to ablate."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The study analyzes submissions across multiple dimensions: % Resolved, submitter type distribution, product availability, open-source status, LLM usage, and architectural classification."
     92       },
     93       "human_evaluation": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "Human evaluation of system outputs is not relevant to this survey of leaderboard submissions."
     97       },
     98       "held_out_test_set": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "This is a survey study, not an experiment with train/test splits."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Extensive breakdowns are provided by submitter type, product availability, open-source status, LLM family, architecture group (G1-G8), and per-leaderboard (Lite vs Verified)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 4 discusses patch overfitting problems, multi-agent failure modes (citing Cemri et al.'s 14 failure modes), benchmark limitations, and data contamination concerns."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports that no single architecture consistently achieves SOTA, that open-source-only LLM submissions achieve lower performance, and that some multi-agent approaches were abandoned in favor of single-agent (nFactorial, Warp examples in Section 4.3)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims about dominance of proprietary LLMs (especially Claude 3.5), presence of both agentic/non-agentic designs, and diverse contributor base are all supported by detailed results in Section 3."
    124       },
    125       "causal_claims_justified": {
    126         "applies": false,
    127         "answer": false,
    128         "justification": "The paper is descriptive/observational and does not make causal claims. Statements like 'progress has been driven by advances in AI' are framing, not testable causal claims."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly bounds its scope to SWE-Bench Lite and Verified leaderboards only, explains why other leaderboards (Full, Multimodal) were excluded (Section 2.1), and notes in its threats-to-validity discussion (Section 5) that the findings may not apply to other benchmarks."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 4 discusses alternative explanations including patch overfitting inflating scores, data contamination, benchmark saturation, and that performance differences may reflect LLM capability rather than architectural choices."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 4.1 explicitly discusses that % Resolved (the proxy) may not reflect patch correctness (the outcome), citing Wang et al.'s finding that resolution rates are overstated by 6.2 percentage points due to overfitting patches."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "This paper does not use LLMs in its methodology. It analyzes other submissions' LLM usage."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper does not use prompting. It is a manual survey study."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No LLMs or models are run as part of this study's methodology."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used in this study."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 2.1.1 describes the detailed data collection procedure: visiting leaderboard pages, following links, conducting Google searches with specific query templates, inspecting README/metadata files, and checking LinkedIn. Section 2.1.2 describes the coding schema development using inductive/deductive coding."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 5 (Threats to Validity) provides a dedicated multi-page discussion covering external, internal, construct, and conclusion validity."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 5 discusses specific threats: risk of missing documents for some approaches, challenges classifying submissions with limited descriptions, exclusion of monetary cost analysis due to changing token prices, and ambiguity in LLM usage information from informal sources."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper explicitly states it only studies SWE-Bench Lite and Verified (not Full or Multimodal), explains why, and notes 'we do not claim that our findings can be applied to' other benchmarks (Section 5, External Validity)."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The complete list of approaches and submissions is provided in the appendix [5] at a GitHub repository, and the underlying SWE-Bench leaderboard data is publicly accessible."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 2.1.1 provides a detailed step-by-step data collection procedure including visiting leaderboard pages, following links, Google searches, inspecting metadata files, and LinkedIn profiles."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The study analyzes publicly available leaderboard submissions."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 2 describes the full pipeline: leaderboard scraping → artifact collection → content analysis with coding schema → classification across multiple dimensions. Table 1 shows distribution of artifact types found."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source or acknowledgments section is present in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Both authors are affiliated with Universitat Politècnica de Catalunya, Spain. Neither appears to have conflicts with the evaluated systems."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding information is disclosed, so independence cannot be assessed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This paper does not evaluate a pre-trained model on any benchmark. It is a survey of leaderboard submissions."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "This paper does not evaluate a pre-trained model on any benchmark."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "This paper does not evaluate a pre-trained model on any benchmark. However, it does discuss contamination as a limitation of SWE-Bench itself (Section 4.4)."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "This is a survey paper with no computational method to cost."
    293       },
    294       "compute_budget_stated": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "This is a survey paper with no computational experiments."
    298       }
    299     },
    300     "survey_methodology": {
    301       "prisma_or_structured_protocol": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper does not follow PRISMA or any named structured review protocol. While the data collection is systematic (Section 2.1.1), there is no PRISMA flow diagram, no protocol registration, and the search strategy is specific to leaderboard scraping rather than a reproducible literature search."
    305       },
    306       "quality_assessment_of_sources": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The survey does not assess the quality of the source papers or blog posts used to characterize submissions. All sources are treated equally regardless of whether they are peer-reviewed papers, blog posts, or README files. The paper discusses patch overfitting concerns (Section 4.1) but does not systematically assess source quality."
    310       },
    311       "publication_bias_discussed": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The survey does not discuss publication bias in the leaderboard context — e.g., whether submissions with poor results are less likely to be submitted, creating a survivorship bias in the analyzed entries."
    315       }
    316     }
    317   },
    318   "claims": [
    319     {
    320       "claim": "The majority of submissions come from industry, particularly small companies and large publicly traded corporations, with industry accounting for 58% of distinct submitters.",
    321       "evidence": "Table 2 and Figure 4 show 41 out of 71 distinct submitters (58%) are from industry. Section 3.1.2 provides detailed breakdown.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "Proprietary LLMs, especially Claude 3.5/4 Sonnet, consistently achieve the highest precision on both leaderboards.",
    326       "evidence": "Table 5 shows Claude 4 Sonnet achieving 60.33% max on Lite and Claude 4 variants dominating top entries on Verified (up to 75.2%). Section 3.1.5 details the LLM analysis.",
    327       "supported": "strong"
    328     },
    329     {
    330       "claim": "No single architecture consistently achieves state-of-the-art performance across both leaderboards.",
    331       "evidence": "Table 6 shows top performance achieved by G3 (75.2% on Verified), G4 (60% on Lite), and G6 (73.2% on Verified). Kruskal-Wallis test on Lite showed no significant difference (p=0.0579). Section 3.2.9.",
    332       "supported": "strong"
    333     },
    334     {
    335       "claim": "Closed-source solutions exhibit higher median % Resolved, but open-source tools are showing competitive and sometimes state-of-the-art performance.",
    336       "evidence": "Table 4 shows closed-source median of 35.67% vs 30.33% on Lite and 50% vs 44.2% on Verified, but top entries on both are open-source.",
    337       "supported": "strong"
    338     },
    339     {
    340       "claim": "Since the introduction of SWE-Bench Verified, companies (especially small ones) have been driving progress toward state-of-the-art performance.",
    341       "evidence": "Figure 5b and Section 3.1.2 show small companies consistently pushing top results, with Refact.ai (74.4%) and Bytedance (75.2%) on Verified.",
    342       "supported": "strong"
    343     },
    344     {
    345       "claim": "SWE-Bench may be approaching saturation, with submissions achieving up to 75% precision.",
    346       "evidence": "Section 4.5 notes a 20+ percentage-point improvement over the previous year, with all submissions above 70% using Claude 4 models. The argument is based on trend extrapolation.",
    347       "supported": "weak"
    348     }
    349   ],
    350   "red_flags": [
    351     {
    352       "flag": "No quality assessment of sources",
    353       "detail": "The survey treats blog posts, LinkedIn posts, and README files as equivalent to peer-reviewed papers when characterizing approaches. No assessment of source reliability or completeness is performed."
    354     },
    355     {
    356       "flag": "Survivorship bias in leaderboard analysis",
    357       "detail": "Only submissions that were actually made to the leaderboard are analyzed. Approaches that performed poorly and were never submitted, or submissions that were withdrawn, are invisible. This could bias conclusions about what architectures work."
    358     },
    359     {
    360       "flag": "Classification subjectivity with no reported inter-rater reliability metrics",
    361       "detail": "The architectural classification (G1-G8) relies on manual content analysis by two authors, but no inter-rater reliability metrics (Cohen's kappa, etc.) are reported. The paper states annotations were 'cross-validated among the authors' but provides no quantitative agreement measure."
    362     }
    363   ],
    364   "cited_papers": [
    365     {
    366       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    367       "authors": [
    368         "Carlos E. Jimenez",
    369         "John Yang",
    370         "Alexander Wettig",
    371         "Shunyu Yao",
    372         "Kexin Pei",
    373         "Ofir Press",
    374         "Karthik Narasimhan"
    375       ],
    376       "year": 2024,
    377       "relevance": "The benchmark being studied; foundational to the entire SWE-bench ecosystem analyzed in this paper."
    378     },
    379     {
    380       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    381       "authors": [
    382         "Chunqiu Steven Xia",
    383         "Yinlin Deng",
    384         "Soren Dunn",
    385         "Lingming Zhang"
    386       ],
    387       "year": 2024,
    388       "relevance": "Pioneering non-agentic approach on SWE-bench that spawned multiple extensions and variants."
    389     },
    390     {
    391       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    392       "authors": [
    393         "John Yang",
    394         "Carlos E. Jimenez",
    395         "Alexander Wettig",
    396         "Kilian Lieret",
    397         "Shunyu Yao",
    398         "Karthik Narasimhan",
    399         "Ofir Press"
    400       ],
    401       "year": 2024,
    402       "relevance": "Foundational single-agent system for SWE-bench, demonstrating emergent workflow with ReAct."
    403     },
    404     {
    405       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    406       "authors": [
    407         "Xingyao Wang",
    408         "Boxuan Li"
    409       ],
    410       "year": 2024,
    411       "relevance": "Open platform for AI coding agents, advocates single-agent architecture, competitive SWE-bench results."
    412     },
    413     {
    414       "title": "AutoCodeRover: Autonomous Program Improvement",
    415       "authors": [
    416         "Yuntong Zhang",
    417         "Haifeng Ruan",
    418         "Zhiyu Fan",
    419         "Abhik Roychoudhury"
    420       ],
    421       "year": 2024,
    422       "relevance": "Multi-agent scaffolded workflow for SWE-bench, later acquired by Sonar and extended to SpecRover."
    423     },
    424     {
    425       "title": "MASAI: Modular Architecture for Software-Engineering AI Agents",
    426       "authors": [
    427         "Daman Arora"
    428       ],
    429       "year": 2024,
    430       "relevance": "Multi-agent modular architecture with specialized sub-agents for different repair phases."
    431     },
    432     {
    433       "title": "Large Language Model-based Agents for Software Engineering: A Survey",
    434       "authors": [
    435         "Junwei Liu",
    436         "Kaixin Wang"
    437       ],
    438       "year": 2024,
    439       "relevance": "Provides the end-to-end software maintenance pipeline taxonomy used as the analytical framework for RQ3."
    440     },
    441     {
    442       "title": "Why Do Multi-Agent LLM Systems Fail?",
    443       "authors": [
    444         "Mert Cemri"
    445       ],
    446       "year": 2025,
    447       "relevance": "Empirical study of multi-agent failure modes in SWE-bench systems, identifying 14 distinct failure types."
    448     },
    449     {
    450       "title": "Are 'Solved Issues' in SWE-Bench Really Solved Correctly? An Empirical Study",
    451       "authors": [
    452         "You Wang",
    453         "Michael Pradel",
    454         "Zhongxin Liu"
    455       ],
    456       "year": 2025,
    457       "relevance": "Found SWE-bench resolution rates are overstated by 6.2 percentage points due to overfitting patches."
    458     },
    459     {
    460       "title": "PatchPilot: A Stable and Cost-Efficient Agentic Patching Framework",
    461       "authors": [
    462         "Hongwei Li",
    463         "Yuheng Tang",
    464         "Shiqi Wang",
    465         "Wenbo Guo"
    466       ],
    467       "year": 2025,
    468       "relevance": "Multi-stage agentic patching framework with refinement component, achieving competitive SWE-bench results."
    469     },
    470     {
    471       "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
    472       "authors": [
    473         "Yuxiang Wei"
    474       ],
    475       "year": 2025,
    476       "relevance": "Uses reinforcement learning to train LLMs for autonomous developer reasoning, evaluated on SWE-bench."
    477     },
    478     {
    479       "title": "The SWE-Bench Illusion: When State-of-the-Art LLMs Remember Instead of Reason",
    480       "authors": [
    481         "Shanchao Liang",
    482         "Spandan Garg",
    483         "Roshanak Zilouchian Moghaddam"
    484       ],
    485       "year": 2025,
    486       "relevance": "Investigates whether SWE-bench performance is driven by memorization rather than genuine coding capability."
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 1,
    492       "justification": "Provides a taxonomy of repair architectures but no directly usable tool or technique practitioners can apply at work."
    493     },
    494     "surprise_contrarian": {
    495       "score": 1,
    496       "justification": "The finding that no single architecture dominates is mildly surprising given hype around agentic systems, but most other findings confirm conventional wisdom."
    497     },
    498     "fear_safety": {
    499       "score": 0,
    500       "justification": "No safety, security, or risk angle is discussed."
    501     },
    502     "drama_conflict": {
    503       "score": 2,
    504       "justification": "Directly examines SWE-Bench limitations including potential saturation, patch overfitting inflating scores by ~6pp, and questions whether industry submitters account for correctness."
    505     },
    506     "demo_ability": {
    507       "score": 0,
    508       "justification": "Pure observational study with no code, demo, or interactive artifact to try."
    509     },
    510     "brand_recognition": {
    511       "score": 2,
    512       "justification": "Centers on the widely-discussed SWE-Bench benchmark and references major companies (Anthropic, Google, Amazon, OpenAI) and products (Claude, Copilot-adjacent tools)."
    513     }
    514   }
    515 }

Impressum · Datenschutz