ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (21003B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Dissecting the SWE-Bench Leaderboards: Profiling Submitters and Architectures of LLM- and Agent-Based Repair Systems",
      6     "authors": [
      7       "Matias Martinez",
      8       "Xavier Franch"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2506.17208",
     13     "doi": "10.1145/nnnnnnn.nnnnnnn"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract claims about dominance of proprietary LLMs (especially Claude 3.5), presence of both agentic/non-agentic designs, and diverse contributor base are all supported by detailed results in Section 3.",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "The paper is descriptive/observational and does not make causal claims. Statements like 'progress has been driven by advances in AI' are framing, not testable causal claims.",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper explicitly bounds its scope to the SWE-Bench Lite and Verified leaderboards, explains why other leaderboards (Full, Multimodal) were excluded (Section 2.1), and notes in its threats-to-validity discussion (Section 5) that the findings may not apply to other benchmarks.",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 4 discusses alternative explanations including patch overfitting inflating scores, data contamination, benchmark saturation, and that performance differences may reflect LLM capability rather than architectural choices.",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Section 4.1 explicitly discusses that % Resolved (the proxy) may not reflect patch correctness (the outcome), citing Wang et al.'s finding that resolution rates are overstated by 6.2 percentage points due to overfitting patches.",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5 (Threats to Validity) provides a dedicated multi-page discussion covering external, internal, construct, and conclusion validity.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5 discusses specific threats: risk of missing documents for some approaches, challenges classifying submissions with limited descriptions, exclusion of monetary cost analysis due to changing token prices, and ambiguity in LLM usage information from informal sources.",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly states it only studies SWE-Bench Lite and Verified (not Full or Multimodal), explains why, and notes 'we do not claim that our findings can be applied to' other benchmarks (Section 5, External Validity).",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding source or acknowledgments section is present in the paper.",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors are affiliated with Universitat Politècnica de Catalunya, Spain. Neither appears to have conflicts with the evaluated systems.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No funding information is disclosed, so independence cannot be assessed.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial interests statement is present in the paper.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key architectural dimensions are formally defined in Section 2.2: 'workflow authoring' (human vs. emergent), 'control flow autonomy' (emergent/scaffolded/fixed), and 'number of agents.' Submitter categories and product availability codes are also defined with explicit criteria.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper states explicitly that it presents 'the first comprehensive characterization of all submitters and solutions published on the SWE-Bench leaderboards,' guided by three clearly stated research questions (RQ1–RQ3).",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 6 contains substantive related work covering prior SWE-Bench empirical studies, approaches not in leaderboards, and APR on other benchmarks. The paper situates its contribution relative to patch quality studies (Wang et al., Aleithan et al.) and trajectory analyses (Ceka et al., Bouzenia et al.).",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "Primary corpus is the leaderboard (anchored at a specific commit SHA), but secondary information gathering relies on Google searches, LinkedIn, and blog posts whose results would vary over time and are not archived — not fully reproducible.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Section 2.1 explicitly excludes SWE-Bench Full (all solutions also in Lite/Verified) and Multimodal (restricted to language-based agents), with stated reasons. All entries in Lite and Verified up to July 17 are included.",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The paper uses content analysis methodology (deductive/inductive coding) but does not follow PRISMA or any other structured review protocol — no flow diagram, no formal protocol registration.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "The Google query format is specified: '\"<Name_Entry> + SWE-Bench\"' with description of how entry names are constructed (e.g., 'GRU' from 'Gru(2024-12-08)').",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Information sources are explicitly listed: SWE-Bench leaderboard (with commit SHA), submission README.md/metadata.yaml files, Google Scholar, arXiv, LinkedIn, company websites, and blog posts.",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "Table 1 shows documentation type distribution across entries; the paper reports 79 Lite entries, 99 Verified entries, 80 unique approaches, and explicitly notes 3 entries per leaderboard with no associated artifact.",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "The choice of SWE-Bench Lite and Verified is justified by the benchmark's 'substantial impact on both academia and industry' (780+ citations), and exclusions (Full, Multimodal) are explicitly reasoned in Section 2.1.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Section 4.3 explicitly presents the single-agent vs. multi-agent debate with opposing views from Cognition, Anthropic, OpenHands, nFactorial, and Warp, concluding 'there is no one-size-fits-all solution.'",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The paper characterizes documentation types (Table 1: scientific articles vs blog posts vs README) but applies no formal quality rubric to assess the rigor of the approaches or their documentation.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "The paper does not discuss selection bias in which approaches are submitted to leaderboards (e.g., only high-performing systems are likely to be submitted), nor whether negative results are systematically absent.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Kruskal-Wallis tests with Dunn's post-hoc comparisons are applied to % Resolved by submitter type and architecture; median and max statistics are tabulated across multiple dimensions in Tables 2, 3, 4, 5, and 6.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The paper's summary conclusions (e.g., proprietary LLMs lead, no single architecture dominates, open-source catching up) are each tied to specific tables and statistical tests rather than to author opinion alone.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Industry submitters dominate SWE-Bench leaderboards, accounting for 58% of distinct submitters and increasing to 66% of Verified entries.",
    198       "evidence": "Table 2 and Figures 2/4 with entry counts by submitter type across both leaderboards.",
    199       "supported": "strong"
    200     },
    201     {
    202       "claim": "Proprietary LLMs, especially Claude 3.5 Sonnet and more recently Claude 4, have consistently achieved the highest % Resolved scores on both leaderboards.",
    203       "evidence": "Table 5 showing LLM combinations sorted by % Resolved; temporal Figure 9 showing Claude models dominating recent SOTA.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "No single architecture (single agent, multi-agent, fixed workflow) consistently achieves state-of-the-art performance.",
    208       "evidence": "Table 6 showing overlapping performance ranges across G1–G7; Kruskal-Wallis results showing no significant difference in Lite (p=0.0579).",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Open-source solutions are achieving competitive performance, with several reaching SOTA on both leaderboards in 2025.",
    213       "evidence": "Table 4 and Figure 7 showing open-source entries holding top rank intermittently; ExpeRepair (60.33%) and Moatless (70.8%) as examples.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Current % Resolved metrics overstate true repair performance by ~6.2 percentage points due to patch overfitting.",
    218       "evidence": "Cited from Wang et al. (2025), not the authors' own analysis — used to motivate discussion in Section 4.1.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "SWE-Bench Verified may be approaching saturation, with July 2025 SOTA at 75.2% and all systems above 70% relying on Claude 4.",
    223       "evidence": "Figure 9b and Section 4.5 discussion; historical progression documented in Table 5.",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Fine-tuned open-source models combined with closed-source LLMs can achieve near-SOTA performance (e.g., AgentScope at 63.4%).",
    228       "evidence": "Table 5 entry for AgentScope (Claude 3.5 Sonnet + Qwen2.5) and discussion in Section 3.1.5.",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "observational",
    234     "qualitative",
    235     "meta-analysis"
    236   ],
    237   "key_findings": "Industry actors — especially small companies — dominate SWE-Bench submissions and have driven most SOTA improvements, particularly on SWE-Bench Verified. Proprietary LLMs (Claude 3.5/4 family) are used in the highest-performing entries, though fully open-source approaches are increasingly competitive. No single architectural paradigm (fixed workflow, scaffolded, emergent; single vs. multi-agent) consistently wins — top performance is achieved across diverse designs, suggesting that LLM quality is a stronger determinant than architecture. The benchmark may be approaching saturation at ~75%, and the paper highlights a systematic gap in correctness validation: % Resolved overstates true repair rates due to patch overfitting that is inconsistently addressed across submitters, especially outside the SE research community.",
    238   "red_flags": [
    239     {
    240       "flag": "No inter-rater reliability metric",
    241       "detail": "Architecture classifications relied on subjective interpretation of incomplete documentation. The paper mentions 'cross-validation among authors' but reports no Cohen's kappa or other reliability coefficient."
    242     },
    243     {
    244       "flag": "16 architecturally unclassifiable entries (G8)",
    245       "detail": "20% of Verified entries lack sufficient architectural documentation for classification. Findings about architecture-performance relationships may be biased toward better-documented (often academic) submissions."
    246     },
    247     {
    248       "flag": "Google search results not archived",
    249       "detail": "Secondary information gathering via Google, LinkedIn, and blog posts is not reproducible — results would differ if the study were repeated as pages disappear or change."
    250     },
    251     {
    252       "flag": "No funding disclosure",
    253       "detail": "Neither an acknowledgments section nor a funding statement appears anywhere in the paper, preventing assessment of potential conflicts of interest with the commercial systems evaluated."
    254     },
    255     {
    256       "flag": "Self-reported/inferred architecture data",
    257       "detail": "Architectural classifications depend on what submitters chose to disclose (blog posts, READMEs), creating a systematic bias toward more transparent (typically academic or open-source) systems."
    258     }
    259   ],
    260   "cited_papers": [
    261     {
    262       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    263       "relevance": "The primary benchmark being analyzed; provides the leaderboard infrastructure and defines the % Resolved metric."
    264     },
    265     {
    266       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    267       "relevance": "One of the most widely analyzed submissions; used as a baseline and extended by many other approaches."
    268     },
    269     {
    270       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    271       "relevance": "Another widely-extended submission representing the non-agentic workflow paradigm; many leaderboard entries build on it."
    272     },
    273     {
    274       "title": "AutoCodeRover: Autonomous Program Improvement",
    275       "relevance": "Key academic submission analyzed across both leaderboards; represents early SOTA in scaffolded multi-agent design."
    276     },
    277     {
    278       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    279       "relevance": "Key open-source platform with multiple submissions; represents single-agent emergent design paradigm."
    280     },
    281     {
    282       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    283       "relevance": "Provides the end-to-end software maintenance pipeline taxonomy used in RQ3 analysis."
    284     },
    285     {
    286       "title": "Are 'Solved Issues' in SWE-Bench Really Solved Correctly? An Empirical Study",
    287       "relevance": "Key related work on patch overfitting in SWE-Bench; provides the 6.2pp overstatement estimate cited in the discussion."
    288     },
    289     {
    290       "title": "Introducing SWE-bench Verified",
    291       "relevance": "Describes the construction of SWE-Bench Verified leaderboard and its curation criteria."
    292     },
    293     {
    294       "title": "Why Do Multi-Agent LLM Systems Fail?",
    295       "relevance": "Provides taxonomy of multi-agent failure modes, directly relevant to the single vs. multi-agent architectural debate."
    296     }
    297   ],
    298   "engagement_factors": {
    299     "practical_relevance": {
    300       "score": 1,
    301       "justification": "Provides a taxonomy of repair architectures but no directly usable tool or technique practitioners can apply at work."
    302     },
    303     "surprise_contrarian": {
    304       "score": 1,
    305       "justification": "The finding that no single architecture dominates is mildly surprising given hype around agentic systems, but most other findings confirm conventional wisdom."
    306     },
    307     "fear_safety": {
    308       "score": 0,
    309       "justification": "No safety, security, or risk angle is discussed."
    310     },
    311     "drama_conflict": {
    312       "score": 2,
    313       "justification": "Directly examines SWE-Bench limitations including potential saturation, patch overfitting inflating scores by ~6pp, and questions whether industry submitters account for correctness."
    314     },
    315     "demo_ability": {
    316       "score": 0,
    317       "justification": "Pure observational study with no code, demo, or interactive artifact to try."
    318     },
    319     "brand_recognition": {
    320       "score": 2,
    321       "justification": "Centers on the widely-discussed SWE-Bench benchmark and references major companies (Anthropic, Google, Amazon, OpenAI) and products (Claude, Copilot-adjacent tools)."
    322     }
    323   },
    324   "hn_data": {
    325     "threads": [
    326       {
    327         "hn_id": "44489690",
    328         "title": "Mercury: Ultra-fast language models based on diffusion",
    329         "points": 576,
    330         "comments": 242,
    331         "url": "https://news.ycombinator.com/item?id=44489690",
    332         "created_at": "2025-07-07T12:31:08Z"
    333       },
    334       {
    335         "hn_id": "44412427",
    336         "title": "Mercury: Ultra-Fast Language Models Based on Diffusion",
    337         "points": 10,
    338         "comments": 2,
    339         "url": "https://news.ycombinator.com/item?id=44412427",
    340         "created_at": "2025-06-29T12:05:48Z"
    341       },
    342       {
    343         "hn_id": "44358841",
    344         "title": "Machine Mental Imagery: Empower Multimodal Reasoning with Latent Visual Tokens",
    345         "points": 7,
    346         "comments": 0,
    347         "url": "https://news.ycombinator.com/item?id=44358841",
    348         "created_at": "2025-06-23T18:52:55Z"
    349       },
    350       {
    351         "hn_id": "44101770",
    352         "title": "Effective Reinforcement Learning for Reasoning in Language Models",
    353         "points": 4,
    354         "comments": 0,
    355         "url": "https://news.ycombinator.com/item?id=44101770",
    356         "created_at": "2025-05-26T21:17:20Z"
    357       },
    358       {
    359         "hn_id": "44314613",
    360         "title": "Wanting to Be Understood Explains the Meta-Problem of Consciousness",
    361         "points": 3,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=44314613",
    364         "created_at": "2025-06-19T01:16:41Z"
    365       },
    366       {
    367         "hn_id": "44304578",
    368         "title": "Serving Large Language Models on Huawei CloudMatrix384",
    369         "points": 3,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=44304578",
    372         "created_at": "2025-06-17T22:18:43Z"
    373       },
    374       {
    375         "hn_id": "44009979",
    376         "title": "A Search for Planet Nine with IRAS and Akari Data",
    377         "points": 3,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=44009979",
    380         "created_at": "2025-05-16T21:35:58Z"
    381       },
    382       {
    383         "hn_id": "46445614",
    384         "title": "Mechanical non-reciprocity programmed by shear jamming in soft composite solids",
    385         "points": 2,
    386         "comments": 0,
    387         "url": "https://news.ycombinator.com/item?id=46445614",
    388         "created_at": "2025-12-31T16:32:15Z"
    389       },
    390       {
    391         "hn_id": "44047429",
    392         "title": "Model Merging in Pre-Training of Large Language Models",
    393         "points": 2,
    394         "comments": 0,
    395         "url": "https://news.ycombinator.com/item?id=44047429",
    396         "created_at": "2025-05-21T01:12:29Z"
    397       },
    398       {
    399         "hn_id": "42816449",
    400         "title": "Dissecting the NVIDIA Hopper Architecture through Microbenchmarking",
    401         "points": 2,
    402         "comments": 0,
    403         "url": "https://news.ycombinator.com/item?id=42816449",
    404         "created_at": "2025-01-24T20:02:41Z"
    405       }
    406     ],
    407     "top_points": 576,
    408     "total_points": 612,
    409     "total_comments": 244
    410   }
    411 }

Impressum · Datenschutz