scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20598B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Dissecting the SWE-Bench Leaderboards: Profiling Submitters and Architectures of LLM- and Agent-Based Repair Systems",
      6     "authors": [
      7       "Matias Martinez",
      8       "Xavier Franch"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2506.17208",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All abstract claims—first comprehensive study, 80 unique approaches, Claude dominance, architectural diversity, contributor diversity—are substantiated by Tables 2–6 and the RQ results sections.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "The paper is descriptive and observational; it makes no formal causal claims, only correlational or descriptive observations about leaderboard submissions.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Section 5 (External Validity) explicitly states the findings are bounded to SWE-Bench Lite and Verified and that 'we do not claim that our findings can be applied to them' (other benchmarks).",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 4.3 presents competing perspectives on single- vs. multi-agent architectures from Cognition, Anthropic, OpenHands, and nFactorial; Section 3.1.2 attributes early academic underperformance partly to temporal effects rather than capability.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Section 4.1 explicitly discusses that '% Resolved' conflates plausible and correct patches, citing Wang et al.'s finding of a 6.2pp average overstatement, and calls for additional validation beyond test suites.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5 'Threats to Validity' contains four dedicated subsections: External, Internal, Construct, and Conclusion Validity.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Threats are concrete: risk of missing submission documentation (Internal), exclusion of monetary cost analysis due to token-price normalization difficulty, G8 category for architecturally unclassifiable entries, and content analysis limitations (Construct).",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Section 2.1 explicitly excludes Full and Multimodal leaderboards with rationale; the data cutoff is July 17th, pinned to a specific GitHub commit hash.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding source is mentioned anywhere in the paper.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors are identified as affiliated with Universitat Politècnica de Catalunya, Barcelona, Spain, with contact emails provided.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funding is disclosed, so independence of funder cannot be assessed.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of financial interests appears in the paper despite analyzing commercial tools from major AI companies.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 2.2 formally defines workflow authoring (human vs. emergent), control flow autonomy (emergent, scaffolded, fixed), and agent count categories; Section 2.1.2 defines submitter categories with explicit coding schemas.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper explicitly states it presents 'the first in-depth study of the SWE-Bench leaderboards' with three clearly stated research questions (RQ1–RQ3) covering submitter profiling, architecture, and pipeline phases.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 6 engages substantively with prior empirical studies of SWE-Bench patches (Meng et al., Wang et al., Aleithan et al., Ceka et al.) and distinguishes this leaderboard-level characterization from their patch-level analyses.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "The leaderboard data source is pinned to a specific GitHub commit hash; Section 2.1.1 provides the Google supplementary query format ('<Name_Entry> + SWE-Bench') with a worked example.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Inclusion: all entries on Lite and Verified as of July 17th. Exclusion: Full (all solutions are subsets of Lite/Verified) and Multimodal (language-based focus only), with explicit rationale for each.",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No PRISMA or equivalent structured review protocol is followed; the paper uses content analysis on leaderboard data rather than a systematic literature review protocol with formal screening stages.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "Section 2.1.1 explicitly provides the Google query format: '<Name_Entry> + SWE-Bench' with example 'GRU' from entry 'Gru(2024-12-08)'.",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Sources explicitly listed: SWE-Bench leaderboard pages, SWE-Bench GitHub repository (experiments), Google search, LinkedIn, arXiv, and scientific publications.",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Table 1 shows artifact type distribution but there is no PRISMA-style flow diagram with counts at each screening stage (records identified → screened → excluded → included).",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Section 2.1 explains the choice of Lite and Verified (high impact, full coverage of other leaderboards), the July 17th cutoff, and the exclusion of non-language modalities.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Section 4.3 presents substantive disagreement between Cognition (anti-multi-agent), Anthropic (pro-multi-agent for their use case), OpenHands (pro-single-agent), and empirical evolution of nFactorial across four submissions.",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The paper categorizes submissions by documentation type (Table 1) and introduces G8 for unclassifiable entries but applies no formal quality rubric or risk-of-bias assessment to source papers or approaches.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "The paper does not discuss leaderboard submission bias (e.g., that only successful approaches are submitted, that negative results are never shared), which is a relevant concern for interpreting apparent progress trends.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Kruskal-Wallis tests with Dunn's post-hoc comparisons are applied to compare % Resolved across submitter types and architecture groups (Tables 2, 6); median and maximum precision are reported for all categories.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Recommendations (semantic correctness validation, open-source framework value) directly follow from documented empirical findings: Wang et al.'s overfitting data and the SIMA/Augment Code open-source success cases.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Industry accounts for 58% of distinct submitters and 65% of entries in SWE-Bench Verified, with small companies the dominant subtype.",
    198       "evidence": "Table 2 and Figures 2–4 show 41/71 distinct submitters from industry; 65/99 Verified entries are from industry; 15–16 of those are small companies.",
    199       "supported": "strong"
    200     },
    201     {
    202       "claim": "Proprietary LLMs—especially Claude 3.5 and Claude 4 families—consistently achieve the highest precision on both leaderboards.",
    203       "evidence": "Table 5 shows Claude 3.5 Sonnet is the most-used model; Section 4.5 notes all systems exceeding 70% on Verified use Claude 4 models.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "No single architecture consistently achieves state-of-the-art performance across both leaderboards.",
    208       "evidence": "Table 6 and Kruskal-Wallis tests: no statistically significant architecture differences in Lite (p=0.0579); G3 tops Verified max but G6 (31 entries) is largest and competitive.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Open-source solutions are approaching competitive performance with closed-source, with several reaching state-of-the-art in 2025.",
    213       "evidence": "Table 4 shows top-ranked entries in both leaderboards are open-source; Figure 7 shows convergence of open- and closed-source performance in 2025.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "SWE-Bench may be approaching saturation, with 75.2% precision reached in July 2025 versus ~50% one year earlier.",
    218       "evidence": "Figure 1b documents the progression; Section 4.5 draws HumanEval saturation analogy but notes this is a projection, not established fact.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Current SWE-Bench evaluation overstates resolution rates by ~6.2 percentage points due to patch overfitting.",
    223       "evidence": "Section 4.1 cites Wang et al. [75] who ran PatchDiff on three systems; this finding is not independently verified in the present paper.",
    224       "supported": "moderate"
    225     }
    226   ],
    227   "methodology_tags": [
    228     "observational",
    229     "meta-analysis",
    230     "qualitative"
    231   ],
    232   "key_findings": "This first comprehensive characterization of the SWE-Bench Lite (79 entries) and Verified (99 entries) leaderboards finds that industry—especially small companies—dominates submissions (65% of Verified entries), while proprietary LLMs (Claude 3.5/4) consistently achieve the highest precision. No single architecture reliably outperforms: human-authored multi-agent fixed workflows (G3) and scaffolded single-agent (G4) approaches top SWE-Bench Lite, while emergent single-agent systems (G6) are the most numerous and competitive on Verified. Open-source approaches became increasingly competitive throughout 2025. The benchmark shows saturation signals at 75% precision, and its test-passing metric likely overstates true resolution rates due to patch overfitting.",
    233   "red_flags": [
    234     {
    235       "flag": "No funding disclosure",
    236       "detail": "No funding source is mentioned anywhere in the paper, making it impossible to assess potential financial conflicts of interest."
    237     },
    238     {
    239       "flag": "No competing interests statement",
    240       "detail": "Neither author declares financial interests despite the paper profiling commercial tools from Anthropic, Google, Amazon, IBM, and others."
    241     },
    242     {
    243       "flag": "Non-reproducible supplementary search",
    244       "detail": "Google search results and LinkedIn browsing used to supplement leaderboard data cannot be exactly reproduced; results vary by user, date, and locale."
    245     },
    246     {
    247       "flag": "Large unclassifiable subset",
    248       "detail": "13 Lite entries and 16 Verified entries (G8) cannot be architecturally classified due to insufficient public documentation, limiting the scope of architectural conclusions."
    249     },
    250     {
    251       "flag": "No PRISMA or formal screening flow",
    252       "detail": "Despite systematically reviewing a corpus of submissions, the paper omits a PRISMA-style screening diagram with counts at each exclusion stage."
    253     },
    254     {
    255       "flag": "Submission bias unaddressed",
    256       "detail": "Leaderboard submissions are self-selected (only positive results submitted); the paper does not discuss how this biases observed architecture or LLM performance distributions."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    262       "relevance": "Primary benchmark analyzed; foundational paper for the entire study."
    263     },
    264     {
    265       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    266       "relevance": "Most-cited non-agentic approach; spawned multiple leaderboard extensions analyzed in detail."
    267     },
    268     {
    269       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    270       "relevance": "Key emergent single-agent baseline (G6); analyzed across all three RQs."
    271     },
    272     {
    273       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    274       "relevance": "Liu et al. taxonomy provides the pipeline phase framework used for RQ3."
    275     },
    276     {
    277       "title": "Are 'Solved Issues' in SWE-Bench Really Solved Correctly? An Empirical Study",
    278       "relevance": "Wang et al. finding of 6.2pp overstatement from patch overfitting, central to Section 4.1 discussion."
    279     },
    280     {
    281       "title": "Introducing SWE-Bench Verified",
    282       "relevance": "Describes construction criteria for the second leaderboard analyzed."
    283     },
    284     {
    285       "title": "Why Do Multi-Agent LLM Systems Fail?",
    286       "relevance": "Provides 14-failure-mode taxonomy used in Section 4.3's single vs. multi-agent debate."
    287     },
    288     {
    289       "title": "AutoCodeRover: Autonomous Program Improvement",
    290       "relevance": "G5 multi-agent scaffolded approach; one of the most-cited academic submissions on the leaderboard."
    291     },
    292     {
    293       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    294       "relevance": "G6 open-source platform with multiple submissions; cited for single-agent architecture advocacy."
    295     },
    296     {
    297       "title": "Revisiting SWE-Bench: On the Importance of Data Quality for LLM-Based Code Models",
    298       "relevance": "Aleithan et al. patch quality analysis motivating discussion of evaluation reliability."
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 3,
    304       "justification": "Directly informs AI practitioners on which LLMs, architectures, and product types are winning on the most-watched coding agent benchmark — immediately actionable."
    305     },
    306     "surprise_contrarian": {
    307       "score": 2,
    308       "justification": "Challenges the assumption that complex multi-agent architectures are superior — single-agent emergent systems (G6) are the largest and competitive group — and shows individual developers can match major tech companies."
    309     },
    310     "fear_safety": {
    311       "score": 0,
    312       "justification": "No AI risk or safety concerns raised; focus is on benchmark performance and submitter characteristics."
    313     },
    314     "drama_conflict": {
    315       "score": 2,
    316       "justification": "Section 4.3 documents a real public dispute between Cognition (anti-multi-agent post) and Anthropic (pro-multi-agent post one day later), and highlights the misalignment between academic and industry evaluation standards."
    317     },
    318     "demo_ability": {
    319       "score": 1,
    320       "justification": "SWE-Bench leaderboard is public and readers can explore submissions, but the paper itself is analytical without a demo-able artifact."
    321     },
    322     "brand_recognition": {
    323       "score": 2,
    324       "justification": "Analyzes submissions from Anthropic, Google, Amazon, IBM, ByteDance, Meta, and Princeton, providing high name-recognition density even though the authors are from UPC Barcelona."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [
    329       {
    330         "hn_id": "44489690",
    331         "title": "Mercury: Ultra-fast language models based on diffusion",
    332         "points": 576,
    333         "comments": 242,
    334         "url": "https://news.ycombinator.com/item?id=44489690",
    335         "created_at": "2025-07-07T12:31:08Z"
    336       },
    337       {
    338         "hn_id": "44412427",
    339         "title": "Mercury: Ultra-Fast Language Models Based on Diffusion",
    340         "points": 10,
    341         "comments": 2,
    342         "url": "https://news.ycombinator.com/item?id=44412427",
    343         "created_at": "2025-06-29T12:05:48Z"
    344       },
    345       {
    346         "hn_id": "44358841",
    347         "title": "Machine Mental Imagery: Empower Multimodal Reasoning with Latent Visual Tokens",
    348         "points": 7,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=44358841",
    351         "created_at": "2025-06-23T18:52:55Z"
    352       },
    353       {
    354         "hn_id": "44101770",
    355         "title": "Effective Reinforcement Learning for Reasoning in Language Models",
    356         "points": 4,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=44101770",
    359         "created_at": "2025-05-26T21:17:20Z"
    360       },
    361       {
    362         "hn_id": "44314613",
    363         "title": "Wanting to Be Understood Explains the Meta-Problem of Consciousness",
    364         "points": 3,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=44314613",
    367         "created_at": "2025-06-19T01:16:41Z"
    368       },
    369       {
    370         "hn_id": "44304578",
    371         "title": "Serving Large Language Models on Huawei CloudMatrix384",
    372         "points": 3,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=44304578",
    375         "created_at": "2025-06-17T22:18:43Z"
    376       },
    377       {
    378         "hn_id": "44009979",
    379         "title": "A Search for Planet Nine with IRAS and Akari Data",
    380         "points": 3,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=44009979",
    383         "created_at": "2025-05-16T21:35:58Z"
    384       },
    385       {
    386         "hn_id": "46445614",
    387         "title": "Mechanical non-reciprocity programmed by shear jamming in soft composite solids",
    388         "points": 2,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=46445614",
    391         "created_at": "2025-12-31T16:32:15Z"
    392       },
    393       {
    394         "hn_id": "44047429",
    395         "title": "Model Merging in Pre-Training of Large Language Models",
    396         "points": 2,
    397         "comments": 0,
    398         "url": "https://news.ycombinator.com/item?id=44047429",
    399         "created_at": "2025-05-21T01:12:29Z"
    400       },
    401       {
    402         "hn_id": "42816449",
    403         "title": "Dissecting the NVIDIA Hopper Architecture through Microbenchmarking",
    404         "points": 2,
    405         "comments": 0,
    406         "url": "https://news.ycombinator.com/item?id=42816449",
    407         "created_at": "2025-01-24T20:02:41Z"
    408       }
    409     ],
    410     "top_points": 576,
    411     "total_points": 612,
    412     "total_comments": 244
    413   }
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs