scan-v5.json (20598B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Dissecting the SWE-Bench Leaderboards: Profiling Submitters and Architectures of LLM- and Agent-Based Repair Systems", 6 "authors": [ 7 "Matias Martinez", 8 "Xavier Franch" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2506.17208", 13 "doi": null 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "All abstract claims—first comprehensive study, 80 unique approaches, Claude dominance, architectural diversity, contributor diversity—are substantiated by Tables 2–6 and the RQ results sections.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": false, 25 "answer": false, 26 "justification": "The paper is descriptive and observational; it makes no formal causal claims, only correlational or descriptive observations about leaderboard submissions.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "Section 5 (External Validity) explicitly states the findings are bounded to SWE-Bench Lite and Verified and that 'we do not claim that our findings can be applied to them' (other benchmarks).", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 4.3 presents competing perspectives on single- vs. multi-agent architectures from Cognition, Anthropic, OpenHands, and nFactorial; Section 3.1.2 attributes early academic underperformance partly to temporal effects rather than capability.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "Section 4.1 explicitly discusses that '% Resolved' conflates plausible and correct patches, citing Wang et al.'s finding of a 6.2pp average overstatement, and calls for additional validation beyond test suites.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section 5 'Threats to Validity' contains four dedicated subsections: External, Internal, Construct, and Conclusion Validity.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Threats are concrete: risk of missing submission documentation (Internal), exclusion of monetary cost analysis due to token-price normalization difficulty, G8 category for architecturally unclassifiable entries, and content analysis limitations (Construct).", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "Section 2.1 explicitly excludes Full and Multimodal leaderboards with rationale; data cutoff is July 17th pinned to a specific GitHub commit hash.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": false, 72 "justification": "No funding source is mentioned anywhere in the paper.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Both authors are identified as affiliated with Universitat Politècnica de Catalunya, Barcelona, Spain, with contact emails provided.", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": false, 83 "answer": false, 84 "justification": "No funding is disclosed, so independence of funder cannot be assessed.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement or declaration of financial interests appears in the paper despite analyzing commercial tools from major AI companies.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 2.2 formally defines workflow authoring (human vs. emergent), control flow autonomy (emergent, scaffolded, fixed), and agent count categories; Section 2.1.2 defines submitter categories with explicit coding schemas.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper explicitly states it presents 'the first in-depth study of the SWE-Bench leaderboards' with three clearly stated research questions (RQ1–RQ3) covering submitter profiling, architecture, and pipeline phases.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 6 engages substantively with prior empirical studies of SWE-Bench patches (Meng et al., Wang et al., Aleithan et al., Ceka et al.) and distinguishes this leaderboard-level characterization from their patch-level analyses.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "survey": { 117 "search_and_selection": { 118 "search_strategy_reproducible": { 119 "applies": true, 120 "answer": true, 121 "justification": "The leaderboard data source is pinned to a specific GitHub commit hash; Section 2.1.1 provides the Google supplementary query format ('<Name_Entry> + SWE-Bench') with a worked example.", 122 "source": "haiku" 123 }, 124 "inclusion_exclusion_explicit": { 125 "applies": true, 126 "answer": true, 127 "justification": "Inclusion: all entries on Lite and Verified as of July 17th. Exclusion: Full (all solutions are subsets of Lite/Verified) and Multimodal (language-based focus only), with explicit rationale for each.", 128 "source": "haiku" 129 }, 130 "prisma_or_structured_protocol": { 131 "applies": true, 132 "answer": false, 133 "justification": "No PRISMA or equivalent structured review protocol is followed; the paper uses content analysis on leaderboard data rather than a systematic literature review protocol with formal screening stages.", 134 "source": "haiku" 135 }, 136 "search_terms_provided": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 2.1.1 explicitly provides the Google query format: '<Name_Entry> + SWE-Bench' with example 'GRU' from entry 'Gru(2024-12-08)'.", 140 "source": "haiku" 141 }, 142 "databases_listed": { 143 "applies": true, 144 "answer": true, 145 "justification": "Sources explicitly listed: SWE-Bench leaderboard pages, SWE-Bench GitHub repository (experiments), Google search, LinkedIn, arXiv, and scientific publications.", 146 "source": "haiku" 147 }, 148 "screening_process_documented": { 149 "applies": true, 150 "answer": false, 151 "justification": "Table 1 shows artifact type distribution but there is no PRISMA-style flow diagram with counts at each screening stage (records identified → screened → excluded → included).", 152 "source": "haiku" 153 }, 154 "review_scope_justified": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 2.1 explains the choice of Lite and Verified (high impact, full coverage of other leaderboards), the July 17th cutoff, and the exclusion of non-language modalities.", 158 "source": "haiku" 159 } 160 }, 161 "synthesis_quality": { 162 "conflicting_findings_acknowledged": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 4.3 presents substantive disagreement between Cognition (anti-multi-agent), Anthropic (pro-multi-agent for their use case), OpenHands (pro-single-agent), and empirical evolution of nFactorial across four submissions.", 166 "source": "haiku" 167 }, 168 "quality_assessment_of_sources": { 169 "applies": true, 170 "answer": false, 171 "justification": "The paper categorizes submissions by documentation type (Table 1) and introduces G8 for unclassifiable entries but applies no formal quality rubric or risk-of-bias assessment to source papers or approaches.", 172 "source": "haiku" 173 }, 174 "publication_bias_discussed": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not discuss leaderboard submission bias (e.g., that only successful approaches are submitted, that negative results are never shared), which is a relevant concern for interpreting apparent progress trends.", 178 "source": "haiku" 179 }, 180 "quantitative_synthesis_present": { 181 "applies": true, 182 "answer": true, 183 "justification": "Kruskal-Wallis tests with Dunn's post-hoc comparisons are applied to compare % Resolved across submitter types and architecture groups (Tables 2, 6); median and maximum precision reported for all categories.", 184 "source": "haiku" 185 }, 186 "recommendations_supported_by_evidence": { 187 "applies": true, 188 "answer": true, 189 "justification": "Recommendations (semantic correctness validation, open-source framework value) directly follow from documented empirical findings: Wang et al.'s overfitting data and the SIMA/Augment Code open-source success cases.", 190 "source": "haiku" 191 } 192 } 193 } 194 }, 195 "claims": [ 196 { 197 "claim": "Industry accounts for 58% of distinct submitters and 65% of entries in SWE-Bench Verified, with small companies the dominant subtype.", 198 "evidence": "Table 2 and Figures 2–4 show 41/71 distinct submitters from industry; 65/99 Verified entries are from industry; 15–16 of those are small companies.", 199 "supported": "strong" 200 }, 201 { 202 "claim": "Proprietary LLMs—especially Claude 3.5 and Claude 4 families—consistently achieve the highest precision on both leaderboards.", 203 "evidence": "Table 5 shows Claude 3.5 Sonnet is the most-used model; Section 4.5 notes all systems exceeding 70% on Verified use Claude 4 models.", 204 "supported": "strong" 205 }, 206 { 207 "claim": "No single architecture consistently achieves state-of-the-art performance across both leaderboards.", 208 "evidence": "Table 6 and Kruskal-Wallis tests: no statistically significant architecture differences in Lite (p=0.0579); G3 tops Verified max but G6 (31 entries) is largest and competitive.", 209 "supported": "strong" 210 }, 211 { 212 "claim": "Open-source solutions are approaching competitive performance with closed-source, with several reaching state-of-the-art in 2025.", 213 "evidence": "Table 4 shows top-ranked entries in both leaderboards are open-source; Figure 7 shows convergence of open- and closed-source performance in 2025.", 214 "supported": "strong" 215 }, 216 { 217 "claim": "SWE-Bench may be approaching saturation, with 75.2% precision reached in July 2025 versus ~50% one year earlier.", 218 "evidence": "Figure 1b documents the progression; Section 4.5 draws HumanEval saturation analogy but notes this is a projection, not established fact.", 219 "supported": "moderate" 220 }, 221 { 222 "claim": "Current SWE-Bench evaluation overstates resolution rates by ~6.2 percentage points due to patch overfitting.", 223 "evidence": "Section 4.1 cites Wang et al. [75] who ran PatchDiff on three systems; this finding is not independently verified in the present paper.", 224 "supported": "moderate" 225 } 226 ], 227 "methodology_tags": [ 228 "observational", 229 "meta-analysis", 230 "qualitative" 231 ], 232 "key_findings": "This first comprehensive characterization of SWE-Bench Lite (79 entries) and Verified (99 entries) leaderboards finds that industry—especially small companies—dominates submissions (65% of Verified entries), while proprietary LLMs (Claude 3.5/4) consistently achieve highest precision. No single architecture reliably outperforms: human-authored multi-agent fixed workflows (G3) and scaffolded single-agent (G4) approaches top SWE-Bench Lite, while emergent single-agent systems (G6) are the most numerous and competitive on Verified. Open-source approaches became increasingly competitive throughout 2025. The benchmark shows saturation signals at 75% precision, and its test-passing metric likely overstates true resolution rates due to patch overfitting.", 233 "red_flags": [ 234 { 235 "flag": "No funding disclosure", 236 "detail": "No funding source is mentioned anywhere in the paper, making it impossible to assess potential financial conflicts of interest." 237 }, 238 { 239 "flag": "No competing interests statement", 240 "detail": "Neither author declares financial interests despite the paper profiling commercial tools from Anthropic, Google, Amazon, IBM, and others." 241 }, 242 { 243 "flag": "Non-reproducible supplementary search", 244 "detail": "Google search results and LinkedIn browsing used to supplement leaderboard data cannot be exactly reproduced; results vary by user, date, and locale." 245 }, 246 { 247 "flag": "Large unclassifiable subset", 248 "detail": "13 Lite entries and 16 Verified entries (G8) cannot be architecturally classified due to insufficient public documentation, limiting the scope of architectural conclusions." 249 }, 250 { 251 "flag": "No PRISMA or formal screening flow", 252 "detail": "Despite systematically reviewing a corpus of submissions, the paper omits a PRISMA-style screening diagram with counts at each exclusion stage." 253 }, 254 { 255 "flag": "Submission bias unaddressed", 256 "detail": "Leaderboard submissions are self-selected (only positive results submitted); the paper does not discuss how this biases observed architecture or LLM performance distributions." 257 } 258 ], 259 "cited_papers": [ 260 { 261 "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?", 262 "relevance": "Primary benchmark analyzed; foundational paper for the entire study." 263 }, 264 { 265 "title": "Agentless: Demystifying LLM-Based Software Engineering Agents", 266 "relevance": "Most-cited non-agentic approach; spawned multiple leaderboard extensions analyzed in detail." 267 }, 268 { 269 "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering", 270 "relevance": "Key emergent single-agent baseline (G6); analyzed across all three RQs." 271 }, 272 { 273 "title": "Large Language Model-Based Agents for Software Engineering: A Survey", 274 "relevance": "Liu et al. taxonomy provides the pipeline phase framework used for RQ3." 275 }, 276 { 277 "title": "Are 'Solved Issues' in SWE-Bench Really Solved Correctly? An Empirical Study", 278 "relevance": "Wang et al. finding of 6.2pp overstatement from patch overfitting, central to Section 4.1 discussion." 279 }, 280 { 281 "title": "Introducing SWE-Bench Verified", 282 "relevance": "Describes construction criteria for the second leaderboard analyzed." 283 }, 284 { 285 "title": "Why Do Multi-Agent LLM Systems Fail?", 286 "relevance": "Provides 14-failure-mode taxonomy used in Section 4.3's single vs. multi-agent debate." 287 }, 288 { 289 "title": "AutoCodeRover: Autonomous Program Improvement", 290 "relevance": "G5 multi-agent scaffolded approach; one of the most-cited academic submissions on the leaderboard." 291 }, 292 { 293 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 294 "relevance": "G6 open-source platform with multiple submissions; cited for single-agent architecture advocacy." 295 }, 296 { 297 "title": "Revisiting SWE-Bench: On the Importance of Data Quality for LLM-Based Code Models", 298 "relevance": "Aleithan et al. patch quality analysis motivating discussion of evaluation reliability." 299 } 300 ], 301 "engagement_factors": { 302 "practical_relevance": { 303 "score": 3, 304 "justification": "Directly informs AI practitioners on which LLMs, architectures, and product types are winning on the most-watched coding agent benchmark — immediately actionable." 305 }, 306 "surprise_contrarian": { 307 "score": 2, 308 "justification": "Challenges the assumption that complex multi-agent architectures are superior — single-agent emergent systems (G6) are the largest and competitive group — and shows individual developers can match major tech companies." 309 }, 310 "fear_safety": { 311 "score": 0, 312 "justification": "No AI risk or safety concerns raised; focus is on benchmark performance and submitter characteristics." 313 }, 314 "drama_conflict": { 315 "score": 2, 316 "justification": "Section 4.3 documents a real public dispute between Cognition (anti-multi-agent post) and Anthropic (pro-multi-agent post one day later), and highlights academia vs. industry evaluation standard misalignment." 317 }, 318 "demo_ability": { 319 "score": 1, 320 "justification": "SWE-Bench leaderboard is public and readers can explore submissions, but the paper itself is analytical without a demo-able artifact." 321 }, 322 "brand_recognition": { 323 "score": 2, 324 "justification": "Analyzes submissions from Anthropic, Google, Amazon, IBM, ByteDance, Meta, and Princeton, providing high name-recognition density even though the authors are from UPC Barcelona." 325 } 326 }, 327 "hn_data": { 328 "threads": [ 329 { 330 "hn_id": "44489690", 331 "title": "Mercury: Ultra-fast language models based on diffusion", 332 "points": 576, 333 "comments": 242, 334 "url": "https://news.ycombinator.com/item?id=44489690", 335 "created_at": "2025-07-07T12:31:08Z" 336 }, 337 { 338 "hn_id": "44412427", 339 "title": "Mercury: Ultra-Fast Language Models Based on Diffusion", 340 "points": 10, 341 "comments": 2, 342 "url": "https://news.ycombinator.com/item?id=44412427", 343 "created_at": "2025-06-29T12:05:48Z" 344 }, 345 { 346 "hn_id": "44358841", 347 "title": "Machine Mental Imagery: Empower Multimodal Reasoning with Latent Visual Tokens", 348 "points": 7, 349 "comments": 0, 350 "url": "https://news.ycombinator.com/item?id=44358841", 351 "created_at": "2025-06-23T18:52:55Z" 352 }, 353 { 354 "hn_id": "44101770", 355 "title": "Effective Reinforcement Learning for Reasoning in Language Models", 356 "points": 4, 357 "comments": 0, 358 "url": "https://news.ycombinator.com/item?id=44101770", 359 "created_at": "2025-05-26T21:17:20Z" 360 }, 361 { 362 "hn_id": "44314613", 363 "title": "Wanting to Be Understood Explains the Meta-Problem of Consciousness", 364 "points": 3, 365 "comments": 0, 366 "url": "https://news.ycombinator.com/item?id=44314613", 367 "created_at": "2025-06-19T01:16:41Z" 368 }, 369 { 370 "hn_id": "44304578", 371 "title": "Serving Large Language Models on Huawei CloudMatrix384", 372 "points": 3, 373 "comments": 0, 374 "url": "https://news.ycombinator.com/item?id=44304578", 375 "created_at": "2025-06-17T22:18:43Z" 376 }, 377 { 378 "hn_id": "44009979", 379 "title": "A Search for Planet Nine with IRAS and Akari Data", 380 "points": 3, 381 "comments": 0, 382 "url": "https://news.ycombinator.com/item?id=44009979", 383 "created_at": "2025-05-16T21:35:58Z" 384 }, 385 { 386 "hn_id": "46445614", 387 "title": "Mechanical non-reciprocity programmed by shear jamming in soft composite solids", 388 "points": 2, 389 "comments": 0, 390 "url": "https://news.ycombinator.com/item?id=46445614", 391 "created_at": "2025-12-31T16:32:15Z" 392 }, 393 { 394 "hn_id": "44047429", 395 "title": "Model Merging in Pre-Training of Large Language Models", 396 "points": 2, 397 "comments": 0, 398 "url": "https://news.ycombinator.com/item?id=44047429", 399 "created_at": "2025-05-21T01:12:29Z" 400 }, 401 { 402 "hn_id": "42816449", 403 "title": "Dissecting the NVIDIA Hopper Architecture through Microbenchmarking", 404 "points": 2, 405 "comments": 0, 406 "url": "https://news.ycombinator.com/item?id=42816449", 407 "created_at": "2025-01-24T20:02:41Z" 408 } 409 ], 410 "top_points": 576, 411 "total_points": 612, 412 "total_comments": 244 413 } 414 }