scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21925B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["survey_methodology"],
      4   "paper": {
      5     "title": "What's in a Benchmark? The Case of SWE-Bench in Automated Program Repair",
      6     "authors": ["Matias Martinez", "Xavier Franch"],
      7     "year": 2026,
      8     "venue": "ICSE-SEIP 2026",
      9     "arxiv_id": "2602.04449",
     10     "doi": "10.1145/3786583.3786904"
     11   },
     12   "methodology_tags": ["observational", "meta-analysis"],
     13   "key_findings": "Analysis of 212 entries across SWE-Bench Lite and Verified leaderboards shows industry dominates submissions (58-79% depending on how collaborations are counted), with small companies and large publicly traded firms leading. Proprietary LLMs, especially the Claude family, achieve the highest precision, with Claude 4 Sonnet powering all entries above 70% on Verified. Open-source solutions remain competitive but trail in median precision. The paper identifies critical issues including patch overfitting, data contamination, limited real-world representation (Python-only), and a transparency-participation trade-off in metadata requirements.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Appendix data is hosted at https://github.com/UPCBarcelonaTech/DissectingSWEBench (reference [3])."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The complete list of approaches and submissions is provided in the appendix repository [3], and the paper references metadata collected from leaderboard entries."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specification provided. The study is primarily manual analysis, but any analysis scripts would benefit from environment documentation."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the process but does not provide a reproducible script or checklist for replicating the data collection."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No confidence intervals or error bars are reported. Tables show medians and maxima but no uncertainty measures."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Kruskal-Wallis test used for comparing precision across submitter types (H=19.89, p=0.0469 for Lite; H=38.0953, p=0.0001 for Verified). Dunn's post-hoc test applied for Verified with specific p-values reported."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Only H statistics and p-values are reported from statistical tests. No formal effect size measures (e.g., eta-squared, rank-biserial correlation) are provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The sample is the full population of leaderboard entries (79 Lite, 133 Verified), but there is no discussion of whether this is sufficient for the statistical tests performed, nor any power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only medians and maxima are reported. No standard deviations, IQR, or other spread measures are provided for the precision distributions."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "This is described as 'the first comprehensive study' of SWE-Bench leaderboards. No comparison with prior survey results or alternative analysis approaches is provided."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines are included, so contemporaneity cannot be assessed."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is an observational study with no system components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple dimensions analyzed: % Resolved, number of entries, submitter types, product availability modes, open-source share, LLM usage patterns."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation of system outputs is not relevant to this leaderboard analysis study."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not applicable to this observational study; there is no prediction task requiring train/test separation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Extensive breakdowns provided: by submitter type (Table 1), product availability and form (Table 2), open vs closed source (Table 3), and LLM combinations (Table 4)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses entries that could not be fully characterized ('Unknown' categories), country analysis that was discarded due to unreliable data, and limitations of the coding approach."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Reports that open-source LLMs yield lower precision, that academic median precision is lower than industry, and that some planned analyses (country-level) had to be abandoned."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about industry dominance, Claude family dominance, and open-source competitiveness are all supported by Tables 1-4 and the detailed analysis in Section 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper avoids strong causal language, using hedged phrasing like 'This difference may be attributed to the construction of Verified' and 'may be influenced by the early submissions.' The Kruskal-Wallis tests support comparative claims appropriately."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5 (External Validity): 'Other benchmarks may be equally representative of the broader issue-fixing landscape, but we do not claim that our findings can be applied to them.' Scope explicitly limited to two SWE-Bench leaderboards."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Multiple alternative explanations discussed: early academic submissions influencing lower median precision, construction of Verified filtering affecting precision differences, selective reporting inflating apparent saturation, cost barriers explaining open-source underperformance."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper extensively discusses that % Resolved (test-passing) is a proxy for correctness, with Section 4 'Patch Overfitting on SWE-Bench' explaining that overfitting patches pass tests but remain incorrect, and 'Limited Real-World Representation' noting SWE-bench may not generalize."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI models are used in the study methodology itself."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used in this study."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No models or hyperparameters are used in this observational study."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 2.2 describes the data collection pipeline in detail: visiting leaderboard pages, following links, Google searches with specific query format, LinkedIn inspection, and metadata extraction from README.md and metadata.yaml files."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 'Threats to Validity' provides substantive discussion across external, internal, and construct validity."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: risk of missing artifacts for some entries, country analysis discarded due to unreliable data, LLM parameter counts not analyzed, cost comparisons excluded due to declining token prices making cross-temporal comparison misleading."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Explicitly states: excluded Full and Multimodal leaderboards with reasons given, does not claim generalization to other benchmarks, did not analyze LLM parameter counts or monetary costs."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Raw data provided in appendix repository at https://github.com/UPCBarcelonaTech/DissectingSWEBench (reference [3]), including complete lists of approaches and submissions."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2.2 describes the full data collection procedure including visiting leaderboard pages, following links, Google searches, LinkedIn profiles, and metadata files."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; data source is public leaderboard entries."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Sections 2.2-2.3 document the pipeline from visiting leaderboards to content analysis coding. Steps (a)-(f) are enumerated for each entry's data collection. Section 2.3 describes the coding methodology, including deductive and inductive approaches."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section discloses: 'Ramon y Cajal' Fellowship (RYC2021-031523-I), grant PID2024-156019OB-I00 funded by MICIU/AEI and ERDF."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Authors affiliated with Universitat Politècnica de Catalunya, clearly stated. No conflict with SWE-bench or any evaluated product."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funded by Spanish government research grants with no financial stake in SWE-bench outcomes or any of the evaluated products/companies."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This study does not evaluate a pre-trained model on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This study does not evaluate a pre-trained model on any benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This study does not evaluate a pre-trained model on any benchmark. However, the paper does discuss contamination as a meta-concern for SWE-bench submissions in Section 4."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a survey/observational study with no computational method whose cost needs reporting."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a survey/observational study; no significant compute was required."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No PRISMA flow diagram or formal review protocol is referenced. The methodology uses content analysis with inductive/deductive coding (Section 2.3) but does not follow an established systematic review protocol."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The study catalogs and classifies leaderboard entries but does not assess the methodological quality of the underlying submissions or their associated papers. All entries are treated equally regardless of rigor."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 4 'Saturation and the Cherry-Picking of Submissions' explicitly discusses selective reporting: 'submissions may test several LLMs but only the best outcome is submitted, hiding result variability.' Also discusses how leaderboard participation is biased toward well-resourced actors."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "The majority of SWE-Bench submissions originate from industry, representing 58% of submitters (79% including industry-academia collaborations).",
    313       "evidence": "Table 1 shows 45 distinct industry submitters across both leaderboards, with 106 total industry entries. Section 3.2 provides detailed breakdown.",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "Proprietary LLMs, especially Claude 4 Sonnet, achieve the highest precision on both leaderboards; every entry that exceeds 70% on Verified uses Claude 4 models.",
    318       "evidence": "Table 4 shows Claude 4 Sonnet achieving max 76.8% on Verified and 60.33% on Lite. All top-10 entries on Verified use Claude 4 family models.",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "Small companies have been consistently driving improvements in state-of-the-art precision on both leaderboards.",
    323       "evidence": "Figure 2 and Section 3.2 show small companies (e.g., Isoform, Refact.ai) repeatedly pushing top results. Table 1 shows 24 distinct small company submitters with 52 entries.",
    324       "supported": "strong"
    325     },
    326     {
    327       "claim": "There is a statistically significant difference in precision between Single Academia and Small Companies on Lite (H=19.89, p=0.0469).",
    328       "evidence": "Kruskal-Wallis test reported in Section 3.2. However, the authors note this may be influenced by early low-precision academic submissions.",
    329       "supported": "moderate"
    330     },
    331     {
    332       "claim": "Open-source solutions remain competitive with closed-source, with the top performer on Lite being open source and several open-source approaches achieving near-SOTA on Verified.",
    333       "evidence": "Table 3 shows open-source max of 60.33% on Lite (highest overall) and 75.2% on Verified. However, median precision favors closed-source.",
    334       "supported": "strong"
    335     },
    336     {
    337       "claim": "Activity on SWE-Bench Lite has significantly dropped in 2025, with the most recent entry dated June 27, 2025.",
    338       "evidence": "Figure 1a and Section 3.1 document this trend, showing submissions concentrating on Verified.",
    339       "supported": "strong"
    340     }
    341   ],
    342   "red_flags": [
    343     {
    344       "flag": "No inter-rater reliability for coding",
    345       "detail": "The paper performs content-analysis coding (submitter type, product category, availability) but does not state whether multiple coders were involved or whether inter-rater reliability was assessed. For an inductive coding approach, this is a significant omission."
    346     },
    347     {
    348       "flag": "Snapshot-dependent results",
    349       "detail": "Data were collected as of September 18, 2025. Leaderboard entries and their metadata can change, be updated, or be removed. The findings are inherently time-bound but are presented as general characterizations."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "SWE-bench: Can language models resolve real-world github issues?",
    355       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"],
    356       "year": 2024,
    357       "relevance": "The foundational SWE-bench benchmark paper, central to the study's scope."
    358     },
    359     {
    360       "title": "Agentless: Demystifying llm-based software engineering agents",
    361       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    362       "year": 2024,
    363       "relevance": "Key open-source APR approach evaluated on SWE-bench, representing the non-agentic alternative."
    364     },
    365     {
    366       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    367       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    368       "year": 2024,
    369       "relevance": "Influential agentic APR system with multiple leaderboard entries using different LLMs."
    370     },
    371     {
    372       "title": "OpenHands: An open platform for ai software developers as generalist agents",
    373       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    374       "year": 2024,
    375       "relevance": "Open-source agentic development platform evaluated on SWE-bench."
    376     },
    377     {
    378       "title": "Alibaba lingmaagent: Improving automated issue resolution via comprehensive repository exploration",
    379       "authors": ["Yingwei Ma", "Qingping Yang", "Rongyu Cao", "Binhua Li", "Fei Huang", "Yongbin Li"],
    380       "year": 2025,
    381       "relevance": "Industry-backed open-source agent achieving competitive SWE-bench results."
    382     },
    383     {
    384       "title": "The swe-bench illusion: When state-of-the-art llms remember instead of reason",
    385       "authors": ["Shanchao Liang", "Spandan Garg", "Roshanak Zilouchian Moghaddam"],
    386       "year": 2025,
    387       "relevance": "Investigates data contamination in SWE-bench, finding models may memorize rather than reason."
    388     },
    389     {
    390       "title": "Are 'solved issues' in swe-bench really solved correctly? an empirical study",
    391       "authors": ["You Wang", "Michael Pradel", "Zhongxin Liu"],
    392       "year": 2025,
    393       "relevance": "Studies patch overfitting on SWE-bench, finding 7.8% of plausible patches are incorrect."
    394     },
    395     {
    396       "title": "Revisiting SWE-Bench: On the Importance of Data Quality for LLM-Based Code Models",
    397       "authors": ["Reem Aleithan"],
    398       "year": 2025,
    399       "relevance": "Manual analysis of SWE-bench patches finding 32.67% had complete solutions leaked in issue descriptions."
    400     },
    401     {
    402       "title": "Repairagent: An autonomous, llm-based agent for program repair",
    403       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    404       "year": 2024,
    405       "relevance": "LLM-based autonomous program repair agent; its original evaluation targets Defects4J rather than SWE-bench."
    406     },
    407     {
    408       "title": "Trae agent: An llm-based agent for software engineering with test-time scaling",
    409       "authors": ["Trae Research Team"],
    410       "year": 2025,
    411       "relevance": "ByteDance's agent achieving 75.2% on SWE-bench Verified using multiple Claude models."
    412     },
    413     {
    414       "title": "RepairBench: Leaderboard of frontier models for program repair",
    415       "authors": ["André Silva", "Martin Monperrus"],
    416       "year": 2025,
    417       "relevance": "Alternative benchmark/leaderboard for program repair evaluation."
    418     },
    419     {
    420       "title": "Swe-bench goes live!",
    421       "authors": ["Linghao Zhang", "Shilin He", "Chaoyun Zhang"],
    422       "year": 2025,
    423       "relevance": "SWE-bench-Live: automated pipeline for continuously updating the benchmark with recent data to address contamination."
    424     }
    425   ]
    426 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs