ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19928B)


      1 {
      2   "paper": {
      3     "title": "Promises, Perils, and (Timely) Heuristics for Mining Coding Agent Activity",
      4     "authors": ["Romain Robbes", "Théo Matricon", "Thomas Degueule", "Andre Hora", "Stefano Zacchiroli"],
      5     "year": 2026,
      6     "venue": "MSR '26",
      7     "arxiv_id": "2601.18345",
      8     "doi": "10.48550/arXiv.2601.18345"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper references a community-maintained heuristics repository [3] containing heuristics, sample datasets, and Python scripts for data collection. Section 6 describes the repository contents."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Section 6 states the repository contains 'A list of repositories (ca. 10,000) featuring agent adoption as of October 2025' and sample datasets of traces."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specification, requirements, or dependency information is provided for the Python scripts mentioned in the repository."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. The repository is described at a high level but no instructions for replicating the heuristic evaluation or counts are given."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a qualitative/position paper presenting heuristics and a framework. The counts in Table 1 are simple GitHub search result counts, not statistical estimates requiring confidence intervals."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No comparative statistical claims are made. The paper presents heuristics and approximate counts, not statistical comparisons."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No experimental comparisons are made that would require effect sizes."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No experimental samples are drawn. The counts are from GitHub search queries, not sampled data."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No repeated experimental runs. The paper presents one-time GitHub search counts."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "This is not a system or method paper. It proposes a set of heuristics and discusses promises/perils — there is nothing to compare against baselines."
     67       },
     68       "baselines_contemporary": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "No baselines are applicable to this type of paper."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No system with components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No system evaluation is performed."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The heuristics were derived from 'extensive manual investigation of known agents, involving checking their documentation for mentions of specific artifacts, and analysis of repositories identified as using agents for visible traces' (Section 4), with manual validation checks."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No prediction task or evaluation on a test set."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Table 1 provides per-agent and per-heuristic-type breakdowns (files, commits, branches, labels) for over 30 agents."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper extensively discusses failure cases of the heuristics: false positives (e.g., CONVENTIONS.md), partial observability (Peril 1), and cases where heuristics miss agent activity (more than 40% of projects with agent markers have no commit-level traces)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Multiple perils are discussed as negative findings: partial observability, agent multiplicity making exhaustive detection difficult, rapid obsolescence of heuristics, and AI coding slop."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims the paper documents promises, perils, and heuristics for studying coding agent activity on GitHub. Sections 4-6 deliver on all three."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper makes no causal claims. It describes heuristics, presents adoption estimates, and discusses research opportunities."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper explicitly bounds its scope to GitHub (Section 4: 'We focus on GitHub, the most common coding platform') and acknowledges limitations in Section 7."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The perils section extensively discusses alternative interpretations: partial observability means adoption estimates are lower bounds, agent diversity means heuristics may miss activity, and false positives require careful filtering."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly distinguishes between detectable traces (what they measure) and actual agent adoption/usage (what they claim). Promise 2 states 'between 15 to 19% of GitHub projects show traces of coding agents,' and Peril 1 discusses how traces are only a partial view."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No LLMs are used as part of this paper's methodology."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No prompting is used in this paper's methodology."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No model-based experiments are conducted."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used by this paper."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper describes a general process for deriving heuristics ('extensive manual investigation... targeted GitHub searches and manual checks') but does not document the specific filtering criteria, how many candidate heuristics were considered and rejected, or the validation methodology in reproducible detail."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 7 (Discussion) contains a dedicated 'Limitations' subsection discussing the paper's scope and shortcomings."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The Limitations subsection discusses specific issues: 'heuristics are noisy,' the excluded CONVENTIONS.md heuristic for Aider due to false positives, the need for date-range filtering, and the peril of velocity, which means that parts of the work will need to be updated."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 7 explicitly states: 'This paper is focused on the promises and perils of mining coding agent usage, rather than the promises and perils of coding agent themselves. Issues such as their environmental impact, impact on the workforce, or intellectual property... are thus not covered.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The heuristic repository contains sample datasets and a list of ~10,000 repositories with agent adoption traces, allowing independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4 describes the data collection: 'extensive manual investigation of known agents, involving checking their documentation... and analysis of repositories identified as using agents.' Table 1 provides GitHub query-based counts dated 20 October 2025 (20/10/25)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data sources are public GitHub repositories and agent documentation, which are standard public data."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The process from initial agent identification to final heuristic list is described qualitatively but not as a documented pipeline with counts at each stage. The paper states heuristics were validated with 'targeted GitHub searches and manual checks' but does not detail how many were filtered at each step."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All author affiliations are clearly listed: universities in France (Bordeaux, Rennes, Télécom Paris) and Brazil (UFMG). No apparent industry affiliations with coding agent companies."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information is disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a mining study of coding agent traces."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No model evaluation on benchmarks is performed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No model evaluation on benchmarks is performed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No LLM inference or computational method is used. This is a qualitative/mining study."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No significant compute is required for this study (GitHub searches and manual investigation)."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Between 15 and 20% of GitHub projects have adopted coding agents to some extent.",
    294       "evidence": "Promise 2 / Section 5.1 cites their companion study [6] of coding agent adoption on GitHub, with estimates of 15-19% as of mid-October 2025.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Claude Code, Codex, Cursor, and Copilot capture more than 80% of coding agent adoption.",
    299       "evidence": "Section 5.2, Mitigation for Perils 2 and 3, supported by Table 1 counts showing these four tools have the highest match counts by large margins.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "More than 40% of projects with markers of coding agent usage do not have commit-level markers.",
    304       "evidence": "Peril 1 (Section 5.2) cites their companion study [6], stating this makes it difficult to ascertain the degree of agent adoption in those projects.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "20% of projects with agent guidance or configuration files exclude all of them from commits via .gitignore.",
    309       "evidence": "Peril 1 (Section 5.2) cites their companion study [6].",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "Coding agents leave many visible traces in software repositories across files, commits, issues, and pull requests.",
    314       "evidence": "Table 1 provides approximate GitHub match counts for 35+ agents across multiple trace categories, with total counts in the millions for the most popular agents.",
    315       "supported": "strong"
    316     }
    317   ],
    318   "methodology_tags": ["observational", "qualitative"],
    319   "key_findings": "This paper catalogs heuristics for detecting coding agent activity on GitHub across files, commits, PRs, issues, and user accounts. It identifies 8 promises and perils of mining such traces, including partial observability, agent multiplicity and diversity, high velocity of change, and AI coding slop. The authors find 15-20% of GitHub projects show agent adoption traces as of October 2025, with Claude Code, Codex, Cursor, and Copilot capturing 80%+ of adoption. They release a community-maintained heuristic repository with ~10,000 labeled repositories.",
    320   "red_flags": [
    321     {
    322       "flag": "Key quantitative claims cite unpublished companion study",
    323       "detail": "The most important empirical claims (15-20% adoption, 40% without commit markers, 20% gitignoring config) all cite [6] 'Anonymous authors. 2025. Empirical study of coding agent adoption on GitHub. Under submission.' These claims cannot be independently verified from this paper alone."
    324     },
    325     {
    326       "flag": "GitHub search counts are approximate and unvalidated",
    327       "detail": "Table 1 counts are from GitHub's web search interface, which the paper acknowledges 'is not precise, does not offer us information about the number of repositories' and 'may contain false positives.' No false positive rate estimation is provided."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    333       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    334       "year": 2025,
    335       "arxiv_id": "2507.09089",
    336       "relevance": "Controlled experiment finding Cursor actually increased task completion time by 19% despite developers believing it reduced time by 20%."
    337     },
    338     {
    339       "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories",
    340       "authors": ["Islem Bouzenia", "Michael Pradel"],
    341       "year": 2025,
    342       "arxiv_id": "2506.18824",
    343       "relevance": "Analysis of 120 agent interaction logs from SWE-bench, highlighting behavioral differences between agents."
    344     },
    345     {
    346       "title": "Why AI Agents Still Need You: Findings from Developer-Agent Collaborations in the Wild",
    347       "authors": ["Aayush Kumar", "Yasharth Bajpai", "Sumit Gulwani", "Gustavo Soares", "Emerson Murphy-Hill"],
    348       "year": 2025,
    349       "relevance": "Observational study of 19 developers using Cursor, finding that tacit knowledge is the main barrier to effective agent use."
    350     },
    351     {
    352       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    353       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    354       "year": 2023,
    355       "arxiv_id": "2302.06590",
    356       "relevance": "Major RCT on Copilot productivity (95 programmers, 55% faster completion), frequently cited in the field."
    357     },
    358     {
    359       "title": "Evaluating large language models trained on code",
    360       "authors": ["Mark Chen", "Jerry Tworek"],
    361       "year": 2021,
    362       "arxiv_id": "2107.03374",
    363       "relevance": "Codex paper, foundational to the coding assistant ecosystem studied in this paper."
    364     },
    365     {
    366       "title": "Unveiling ChatGPT's Usage in Open Source Projects: A Mining-based Study",
    367       "authors": ["Rosalia Tufano", "Antonio Mastropaolo"],
    368       "year": 2024,
    369       "arxiv_id": "2402.16480",
    370       "relevance": "MSR study of explicit ChatGPT usage on GitHub that developed a taxonomy of 45 SE tasks."
    371     },
    372     {
    373       "title": "Self-Admitted GenAI Usage in Open-Source Software",
    374       "authors": ["Tao Xiao", "Youmei Fan"],
    375       "year": 2025,
    376       "arxiv_id": "2507.10422",
    377       "relevance": "Qualitative study of 1,200+ GenAI usage instances in software artifacts, analyzing task types and usage guidelines."
    378     },
    379     {
    380       "title": "Measuring AI Ability to Complete Long Tasks",
    381       "authors": ["Thomas Kwa", "Ben West", "Joel Becker"],
    382       "year": 2025,
    383       "arxiv_id": "2503.14499",
    384       "relevance": "METR study estimating autonomous task duration doubles every 7 months, directly relevant to agent capability trajectory."
    385     },
    386     {
    387       "title": "An Empirical Study on Automatically Detecting AI-Generated Source Code: How Far are We?",
    388       "authors": ["Hyunjae Suh", "Mahan Tafreshipour"],
    389       "year": 2025,
    390       "relevance": "Studies detection of AI-generated code, relevant to the partial observability peril discussed in this paper."
    391     },
    392     {
    393       "title": "Vibe Coding in Practice: Motivations, Challenges, and a Future Outlook",
    394       "authors": ["Ahmed Fawzy", "Amjed Tahir", "Kelly Blincoe"],
    395       "year": 2025,
    396       "arxiv_id": "2510.00328",
    397       "relevance": "Grey literature review of vibe coding practice, relevant to understanding developer-agent interaction patterns."
    398     },
    399     {
    400       "title": "AI Copilot Code Quality: Evaluating 2024's Increased Defect Rate via Code Quality Metrics",
    401       "authors": ["William Harding"],
    402       "year": 2025,
    403       "relevance": "GitClear whitepaper on code churn and quality decline after Copilot adoption, a key study this paper contextualizes."
    404     }
    405   ]
    406 }

Impressum · Datenschutz