scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28009B)
      1 {
      2   "paper": {
      3     "title": "Agentic Much? Adoption of Coding Agents on GitHub",
      4     "authors": [
      5       "Romain Robbes",
      6       "Théo Matricon",
      7       "Thomas Degueule",
      8       "Andre Hora",
      9       "Stefano Zacchiroli"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv preprint (submitted to ACM journal)",
     13     "arxiv_id": "2601.18341"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The paper states in footnote 22 'we will share all the datasets and analyses that we made to facilitate this' (future tense). No working URL or archive is provided at time of publication."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper promises to share datasets (footnote 22) but provides no download link or archive. The dataset of 129,134 GitHub projects and derived metrics is not yet released."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No requirements.txt, Dockerfile, or dependency specification is provided. The paper describes the pipeline conceptually but does not list library versions or environment details needed to reproduce."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step instructions for reproducing the analysis are provided. The methodology section describes the pipeline at a high level but is not a reproducible recipe."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper explicitly computes a 99% confidence interval with 1% margin of error for the commit-level sampling (Section 5.2): 'we select a large sample of non-adopters at the file level for this analysis, in order to have a 99% confidence interval and a small 1% margin of error.' Confidence intervals are also used in the qualitative commit sample (95%/96% CI with Bonferroni correction, Section 10.1)."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Chi-square goodness-of-fit tests with p-values are applied to assess whether adoption distributions are uniform across deciles in Sections 6.2 and 6.3. The paper reports 'p-value < 0.01' for all metrics."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Effect sizes are reported alongside Chi-square tests in Sections 6.2 and 6.3: 'The effect sizes are small for lines of code, contributors, commits, and issues, and medium for pull requests and age.' Effect sizes are also given contextually for commit size comparisons (e.g., median AI-assisted lines added is 34, triple the human median of 10)."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Sample sizes are justified using Cochran's formula (Section 4.2.5) for the commit-level sample (n=16,000) and again using Cochran's formula for the qualitative commit sample (n=790, Section 10.1), with explicit finite population corrections and confidence level specifications."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Distributions of commit sizes are reported with quartiles (Q1, median, Q3) in Figure 11, and Figure 12 shows the relative differences across size categories. Distribution plots (boxplots/ridgeline) and sparklines convey spread throughout the analysis."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares AI-assisted commits against human-authored commits and bot commits as natural baselines in RQ5 (Section 9). For commit type classification in RQ6, results are compared against Zeng et al.'s human-authored baseline."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Baselines are contemporary: the comparison study by Zeng et al. [43] is from 2025, and the human and bot commit comparisons are drawn from the same 2025 time period dataset."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "This is an observational mining study with no system components to ablate. The paper does evaluate different detection heuristics (file-based, commit-based, gitignore-based) as complementary measurement strategies, not as ablation."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Multiple metrics are used throughout: file-level adoption rate, commit-level adoption rate, adoption ratio by project characteristic deciles, commit size (lines added, lines deleted, files changed), commit type distribution, and temporal evolution."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Manual human classification is performed in Section 10.2 for commits that do not follow the Conventional Commits specification: 'two authors were tasked to manually and individually classify the remaining 277 commits into one of the seven categories by inspecting their commit message and diff, with an initial agreement on 250 of the 277 commits (90%).' Inter-rater reliability is reported."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "This is an observational study of real-world repositories, not a machine learning evaluation. There is no train/test split concept applicable here."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Per-category breakdowns are extensive: adoption is broken down by project metrics (LOC, age, contributors, commits, issues, PRs) in Tables 3 and 4; by organization in Table 5; by topic in Figures 4 and 5; by programming language in Figure 6; and by tool in Figure 8."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses cases where heuristics fail (e.g., CONVENTIONS.md excluded due to false positive risk, AGENTS.md ambiguity), under-detection of certain agents (e.g., Codex not signing commits, developers disabling traces), and Section 11 extensively discusses reasons for over- and under-estimation."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 11.2 explicitly reports reasons why the study may over-estimate adoption. The finding that 41.2% of file-level adopters have zero AI-assisted commits is reported and discussed. The low correlation (r=0.1 at best) between file-level and commit-level metrics is presented as a key negative/unexpected finding."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims of 15.85%-22.60% adoption, rapid adoption, breadth across project types, and larger AI commits are all supported by results in Sections 5-10. The paper appropriately hedges its estimates as conservative and high bounds."
    117       },
    118       "causal_claims_justified": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "The paper is explicitly observational and makes no causal claims. It consistently uses language like 'adoption is associated with,' 'we find that,' and explicitly states it does not try to answer productivity or causal questions (Section 9, 'How this affects productivity is an open question, which we do not try to answer')."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Generalizations are appropriately bounded. The paper explicitly restricts claims to GitHub public repositories with at least 10 stars, 5,000 LOC, and 100 commits; acknowledges results may not translate to industry (Section 11.1); and qualifies that metrics are undercounts. Section 11.4 clearly delineates what is known vs. hypothesized."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 11 extensively discusses alternative explanations. For example: the high PR metric may be 'self-fulfilling' since agents contribute PRs (Section 6.2); the low commit ratios for AI/LLM topics might reflect developer awareness and deliberate hiding of traces rather than lower use (Section 11.4); and reasons for both over- and under-estimation are systematically enumerated."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "This is a mining study, not an LLM evaluation study. The authors study coding agents used by other developers in repositories — they do not run LLM inference themselves. No model version specification is needed."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "The authors do not use LLM prompting in their analysis pipeline. The paper mines repository artifacts. No prompting is involved in the study methodology."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No LLM hyperparameters are involved. The study uses GitHub APIs, git log, and heuristic matching. Statistical parameters (confidence levels, margin of error) are reported in Sections 4.2.5 and 10.1."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "The paper does not use agentic scaffolding in its analysis. It is a mining study using GitHub APIs."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The data preprocessing pipeline is described in detail in Section 4.2: starting with 130,621 projects, filtering to 129,134 after pipeline failures and dotfile exclusions, with explicit counts and reasons at each step. Filtering criteria are stated (not forks, ≥5,000 LOC, ≥100 commits, active in last 3 months)."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 11 'Discussion' contains extensive dedicated subsections on limitations: 11.1 'Limitations', 11.2 'Reasons for over-estimating coding agent use', and 11.3 'Reasons for under-estimating coding agent use', spanning several pages."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Threats are specific to this study: the Peril of Partial Observability (projects with file markers but no commit markers), the specific bias introduced by squash merges, the impossibility of attributing AGENTS.md to Codex with certainty, and the specific finding that AI/LLM topic projects likely hide their traces deliberately. Not generic disclaimers."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Scope boundaries are explicitly stated throughout: results apply only to open-source GitHub repositories meeting specific minimum criteria (10 stars, 5,000 LOC, 100 commits, active); the paper explicitly states results 'may not translate well to more general use, and to industry in particular' (Section 11.1); estimates are explicitly labeled as conservative vs. high bounds."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No raw data is available at time of publication. The paper promises to share datasets (footnote 22) but provides no download link, DOI, or archive."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Data collection is described in detail in Section 4.2: the sampling tool by Dabic et al. [10] is used, with explicit criteria, accessed on 29/08/2025. The GitHub REST and GraphQL APIs are described, and the analysis was run on October 31st, 2025."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "This is a mining study with no human participants. The 'sample' is GitHub repositories satisfying objective criteria, not recruited participants."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The full data pipeline is documented in Section 4.2 with explicit step numbering and counts: from 130,621 initial projects to 129,134 analyzed (1,487 removed), with roughly 900 pipeline failures and roughly 500 dotfile exclusions explained. Commit extraction steps are numbered 1-6 in Section 4.2.5."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding or acknowledgments section is present in the paper text. No funding sources are disclosed."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly disclosed on the title page: Univ. Bordeaux, Univ. Rennes/Inria, UFMG (Brazil), and Institut Polytechnique de Paris. None are affiliated with the coding agent companies whose tools are evaluated."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "The authors are researchers at CNRS, Inria, UFMG, and Institut Polytechnique de Paris — institutions that routinely fund research through government grants. The absence of a funding disclosure does not mean the work is unfunded; it means we cannot verify funder independence. Since the paper should have disclosed funding and did not, the criterion applies but is not satisfied."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement is present. Absence of disclosure is not evidence of absence of conflict."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This is a repository mining study that does not evaluate a pre-trained model's capability on any benchmark. No LLM is evaluated for knowledge or capability; the study observes adoption traces left by coding agents."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Not applicable — this is a mining study, not a benchmark evaluation of an LLM."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "Not applicable — the paper does not use a benchmark to evaluate LLM knowledge or capabilities."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "This is a repository mining study. Mining public GitHub repositories is not a human subjects study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Mining public GitHub repository data does not require IRB approval. No human subjects are involved."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The repository selection criteria are clearly stated but these are not participant inclusion/exclusion criteria."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. The random sampling of repositories uses stratified statistical principles (Cochran's formula), which is documented in Section 4.2.5."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants. The two-author manual classification in Section 10.2 did not use blinding, but this is a qualitative coding exercise, not a human subjects experiment."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The authors do not report the cost of the API calls to GitHub APIs or the computational cost of the pipeline. Section 4.2.4 mentions that 'the whole analysis still requires more than half a terabyte in total' of storage but does not report time or API cost."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget is stated. Storage requirements (>0.5 TB) are mentioned but not time or hardware specifications."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "Between 15.85% and 22.60% of a large sample of GitHub projects (129,134 projects) show traces of coding agent adoption as of October 2025.",
    292       "evidence": "Section 5 (RQ1) and Table 1 present the full estimation: 7.89% file-level detection + commit-level detection extrapolated to 8.64% of the non-file adopters (about 7.96% of all projects) = 15.85% conservative estimate; 22.60% high estimate. Statistics computed with 99% CI and 1% margin of error.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "AI-assisted commits are substantially larger than human-authored commits: the median added lines for AI-assisted commits is 34, triple the human median of 10.",
    297       "evidence": "Section 9 (RQ5) and Figure 11 show distribution of commit sizes by author type (human n=3.1M, AI n=171K, bot n=285K). The median comparison (34 vs 10 vs 3) is clearly stated with quartiles.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Coding agent adoption is broad across project types, organizations, and programming languages, with very few categories having near-zero adoption.",
    302       "evidence": "Section 7 (RQ3) analyzes adoption by topic (Figure 4), organization (Table 5), and programming language (Figure 6). Most topics and languages show adoption within ±10% of each other.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Feature implementation (feat) and bug fixes (fix) account for two-thirds (65.6%) of AI-assisted commits, compared to ~44% for human-authored commits.",
    307       "evidence": "Table 7 (Section 10.3) shows feat=35.7% and fix=29.9% in 790 sampled Claude Code commits. Comparison to Zeng et al.'s human-authored baseline (feat=17%, fix=27%) is made directly.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Younger projects (≤1 year old) show dramatically higher file-level coding agent adoption (21.17%) compared to the oldest projects (>11 years, 4.69%).",
    312       "evidence": "Table 3 (Section 6.2) shows adoption by age decile. Chi-square test p<0.01 with medium effect size for age.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Claude Code and GitHub Copilot together account for more than half of all detected coding agent adoption, with Claude Code being the single most popular tool (4,896 projects).",
    317       "evidence": "Section 8.2 and Figure 8 present per-tool adoption counts. The paper states 'Just Claude and Copilot are responsible for more than half of the adoption.'",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "observational"
    323   ],
    324   "key_findings": "This large-scale mining study of 129,134 GitHub projects finds that between 15.85% and 22.60% of projects show traces of coding agent adoption as of October 2025 — a remarkably rapid adoption for a technology category that only launched in 2024. Adoption is broad across project types, sizes, organizations, and programming languages, though with a strong bias toward younger projects. AI-assisted commits are substantially larger than human-authored ones (median 34 lines added vs. 10) and skew heavily toward feature implementation and bug fixes rather than maintenance. The study likely undercounts true adoption because agents can be configured to hide their traces.",
    325   "red_flags": [
    326     {
    327       "flag": "Heuristic-based detection with potential false positives/negatives",
    328       "detail": "The paper's detection relies on file-based, commit-based, branch-based, and label-based heuristics. By the authors' own admission, these may produce false positives (e.g., projects that once had agent files but abandoned agents) and false negatives (e.g., agents configured not to leave traces). The conservative and high estimates bracket what these heuristics can detect, but the true adoption rate remains unknown."
    329     },
    330     {
    331       "flag": "Data not yet released",
    332       "detail": "The paper promises to share datasets and analysis code (footnote 22) but provides no actual download link or archive. The study cannot currently be independently replicated or verified."
    333     },
    334     {
    335       "flag": "RQ6 qualitative analysis limited to Claude Code commits only",
    336       "detail": "The commit type analysis in Section 10 is based solely on a sample of 790 Claude Code commits, chosen because Claude Code is the most popular agent. This sampling choice may not be representative of commit types for other agents (e.g., Copilot, Codex, Cursor), which may be used for different tasks."
    337     },
    338     {
    339       "flag": "No funding disclosure",
    340       "detail": "No funding source or acknowledgments section is present in the paper. While author affiliations suggest no commercial conflicts, the absence of disclosure means this cannot be verified."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    346       "authors": [
    347         "Joel Becker",
    348         "Nate Rush",
    349         "Elizabeth Barnes",
    350         "David Rein"
    351       ],
    352       "year": 2025,
    353       "arxiv_id": "2507.09089",
    354       "relevance": "RCT measuring coding agent (Cursor) impact on developer productivity — directly relevant to the survey scope on agentic AI evaluation."
    355     },
    356     {
    357       "title": "Sharp Tools: How Developers Wield Agentic AI in Real Software Engineering Tasks",
    358       "authors": [
    359         "Aayush Kumar",
    360         "Yasharth Bajpai",
    361         "Sumit Gulwani",
    362         "Gustavo Soares",
    363         "Emerson Murphy-Hill"
    364       ],
    365       "year": 2025,
    366       "relevance": "Observational study of developers using agentic AI (Cursor) for real software engineering tasks, studying barriers and successes."
    367     },
    368     {
    369       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    370       "authors": [
    371         "Carlos E. Jimenez",
    372         "John Yang",
    373         "Alexander Wettig",
    374         "Shunyu Yao",
    375         "Kexin Pei",
    376         "Ofir Press",
    377         "Karthik R. Narasimhan"
    378       ],
    379       "year": 2024,
    380       "relevance": "Foundational benchmark for evaluating coding agent capability on real GitHub issues — widely used in the agentic AI evaluation literature."
    381     },
    382     {
    383       "title": "Speed at the Cost of Quality? The Impact of LLM Agent Assistance on Software Development",
    384       "authors": [
    385         "Hao He",
    386         "Courtney Miller",
    387         "Shyam Agarwal",
    388         "Christian Kästner",
    389         "Bogdan Vasilescu"
    390       ],
    391       "year": 2025,
    392       "arxiv_id": "2511.04427",
    393       "relevance": "Empirical study of the impact of LLM agent (Cursor) adoption on software quality using difference-in-differences design."
    394     },
    395     {
    396       "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories",
    397       "authors": [
    398         "Islem Bouzenia",
    399         "Michael Pradel"
    400       ],
    401       "year": 2025,
    402       "arxiv_id": "2506.18824",
    403       "relevance": "Studies the traces of coding agent interactions on SWE-bench, analyzing diversity of agent trajectories and failure modes."
    404     },
    405     {
    406       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    407       "authors": [
    408         "Sida Peng",
    409         "Eirini Kalliamvakou",
    410         "Peter Cihon",
    411         "Mert Demirer"
    412       ],
    413       "year": 2023,
    414       "arxiv_id": "2302.06590",
    415       "relevance": "Controlled experiment finding that developers with access to Copilot completed a programming task 55.8% faster than the control group — a key reference for evaluating coding assistant impact claims."
    416     },
    417     {
    418       "title": "AIDev: Studying AI Coding Agents on GitHub",
    419       "authors": [
    420         "Hao Li",
    421         "Haoxiang Zhang",
    422         "Ahmed E. Hassan"
    423       ],
    424       "year": 2025,
    425       "relevance": "Concurrent MSR study of coding agent pull requests on GitHub — directly comparable methodology to the paper being scanned."
    426     },
    427     {
    428       "title": "Context Engineering for AI Agents in Open-Source Software",
    429       "authors": [
    430         "Seyedmoein Mohsenimofidi",
    431         "Matthias Galster",
    432         "Christoph Treude",
    433         "Sebastian Baltes"
    434       ],
    435       "year": 2025,
    436       "arxiv_id": "2510.21413",
    437       "relevance": "Qualitative study of coding agent guidance file contents in open-source repositories — directly extends the analysis in this paper."
    438     },
    439     {
    440       "title": "Agentless: Demystifying LLM-based software engineering agents",
    441       "authors": [
    442         "Chunqiu Steven Xia",
    443         "Yinlin Deng",
    444         "Soren Dunn",
    445         "Lingming Zhang"
    446       ],
    447       "year": 2024,
    448       "arxiv_id": "2407.01489",
    449       "relevance": "Presents an agentic approach for SWE-bench, evaluating cost and performance tradeoffs of coding agents."
    450     },
    451     {
    452       "title": "Promises, Perils, and (Timely) Heuristics for Mining Coding Agent Activity",
    453       "authors": [
    454         "Romain Robbes",
    455         "Théo Matricon",
    456         "Thomas Degueule",
    457         "Andre Hora",
    458         "Stefano Zacchiroli"
    459       ],
    460       "year": 2025,
    461       "relevance": "Companion paper by the same authors describing the heuristics used for detecting coding agent activity in repositories — foundational for the current study's methodology."
    462     },
    463     {
    464       "title": "Measuring AI ability to complete long tasks",
    465       "authors": [
    466         "Thomas Kwa",
    467         "Ben West",
    468         "Joel Becker"
    469       ],
    470       "year": 2025,
    471       "arxiv_id": "2503.14499",
    472       "relevance": "Evaluates the ability of AI systems to complete progressively longer tasks, finding that the length of tasks AI systems can complete has been doubling roughly every 7 months — cited as evidence for increasing agent capability."
    473     },
    474     {
    475       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    476       "authors": [
    477         "Jenny T Liang",
    478         "Chenyang Yang",
    479         "Brad A Myers"
    480       ],
    481       "year": 2024,
    482       "relevance": "Survey of 410 developers on AI coding assistant use cases and barriers — relevant to understanding developer behavior with coding tools."
    483     },
    484     {
    485       "title": "Unveiling ChatGPT's Usage in Open Source Projects: A Mining-based Study",
    486       "authors": [
    487         "Rosalia Tufano",
    488         "Antonio Mastropaolo",
    489         "Federica Pepe",
    490         "Ozren Dabic",
    491         "Massimiliano Di Penta",
    492         "Gabriele Bavota"
    493       ],
    494       "year": 2024,
    495       "arxiv_id": "2402.16480",
    496       "relevance": "MSR study of ChatGPT usage traces in GitHub — predecessor methodology to the current study of coding agent adoption."
    497     }
    498   ]
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs