scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22802B)
      1 {
      2   "paper": {
      3     "title": "The Impact of Large Language Models on Open-Source Innovation: Evidence from GitHub Copilot",
      4     "authors": ["Doron Yeverechyahu", "Raveesh Mayya", "Gal Oestreicher-Singer"],
      5     "year": 2024,
      6     "venue": "International Conference on Information Systems",
      7     "arxiv_id": "2409.08379",
      8     "doi": "10.2139/ssrn.4684662"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["observational"],
     13   "key_findings": "Using a natural experiment from GitHub Copilot's selective language support (Python/Rust supported, R/Haskell not), the study finds LLMs significantly increase open-source contributions (37% for Python vs R, 54% for Rust vs Haskell). LLMs disproportionately boost iterative/maintenance innovation over capability/feature innovation. This gap widens in high-activity projects with richer context and after model upgrades, suggesting improving LLMs may further skew innovation toward refinement over novel feature development.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code archive is provided in the paper. The data collection and analysis code is not released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The dataset of 1.1 million commits is not released. No data download link is provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements files, or software version details are provided for the analysis pipeline."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology but does not provide scripts or commands to replicate the analysis."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Standard errors are reported in parentheses for all regression coefficients in Tables 3-7 and appendix tables. Figure 4 shows 99% confidence interval bars for the parallel trends analysis."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Statistical significance is reported throughout with *** p<0.001, ** p<0.01, * p<0.05 notation on all regression coefficients. The DiD framework provides formal hypothesis testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported as both absolute changes (e.g., 6.816 additional commits per quarter) and percentage changes relative to pre-treatment means (e.g., 37.05% increase from baseline of 18.395). Section 5.1 and throughout."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The sample sizes (1,187 matched Python/R packages, 1,373 matched Rust/Haskell packages) are described but not justified through power analysis or any formal justification for why these sizes are adequate."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Table A1 reports standard deviations for all variables. Jackknife variance estimation is used for SDiD models. Standard errors clustered by package are reported for all TWFE models."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The control groups (R packages for Python, Haskell packages for Rust) serve as baselines via propensity score matching and DiD framework."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Control languages (R, Haskell) are contemporary active programming ecosystems observed over the same time period (Oct 2019 - Dec 2022)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper conducts multiple sub-analyses separating effects by innovation type (function addition vs. not, code development vs. maintenance), project activity level, and model upgrade period, which function as ablations of the main effect."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple outcome variables are used: commit counts, version releases, function-adding vs non-function-adding commits, LLM-classified commit types (code development, maintenance, documentation, testing). Log-transformed DVs also used."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Three expert human annotators tagged 400 commit comments (200 Python, 200 R) to benchmark the LLM classification. Cohen's Kappa reported for inter-rater agreement (Section 3.3.2, Table B1)."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a natural experiment study with DiD estimation, not a prediction task. There is no train/test split concept applicable."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by language pair (Python/R, Rust/Haskell), innovation type (function addition, code development, maintenance), project activity level, and model upgrade period."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not discuss cases where the natural experiment might not hold, specific packages where effects were negative, or where the classification failed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 6 Panel B shows near-zero, non-significant coefficients for low-activity Rust/Haskell projects, indicating Copilot had virtually no detectable effect on low-activity projects."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract's claims about increased contributions, disproportionate effect on iterative vs capability innovation, and widening gap with model improvements are all supported by the regression results in Tables 3-7."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper uses a well-designed natural experiment with DiD identification strategy, propensity score matching, parallel trends verification (Figure 4), and synthetic DiD as robustness. The selective language support by Copilot provides exogenous variation. Multiple robustness checks (temporal variations, alternative DVs, log transformations) strengthen causal claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6.3 (Limitations) explicitly bounds generalizability: focuses on Oct 2021-Dec 2022 window, notes not all contributors used Copilot, acknowledges open-source dynamics may not translate to organizational settings, and notes Copilot's limited interactive capabilities at the time."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper addresses selection concerns via PSM, tests temporal variations (June vs October 2021 treatment), drops early quarters to avoid early-adopter bias, uses balanced panels, and discusses cognitive load as an alternative mechanism (Section 6.3)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper carefully distinguishes between commit counts as a proxy and innovation as the broader construct, using two complementary measurement approaches (function detection and LLM classification) and discussing what each captures. The framework explicitly defines capability vs iterative innovation and maps measurements to these constructs."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "For the LLM classification task, the paper specifies 'gpt-4o-2024-08-06' (Table B2) with exact temperature (0.0), top_p (1.0), and penalty parameters. Benchmarked models also specified with versions (Table B1)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The full prompt used for commit classification is provided in Appendix B, including the exact annotation guidelines, category definitions, and output format specification."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table B2 reports model parameters: temperature=0.0, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0 for the GPT-4o classification."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The LLM is used only for classification of commit messages, not as an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.2 documents data collection from GitHub API, exclusion of inactive packages (348 Python, 7,965 R, 209 Rust, 42 Haskell), propensity score matching procedure, and quarterly aggregation. Function detection methodology detailed in Appendix C."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.3 is titled 'Limitations' and provides substantive discussion of multiple limitations across three paragraphs."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6.3 discusses specific threats: the time window limitation (Oct 2021-Dec 2022), not all contributors using Copilot (findings are a lower bound), limited interactive capabilities of early Copilot, and caution about generalizing beyond open-source."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states findings reflect 'initial adoption patterns among early users rather than the full potential of current LLM capabilities,' notes the temporal boundary was necessary for clean control groups, and that open-source dynamics 'may not fully translate to other settings with formal task assignment.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The raw commit data, matched package lists, and classification outputs are not made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 describes data collection in detail: GitHub API for commits, specific package registries (PyPI, CRAN, Hackage, Crates) for version releases, top 2000 packages per language, time period Oct 2019-Dec 2022."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Package selection is described: top 2000 most popular packages by language (with specific sources cited), full CRAN list for R, with exclusion criteria for inactive packages. Human annotator recruitment described as 'expert human annotators' and 'human expert research assistant.'"
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: package identification → GitHub API commit download → inactive package exclusion → propensity score matching → quarterly aggregation → classification (function detection + LLM). Filtering counts provided (e.g., 348 Python packages excluded)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is disclosed in a footnote on page 1: EU Horizon 2020 program (#759540) for Oestreicher-Singer and Center for Global Economy and Business (CGEB) at NYU Stern for Mayya."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Tel Aviv University (Coller School of Management) and NYU (Stern School of Business). No affiliation with GitHub or OpenAI."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "EU Horizon 2020 and NYU CGEB are academic funding sources with no financial stake in whether GitHub Copilot affects innovation. Funders are independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This is an observational study analyzing the impact of Copilot on open-source commits, not evaluating a pre-trained model's capability on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not a benchmark evaluation study. The LLM (GPT-4o) is used as a classification tool, not evaluated on a benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not a benchmark evaluation study."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "This is a repository mining / natural experiment study analyzing public GitHub data, not a human subjects study. The human annotators were used for ground truth labeling, not as research participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "Mining public GitHub repositories is not a human subjects study requiring IRB approval."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study. Public repository data is analyzed."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. Package inclusion/exclusion is documented under data_integrity."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. Treatment assignment is the natural experiment (language support by Copilot)."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in an experimental study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Appendix B reports total token usage: 638 million input tokens and 60 million output tokens for classifying 1.1 million commits with GPT-4o."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "The paper states the code diff download took 'over 4 months and close to 0.5 TB' using GitHub's API (footnote 5). Token counts for LLM classification are reported in Appendix B."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "GitHub Copilot increased voluntary commits to Python packages by 37.05% compared to R packages.",
    296       "evidence": "Table 3, Column 1: coefficient 6.816 (SE 0.892, p<0.001) relative to pre-treatment mean of 18.395. Consistent across balanced TWFE (29.88%) and SDiD (26.60%).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "GitHub Copilot increased Rust package commits by 54.43% compared to Haskell packages.",
    301       "evidence": "Table 3, Column 4: coefficient 5.583 (SE 1.081, p<0.001) relative to pre-treatment mean of 10.258. Consistent across specifications.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Copilot's impact is substantially greater on iterative innovation (maintenance) than capability innovation (code development).",
    306       "evidence": "Table 4: non-function-adding commits increase 6.016 vs 0.800 for function-adding (Python/R). Table 5: maintenance 2.299 vs code development 1.472 (Python/R). Triple difference tests confirm statistical significance (Tables A3, A5).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "The gap between iterative and capability innovation widens in high-activity projects.",
    311       "evidence": "Table 6: in high-activity Python projects, maintenance effect is 3.768 vs code development 2.596 (gap 1.172), nearly 3x the gap in low-activity projects (0.422). For Rust/Haskell, virtually all effect is concentrated in high-activity projects.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "The June 2022 model upgrade further increased the gap between innovation types.",
    316       "evidence": "Table 7: post-upgrade additional effects show maintenance receiving larger boosts than code development across both language pairs. Python/R: maintenance +0.948 vs code development +0.288 (insignificant).",
    317       "supported": "moderate"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Treatment assignment assumption",
    323       "detail": "The paper assumes Copilot's language support choice is exogenous, but Python and Rust may have been chosen precisely because they had more active communities and commercial value, which could correlate with different innovation trajectories."
    324     },
    325     {
    326       "flag": "No data or code release",
    327       "detail": "Despite analyzing 1.1 million commits from public repositories, neither the dataset nor analysis code is released, making independent verification impossible."
    328     },
    329     {
    330       "flag": "LLM-based classification circularity risk",
    331       "detail": "Using an LLM (GPT-4o) to classify commits in a study about LLM impact on commits introduces a subtle methodological concern, though the function detection approach provides a code-based alternative that doesn't rely on LLMs."
    332     },
    333     {
    334       "flag": "Control group contamination timing",
    335       "detail": "While the paper ends analysis at ChatGPT's release (Dec 2022), other code-completion tools (e.g., TabNine, Kite) existed during the study period for control languages, potentially attenuating measured effects rather than inflating them."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Generative AI at work",
    341       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey R. Raymond"],
    342       "year": 2025,
    343       "relevance": "Major study on LLM productivity effects in customer service, foundational reference for guided-setting LLM impact."
    344     },
    345     {
    346       "title": "Experimental evidence on the productivity effects of generative artificial intelligence",
    347       "authors": ["Shakked Noy", "Whitney Zhang"],
    348       "year": 2023,
    349       "doi": "10.1126/science.adh2586",
    350       "relevance": "RCT on LLM productivity effects in writing tasks, key comparison point for individual vs collaborative productivity."
    351     },
    352     {
    353       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    354       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    355       "year": 2023,
    356       "arxiv_id": "2302.06590",
    357       "relevance": "Directly studies GitHub Copilot's productivity impact on individual developers in controlled settings."
    358     },
    359     {
    360       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    361       "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"],
    362       "year": 2024,
    363       "relevance": "Closely related study examining Copilot's impact on open-source aggregate productivity without distinguishing innovation types."
    364     },
    365     {
    366       "title": "Generative AI and the Nature of Work",
    367       "authors": ["Manuel Hoffmann", "Sam Boysel", "Frank Nagle", "Sida Peng", "Kevin Xu"],
    368       "year": 2024,
    369       "relevance": "Studies how generative AI affects the nature of developer work in open-source contexts."
    370     },
    371     {
    372       "title": "The crowdless future? Generative AI and creative problem-solving",
    373       "authors": ["Léonard Boussioux", "Jacqueline N. Lane", "Miaomiao Zhang", "Vladimir Jacimovic", "Karim R. Lakhani"],
    374       "year": 2024,
    375       "relevance": "Examines LLM impact on creative problem-solving, relevant to capability vs iterative innovation distinction."
    376     },
    377     {
    378       "title": "Generative artificial intelligence enhances individual creativity but reduces the collective diversity of novel content",
    379       "authors": ["Anil R. Doshi", "Oliver P. Hauser"],
    380       "year": 2024,
    381       "relevance": "Studies LLM impact on creativity showing homogenization effects, relevant to innovation quality concerns."
    382     },
    383     {
    384       "title": "The consequences of generative AI for online knowledge communities",
    385       "authors": ["Gordon Burtch", "Dokyun Lee", "Zhichen Chen"],
    386       "year": 2024,
    387       "relevance": "Documents declining knowledge exchange on UGC platforms after LLM introduction, contrasting with this paper's findings."
    388     },
    389     {
    390       "title": "GPTs are GPTs: Labor market impact potential of LLMs",
    391       "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
    392       "year": 2024,
    393       "relevance": "Foundational analysis of LLM labor market impact potential across occupations."
    394     },
    395     {
    396       "title": "Large language model in creative work: The role of collaboration modality and user expertise",
    397       "authors": ["Zhuoran Chen", "Joel Chan"],
    398       "year": 2024,
    399       "relevance": "Studies how collaboration modality affects LLM impact on creative work, relevant to open-ended vs structured tasks."
    400     }
    401   ]
    402 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs