scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22858B)
      1 {
      2   "paper": {
      3     "title": "Developer Productivity With and Without GitHub Copilot: A Longitudinal Mixed-Methods Case Study",
      4     "authors": ["Viktoria Stray", "Elias Goldmann Brandtzæg", "Viggo Tellefsen Wivestad", "Astri Barbala", "Nils Brede Moe"],
      5     "year": 2025,
      6     "venue": "Unknown (appears to be a conference or journal submission)",
      7     "doi": null
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "Scripts for mining GitHub data, cleaning, analyzing, statistical tests, and generating plots are available at https://figshare.com/s/736af5662435675e7914 (Appendix A)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The commit dataset and survey/interview data are not released. Only analysis scripts are shared via Figshare. The underlying data (26,317 commits from 39 developers) is not publicly available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specification (requirements.txt, library versions, etc.) is mentioned in the paper. Only Python and PyDriller are named without versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The Figshare link contains scripts but the paper does not describe how to run them to reproduce the results."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "95% confidence intervals are shown in Figures 2 and 3 (shaded areas and error bars). Explicitly stated: 'The shaded areas around each line represent the 95% confidence intervals.'"
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Mann-Whitney U tests are used to compare Copilot users vs. non-users (p < 0.00555). Spearman correlation with p-values reported (ρ ≈ 0.17, p = 0.406). Bonferroni correction applied for six tests (α = 0.0083)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Spearman ρ values reported (ρ ≈ 0.17 for commit count, ρ ≈ 0.09 for net lines). Raw differences in weekly activity provided (e.g., 188 vs 80 lines added for Copilot users vs non-users before adoption, 200 vs 70 after). These provide sufficient context for magnitude."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for sample sizes (N=39 developers, N=25 Copilot users, N=14 non-users, N=13 interviewees). No power analysis. The small N is not discussed as a limitation of statistical power."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "95% confidence intervals are reported in Figures 2 and 3, providing spread measures for the weekly activity data across the observation period."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Non-users (n=14) serve as a comparison group. Pre-adoption period serves as a temporal baseline for Copilot users. Both before/after and user/non-user comparisons are made."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The comparison is between contemporaneous groups (Copilot users vs. non-users) within the same organization during the same time period."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is an observational case study, not a system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics used: commit frequency, lines added, lines removed, net lines changed, plus perceived productivity (Likert scale), plus qualitative interview data. Six activity metrics examined for correlation."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The study includes survey responses on perceived productivity and 13 semi-structured interviews with developers, providing human evaluation of the tool's impact."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — this is an observational/case study, not a machine learning evaluation requiring train/test splits."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by Copilot users vs. non-users, before vs. after adoption, and across multiple metrics (insertions, deletions, net diff, commits). Roles are shown in Figure 1."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses developers who were skeptical or declined Copilot, developers who tried it and stopped, and the discrepancy between perceived and measured productivity. Developer concerns about hidden mistakes and subtle bugs are noted."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The main finding is essentially a negative result: no statistically significant change in commit-based activity after Copilot adoption, and no significant correlation between perceived and measured productivity."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims are well-supported: Copilot users were more active pre-adoption (Figures 2-3, Mann-Whitney U), no statistically significant change post-adoption, discrepancy between metrics and subjective experience (Figure 4, Spearman correlation)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper is careful to avoid strong causal claims. It uses language like 'suggests' and explicitly notes the self-selection bias. Section 5 acknowledges the longitudinal view shows the activity gap pre-existed Copilot, cautioning against causal interpretation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper frames results within NAV IT specifically and notes differences from controlled experiments (Peng et al., Cui et al.). It acknowledges this is a single organization case study in a Norwegian public sector context."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Self-selection bias is discussed in detail. Alternative explanations for perceived productivity (subject-expectancy effect, reduced cognitive load vs. actual output) are explored. Role differences, seasonal patterns, and early adoption phase are all considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper refers to 'GitHub Copilot' without specifying the version. No mention of which Copilot model version or underlying LLM was used during the study period."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper evaluates GitHub Copilot as a third-party black-box tool used by developers in their workflow. No custom prompts were used as part of the research methodology."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The study evaluates Copilot as a black-box tool in a naturalistic setting. There are no hyperparameters to report — developers used Copilot with default settings in their normal workflow."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper evaluates a third-party tool (GitHub Copilot) as a black box. Authors cannot be expected to describe internal scaffolding they have no access to."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3 describes the full data pipeline: 37,974 original commits → duplicate removal → outlier trimming at 95th percentile → exclusion of low-activity users (< 1 commit/week) → 26,317 final commits. Weekly aggregation and zero-imputation documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section. Some limitations are discussed inline (e.g., self-selection bias in Section 5, metrics limitations), but there is no substantive dedicated section."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The paper identifies specific threats: self-selection bias (Copilot adopters were already more active), subject-expectancy effect for perceived productivity, commit-based metrics failing to capture real productivity, and seasonal fluctuations. These are specific to this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. While it implicitly limits scope to NAV IT, it does not make explicit statements about what populations, settings, or claims are excluded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw commit data and survey/interview transcripts are not publicly available. Only analysis scripts are shared. The underlying data cannot be independently verified."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes data collection in detail: GitHub API for repo list, PyDriller for commit mining, survey distributed March-April 2024, 13 interviews in Nov-Dec 2023 (30-60 min each, avg 47 min), AI transcription with manual review."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Recruitment is described: 100 developers at NAV IT volunteered for early Copilot access in September 2023. Survey respondents who shared GitHub usernames (63 total) were linked. 39 remained after filtering. Self-selection into Copilot group acknowledged."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Full pipeline documented in Section 3: 37,974 commits → duplicate removal → outlier trimming at 95th percentile → low-activity user exclusion → 26,317 commits from 39 developers across 703 repos. Weekly aggregation into N=4095 observations."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding disclosed in Acknowledgments: 'This work was supported by the Research Council of Norway through the projects TransformIT (grant 321477) and Kairos (grant 357147).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations clearly listed: University of Oslo and SINTEF. These are academic/research institutions, not GitHub/Microsoft."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Research Council of Norway is a public funding agency with no financial stake in whether GitHub Copilot improves productivity."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper. Absence of disclosure is not the same as absence of conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This study does not evaluate a pre-trained model on a benchmark. It observes developer behavior with a tool in a naturalistic setting."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the study does not evaluate model capability on a benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — no benchmark evaluation is performed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No mention of pre-registration (OSF, AsPredicted, or similar). The study involves human participants (surveys and interviews), but no pre-registration is reported."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No mention of IRB or ethics board approval despite collecting survey data and conducting interviews with identifiable developers."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Participant roles are reported in Figure 1 (front-end developer, back-end developer, tech lead, platform engineer, data scientist/analyst, data engineer). The organizational context (NAV IT, Norwegian public sector) is described."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Inclusion criteria stated: developers who provided GitHub usernames in survey (63 initially). Exclusion: users with fewer than 1 commit/week average excluded. Final sample: 39 developers (25 Copilot users, 14 non-users)."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is an observational study, not an experiment. Copilot adoption was voluntary (self-selected), not randomized. Randomization is not applicable."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not applicable — this is an observational study. Blinding is not feasible when studying voluntary tool adoption."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Attrition documented: 63 survey respondents provided GitHub usernames, reduced to 39 after filtering low-activity users. Original 37,974 commits reduced to 26,317 after cleaning. The pipeline from initial to final sample is clear."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is an observational case study of tool adoption, not proposing a method with computational costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is an observational case study. No significant computation was involved beyond data mining scripts."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Copilot users were consistently more active than non-users even before Copilot's introduction, indicating self-selection bias.",
    286       "evidence": "Mann-Whitney U tests (p < 0.00555) confirm significantly higher activity for Copilot users both before and after adoption. Figures 2 and 3 show the pre-existing gap. Section 4.1.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "No statistically significant changes in commit-based activity were found for Copilot users after adoption.",
    291       "evidence": "Section 4.1: Minor increases observed (e.g., +16 net lines/week) but not statistically significant. Figures 2-3 show no visible spike at adoption.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "There is no significant correlation between perceived productivity and commit-based activity metrics among Copilot users.",
    296       "evidence": "Section 4.2: Spearman ρ ≈ 0.17 (p = 0.406) for commit count. None of six activity metrics showed significant correlation with perceived productivity after Bonferroni correction (α = 0.0083).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Copilot did not negatively impact code quality metrics.",
    301       "evidence": "Section 4.1: 'structural metrics (e.g., function complexity, average module size) remained virtually unchanged and showed no significant differences between users and non-users.' No detailed analysis or statistics provided for this claim.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Copilot's value may lie in reducing cognitive load and improving developer experience rather than increasing raw output.",
    306       "evidence": "Interview quotes from I12, I13 about smoother flow and faster starting points. Survey data showing no decreases in perceived productivity. Section 5 discussion.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["observational", "case-study", "qualitative"],
    311   "key_findings": "In a longitudinal study of 39 developers at NAV IT over two years, Copilot users showed no statistically significant change in commit-based activity after adoption, despite perceiving productivity improvements. Copilot adopters were already more active developers before the tool's introduction, indicating self-selection bias. No significant correlation was found between perceived productivity and any of six commit-based activity metrics, suggesting that traditional output metrics fail to capture the subjective benefits developers report from AI coding assistants.",
    312   "red_flags": [
    313     {
    314       "flag": "Small sample size",
    315       "detail": "Only 39 developers (25 Copilot users, 14 non-users) from a single organization. No power analysis. The small N limits statistical power for detecting effects and makes subgroup analysis unreliable."
    316     },
    317     {
    318       "flag": "Self-selection bias acknowledged but not controlled",
    319       "detail": "Copilot adoption was voluntary, creating inherent self-selection. The paper acknowledges this but cannot control for it since there was no randomization. Propensity score matching or similar techniques were not attempted."
    320     },
    321     {
    322       "flag": "Code quality claim weakly supported",
    323       "detail": "The claim that Copilot did not negatively impact code quality is stated without detailed analysis or statistics. Only a brief mention of structural metrics being 'virtually unchanged' with no supporting data shown."
    324     },
    325     {
    326       "flag": "No ethics approval mentioned",
    327       "detail": "The study collects survey data and conducts interviews with identifiable developers (linked to GitHub usernames) but does not mention IRB or ethics board approval."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    333       "authors": ["S. Peng", "E. Kalliamvakou", "P. Cihon", "M. Demirer"],
    334       "year": 2023,
    335       "arxiv_id": "2302.06590",
    336       "relevance": "Foundational RCT on Copilot productivity (55.8% faster task completion), directly compared in this study."
    337     },
    338     {
    339       "title": "The effects of generative AI on high skilled work: Evidence from three field experiments with software developers",
    340       "authors": ["Z. K. Cui", "M. Demirer", "S. Jaffe", "L. Musolff", "S. Peng", "T. Salz"],
    341       "year": 2024,
    342       "relevance": "Three field experiments (Microsoft, Accenture, and an anonymous Fortune 100 company) showing a 26% increase in completed tasks with Copilot, contrasted with this study's null finding."
    343     },
    344     {
    345       "title": "Large language models, small labor market effects",
    346       "authors": ["A. Humlum", "E. Vestergaard"],
    347       "year": 2025,
    348       "relevance": "Large-scale Danish study finding only 3% time savings from GenAI tools, aligning with this study's modest findings."
    349     },
    350     {
    351       "title": "Generative AI at work",
    352       "authors": ["E. Brynjolfsson", "D. Li", "L. Raymond"],
    353       "year": 2025,
    354       "relevance": "Reports relatively small productivity gains from GenAI in customer support, supporting pattern of modest real-world effects."
    355     },
    356     {
    357       "title": "Ironies of generative AI: Understanding and mitigating productivity loss in human-AI interaction",
    358       "authors": ["A. Simkute", "L. Tankelevitch", "V. Kewenig", "A. E. Scott", "A. Sellen", "S. Rintel"],
    359       "year": 2025,
    360       "relevance": "Studies role shift from creating to supervising AI outputs, relevant to understanding perceived vs measured productivity."
    361     },
    362     {
    363       "title": "Generative AI and developer workflows: How GitHub Copilot and ChatGPT influence solo and pair programming",
    364       "authors": ["V. Stray", "N. B. Moe", "N. Ganeshan", "S. Kobbenes"],
    365       "year": 2025,
    366       "relevance": "Companion study from same research group examining Copilot's influence on developer workflows and collaboration."
    367     },
    368     {
    369       "title": "The SPACE of developer productivity: There's more to it than you think",
    370       "authors": ["N. Forsgren", "M.-A. Storey", "C. Maddila", "T. Zimmermann", "B. Houck", "J. Butler"],
    371       "year": 2021,
    372       "relevance": "Framework for multidimensional developer productivity measurement (satisfaction, performance, activity, communication, efficiency) referenced in this study's discussion."
    373     },
    374     {
    375       "title": "Exploring GenAI in software development: Insights from a case study in a large Brazilian company",
    376       "authors": ["G. V. Pereira", "V. Jackson", "R. Prikladnicki"],
    377       "year": 2025,
    378       "relevance": "Mixed-methods study of GenAI adoption in corporate setting finding perceived task completion improvements."
    379     },
    380     {
    381       "title": "Is GitHub Copilot a substitute for human pair-programming? An empirical study",
    382       "authors": ["S. Imai"],
    383       "year": 2022,
    384       "relevance": "Early empirical study examining Copilot's role relative to pair programming practices."
    385     },
    386     {
    387       "title": "Practices and challenges of using GitHub Copilot: An empirical study",
    388       "authors": ["B. Zhang", "P. Liang", "X. Zhou", "A. Ahmad", "M. Waseem"],
    389       "year": 2023,
    390       "relevance": "Empirical study of Copilot usage practices and challenges, relevant to understanding adoption patterns."
    391     }
    392   ]
    393 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs