scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24697B)
      1 {
      2   "paper": {
      3     "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
      4     "authors": [
      5       "Sida Peng",
      6       "Eirini Kalliamvakou",
      7       "Peter Cihon",
      8       "Mert Demirer"
      9     ],
     10     "year": 2023,
     11     "venue": "arXiv",
     12     "arxiv_id": "2302.06590"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor"
     17   ],
     18   "methodology_tags": [
     19     "rct"
     20   ],
     21   "key_findings": "A controlled experiment with 95 developers recruited via Upwork found that those with access to GitHub Copilot completed an HTTP server task in JavaScript 55.8% faster than the control group (95% CI: 21-89%, p=0.0017). Heterogeneous effects suggest less experienced programmers, older developers, and those who code more hours per day benefit most. Self-reported productivity estimates (~35%) underestimated the measured effect. The task success rate difference (7 percentage points higher for treated) was not statistically significant.",
     22   "claims": [
     23     {
     24       "claim": "The treated group completed the task 55.8% faster than the control group",
     25       "evidence": "Results section: average completion time 71.17 min (treated) vs 160.89 min (control), t-test p=0.0017, 95% CI [21%, 89%]. Robust to dropping 4 outliers.",
     26       "supported": "strong"
     27     },
     28     {
     29       "claim": "Less experienced developers benefit more from Copilot",
     30       "evidence": "Table 1: Programming experience coefficient 8.23 (SE 4.36, p=0.0629) in heterogeneous treatment effects regression using Horvitz-Thompson transformation.",
     31       "supported": "moderate"
     32     },
     33     {
     34       "claim": "Developers who code more hours per day benefit more from Copilot",
     35       "evidence": "Table 1: Hours of programming coefficient -11.70 (SE 4.74, p=0.0168).",
     36       "supported": "moderate"
     37     },
     38     {
     39       "claim": "Self-reported productivity estimates underestimate the actual effect",
     40       "evidence": "Results section: both groups estimated ~35% productivity gain vs measured 55.8%.",
     41       "supported": "moderate"
     42     },
     43     {
     44       "claim": "Treated group's willingness to pay is significantly higher than control",
     45       "evidence": "Results section: treated $27.25/month vs control $16.91/month, statistically significant at 95% level.",
     46       "supported": "moderate"
     47     }
     48   ],
     49   "red_flags": [
     50     {
     51       "flag": "Company evaluating own product",
     52       "detail": "Authors are from Microsoft Research and GitHub, evaluating GitHub Copilot (a Microsoft/GitHub product). This conflict is not acknowledged in the paper beyond affiliations listed in the header."
     53     },
     54     {
     55       "flag": "High attrition without analysis",
     56       "detail": "95 participants started but only ~35 completed. That is ~63% attrition with no per-group breakdown or analysis of whether completers differ systematically from dropouts."
     57     },
     58     {
     59       "flag": "Single narrow task generalized broadly",
     60       "detail": "Results are from one HTTP server task in JavaScript on Upwork freelancers, but the title claims 'Impact of AI on Developer Productivity' generally. Discussion extrapolates to all 4.6M US tech workers and GDP impact."
     61     },
     62     {
     63       "flag": "No blinding",
     64       "detail": "Treatment group knew they had Copilot; control knew they did not. Hawthorne and novelty effects are not discussed."
     65     },
     66     {
     67       "flag": "Non-representative sample",
     68       "detail": "Participants are Upwork freelancers, mostly from India/Pakistan with median income $10-19K. Results may not generalize to professional developers in industry settings."
     69     }
     70   ],
     71   "checklist": {
     72     "artifacts": {
     73       "code_released": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No repository URL, code archive, or data release mentioned anywhere in the paper."
     77       },
     78       "data_released": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No dataset released. Survey responses, completion times, and demographic data are not made available."
     82       },
     83       "environment_specified": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No environment specifications provided. The task used GitHub Classroom and a Node.js template but no dependency versions or setup details are given."
     87       },
     88       "reproduction_instructions": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No reproduction instructions provided. The task description (Figure 4) and job posting (Figure 1) are shown but there is no guide for replicating the experiment."
     92       }
     93     },
     94     "statistical_methodology": {
     95       "confidence_intervals_or_error_bars": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "95% confidence interval reported for the main result: '[21%, 89%]' for the productivity improvement. Also 95% CI for success rate difference '[-0.11, 0.25]'."
     99       },
    100       "significance_tests": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "t-test reported with p=0.0017 for the main completion time difference. Statistical significance at 95% level also noted for willingness-to-pay difference."
    104       },
    105       "effect_sizes_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "55.8% reduction in completion time reported with baseline context (71.17 min vs 160.89 min). This provides sufficient magnitude context for the reader."
    109       },
    110       "sample_size_justified": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No power analysis or justification for N=95 (or N=35 completers). No discussion of whether the sample is large enough for the heterogeneous effects analysis."
    114       },
    115       "variance_reported": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "Standard deviations of completion times are not reported numerically. Distributions are shown in Figure 6 but no numeric spread measures (std dev, IQR) are given in text or tables."
    119       }
    120     },
    121     "evaluation_design": {
    122       "baselines_included": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Control group without Copilot serves as the baseline. Control participants could use internet search and Stack Overflow."
    126       },
    127       "baselines_contemporary": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The control condition (standard development with internet access) represents the contemporary alternative to AI-assisted coding at the time of the study."
    131       },
    132       "ablation_study": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "Single intervention (Copilot vs no Copilot). There are no system components to ablate."
    136       },
    137       "multiple_metrics": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Two primary metrics: task completion time and task success rate (percentage passing all 12 tests)."
    141       },
    142       "human_evaluation": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "The evaluation uses an automated test suite with binary pass/fail. Human judgment of output quality is not applicable since the task has objective correctness criteria."
    146       },
    147       "held_out_test_set": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "This is an RCT measuring human performance, not a model evaluation on a dataset split."
    151       },
    152       "per_category_breakdown": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Table 1 provides heterogeneous treatment effects broken down by programming experience, hours per day, age group, employment status, income, education, and language preference."
    156       },
    157       "failure_cases_discussed": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No discussion of cases where Copilot did not help or hurt productivity. The 5 treated participants who did not finish Copilot setup are mentioned but not analyzed."
    161       },
    162       "negative_results_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The success rate difference (7pp higher for treated) is reported as not statistically significant, CI [-0.11, 0.25]. Several heterogeneous effects are reported with non-significant p-values."
    166       }
    167     },
    168     "claims_and_evidence": {
    169       "abstract_claims_supported": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Abstract claims '55.8% faster' and heterogeneous effects favoring less experienced/older developers are supported by results in Table 1 and the main analysis."
    173       },
    174       "causal_claims_justified": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Causal claims are justified by the RCT design with random assignment to treatment and control groups. This is an appropriate design for causal inference."
    178       },
    179       "generalization_bounded": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "Title claims 'Impact of AI on Developer Productivity' broadly. Discussion extrapolates to 4.6M US tech workers and GDP impact from a single JS task with Upwork freelancers. While limitations are noted, the framing significantly overreaches the evidence."
    183       },
    184       "alternative_explanations_discussed": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No discussion of alternative explanations such as Hawthorne effect, novelty effect, differential attrition, or selection effects from Upwork recruitment. The discussion section only mentions task generalizability and code quality as limitations."
    188       },
    189       "proxy_outcome_distinction": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper measures task completion time on a single HTTP server task and frames this as 'Impact of AI on Developer Productivity' (title). The discussion extrapolates to 4.6M US tech workers and GDP impact. The gap between completing one standardized task faster (the proxy) and actual developer productivity (the claimed outcome) is enormous — productivity encompasses code quality, maintenance burden, collaboration, debugging, and more. The limitations section notes task specificity but does not acknowledge the proxy-outcome gap in framing."
    193       }
    194     },
    195     "setup_transparency": {
    196       "model_versions_specified": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "Paper says 'GitHub Copilot' powered by 'Codex' without specifying the Codex model version, API version, or snapshot date."
    200       },
    201       "prompts_provided": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "Copilot is used as a black-box IDE integration. Users do not write prompts; the tool suggests code based on context."
    205       },
    206       "hyperparameters_reported": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "Copilot is evaluated as a black-box commercial tool. Authors have no control over its hyperparameters."
    210       },
    211       "scaffolding_described": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "Copilot is evaluated as a third-party tool. Authors cannot be expected to describe its internal scaffolding."
    215       },
    216       "data_preprocessing_documented": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The study describes recruitment via Upwork (166 offers, 95 accepted, random assignment), task administration via GitHub Classroom with automated timestamps and test suites, and exit survey collection."
    220       }
    221     },
    222     "limitations_and_scope": {
    223       "limitations_section_present": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "The Discussion section contains substantive limitations discussion covering task specificity, lack of code quality measurement, and generalization concerns."
    227       },
    228       "threats_to_validity_specific": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Discussion identifies specific threats: standardized task not representative of collaborative projects, productivity benefits may vary across tasks and languages, and code quality effects not examined."
    232       },
    233       "scope_boundaries_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Discussion explicitly states what was NOT tested: collaboration on large projects, effects across different tasks/languages, code quality effects. 'More research is needed to understand how our results generalizes to other tasks.'"
    237       }
    238     },
    239     "data_integrity": {
    240       "raw_data_available": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No raw data (completion times, survey responses, demographic data) is released for independent verification."
    244       },
    245       "data_collection_described": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Data collected via GitHub Classroom (timestamps, test results) and surveys (entry demographic + exit experience). Time period May 15 - June 20, 2022. Upwork platform with job posting shown in Figure 1."
    249       },
    250       "recruitment_methods_described": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "Recruited through Upwork freelancing platform. Job posting shown in Figure 1 with task description and budget. Contract shown in Figure 2. Compensation structure tied to completion time."
    254       },
    255       "data_pipeline_documented": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "166 offers sent, 95 accepted, ~35 completed, but no explanation of why 71 offers were not accepted, why ~60 participants dropped out, or whether any data cleaning was performed. Per-group completion numbers are ambiguous."
    259       }
    260     },
    261     "conflicts_of_interest": {
    262       "funding_disclosed": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "No funding disclosure, acknowledgments section, or grant information in the paper."
    266       },
    267       "affiliations_disclosed": {
    268         "applies": true,
    269         "answer": true,
    270         "justification": "Author affiliations clearly listed: Microsoft Research, GitHub Inc., and MIT Sloan. GitHub is the maker of Copilot."
    271       },
    272       "funder_independent_of_outcome": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "Microsoft Research and GitHub employees are evaluating GitHub Copilot, a Microsoft/GitHub product. The employer has a direct financial interest in positive results."
    276       },
    277       "financial_interests_declared": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No competing interests or financial disclosure statement in the paper. Two authors are GitHub employees evaluating a GitHub product, which is not explicitly acknowledged as a conflict."
    281       }
    282     },
    283     "contamination": {
    284       "training_cutoff_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "This is an RCT measuring human productivity, not evaluating model capability on a benchmark."
    288       },
    289       "train_test_overlap_discussed": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Not a benchmark evaluation. The study measures how fast humans complete a task with/without AI assistance."
    293       },
    294       "benchmark_contamination_addressed": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "Not a benchmark evaluation of model knowledge."
    298       }
    299     },
    300     "human_studies": {
    301       "pre_registered": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No pre-registration mentioned. No link to OSF, AsPredicted, or any registry."
    305       },
    306       "irb_or_ethics_approval": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "'Before we began recruitment, we received approval for the study from the Microsoft Research Ethics Review Board.'"
    310       },
    311       "demographics_reported": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Figure 5 presents age, location, income, education level, programming experience, hours coding daily, number of languages used, and employment status."
    315       },
    316       "inclusion_exclusion_criteria": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No explicit inclusion or exclusion criteria stated. The Upwork posting (Figure 1) shows skill requirements but no formal eligibility criteria for the study are described."
    320       },
    321       "randomization_described": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Paper states participants were 'randomly split into control and treatment groups' but provides no details on the randomization procedure (method, stratification, tool used)."
    325       },
    326       "blinding_described": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "No blinding described or discussed. Treatment group knew they had Copilot (watched tutorial, installed tool). Control group knew they did not have Copilot."
    330       },
    331       "attrition_reported": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "166 offers sent, 95 accepted, ~35 completed — but per-group completion numbers are ambiguous ('Thirty-five developers from both groups'). Five treated participants who didn't finish Copilot setup are mentioned but reasons for ~60 other dropouts are not explained. No intention-to-treat analysis."
    335       }
    336     },
    337     "cost_and_practicality": {
    338       "inference_cost_reported": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "This is an RCT evaluating a commercial tool's productivity impact, not proposing a computational method with inference costs."
    342       },
    343       "compute_budget_stated": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No computational method proposed. The study measures human task performance."
    347       }
    348     },
    349     "experimental_rigor": {
    350       "seed_sensitivity_reported": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "RCT with human participants — no random seeds in the computational sense."
    354       },
    355       "number_of_runs_stated": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Single experiment with human participants. Not a computational experiment with multiple runs."
    359       },
    360       "hyperparameter_search_budget": {
    361         "applies": false,
    362         "answer": false,
    363         "justification": "No hyperparameters to tune in an RCT design."
    364       },
    365       "best_config_selection_justified": {
    366         "applies": false,
    367         "answer": false,
    368         "justification": "No configuration selection in an RCT."
    369       },
    370       "multiple_comparison_correction": {
    371         "applies": true,
    372         "answer": false,
    373         "justification": "Table 1 tests heterogeneous effects across 8 covariates without any correction for multiple comparisons (no Bonferroni, Holm, or similar adjustment mentioned)."
    374       },
    375       "self_comparison_bias_addressed": {
    376         "applies": true,
    377         "answer": false,
    378         "justification": "Microsoft Research and GitHub employees evaluate GitHub Copilot without acknowledging the bias of evaluating their own company's product. No independent evaluation or mitigation strategy discussed."
    379       },
    380       "compute_budget_vs_performance": {
    381         "applies": false,
    382         "answer": false,
    383         "justification": "Not applicable to an RCT measuring human task performance."
    384       },
    385       "benchmark_construct_validity": {
    386         "applies": true,
    387         "answer": false,
    388         "justification": "The paper briefly notes the task is 'standardized' but does not substantively analyze whether completing a single HTTP server task in JavaScript measures 'developer productivity' as claimed in the title."
    389       },
    390       "scaffold_confound_addressed": {
    391         "applies": false,
    392         "answer": false,
    393         "justification": "Copilot is evaluated as a bundled commercial product (IDE integration + model). The paper does not claim to isolate the model from the tool — it evaluates the complete Copilot experience. The scaffold IS the thing being tested, so this question is NA."
    394       }
    395     }
    396   },
    397   "cited_papers": [
    398     {
    399       "title": "Evaluating large language models trained on code",
    400       "authors": [
    401         "Mark Chen",
    402         "Jerry Tworek",
    403         "Heewoo Jun"
    404       ],
    405       "year": 2021,
    406       "arxiv_id": "2107.03374",
    407       "relevance": "Introduces Codex, the model powering GitHub Copilot, and the HumanEval benchmark for code generation."
    408     },
    409     {
    410       "title": "Grounded copilot: How programmers interact with code-generating models",
    411       "authors": [
    412         "Shraddha Barke",
    413         "Michael B. James",
    414         "Nadia Polikarpova"
    415       ],
    416       "year": 2022,
    417       "arxiv_id": "2206.15000",
    418       "relevance": "Qualitative study of how programmers interact with Copilot-like code generation tools."
    419     },
    420     {
    421       "title": "Security implications of large language model code assistants: A user study",
    422       "authors": [
    423         "Gustavo Sandoval",
    424         "Hammond Pearce",
    425         "Teo Nys"
    426       ],
    427       "year": 2022,
    428       "arxiv_id": "2208.09727",
    429       "relevance": "Studies security implications of LLM code assistants through a user study."
    430     },
    431     {
    432       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    433       "authors": [
    434         "Hussein Mozannar",
    435         "Gagan Bansal",
    436         "Adam Fourney",
    437         "Eric Horvitz"
    438       ],
    439       "year": 2022,
    440       "arxiv_id": "2210.14306",
    441       "relevance": "Models user behavior and cognitive costs when using AI programming assistants."
    442     },
    443     {
    444       "title": "Productivity assessment of neural code completion",
    445       "authors": [
    446         "Albert Ziegler",
    447         "Eirini Kalliamvakou",
    448         "X. Alice Li"
    449       ],
    450       "year": 2022,
    451       "relevance": "Earlier productivity assessment of neural code completion from GitHub, directly related to Copilot productivity measurement."
    452     },
    453     {
    454       "title": "An empirical evaluation of GitHub Copilot's code suggestions",
    455       "authors": [
    456         "Nhan Nguyen",
    457         "Sarah Nadi"
    458       ],
    459       "year": 2022,
    460       "relevance": "Empirical evaluation of Copilot's code suggestion quality."
    461     },
    462     {
    463       "title": "The robots are coming: Exploring the implications of OpenAI Codex on introductory programming",
    464       "authors": [
    465         "James Finnie-Ansley",
    466         "Paul Denny",
    467         "Brett A. Becker"
    468       ],
    469       "year": 2022,
    470       "relevance": "Studies implications of Codex for programming education."
    471     },
    472     {
    473       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    474       "authors": [
    475         "Priyan Vaithilingam",
    476         "Tianyi Zhang",
    477         "Elena L. Glassman"
    478       ],
    479       "year": 2022,
    480       "relevance": "Usability study of LLM-powered code generation tools."
    481     },
    482     {
    483       "title": "A research agenda for assessing the economic impacts of code generation models",
    484       "authors": [
    485         "Sarah Manning",
    486         "Pamela Mishkin",
    487         "Gillian Hadfield"
    488       ],
    489       "year": 2022,
    490       "relevance": "Research agenda for studying economic impacts of AI code generation, directly framing this paper's contribution."
    491     }
    492   ],
    493   "engagement_factors": {
    494     "practical_relevance": {
    495       "score": 2,
    496       "justification": "Directly quantifies productivity gains from a widely-used tool (Copilot), giving practitioners evidence to justify adoption."
    497     },
    498     "surprise_contrarian": {
    499       "score": 1,
    500       "justification": "The 55.8% speed gain is larger than most expected but the direction (Copilot helps) confirms conventional wisdom."
    501     },
    502     "fear_safety": {
    503       "score": 0,
    504       "justification": "No safety, security, or risk angle is explored in the paper."
    505     },
    506     "drama_conflict": {
    507       "score": 2,
    508       "justification": "Microsoft/GitHub employees evaluating their own product with a non-representative sample and 63% attrition invites skepticism and 'corporate science' critique."
    509     },
    510     "demo_ability": {
    511       "score": 1,
    512       "justification": "Anyone can try Copilot but the experiment itself is not reproducible without the specific setup and recruitment."
    513     },
    514     "brand_recognition": {
    515       "score": 3,
    516       "justification": "GitHub Copilot is one of the most widely-used AI developer tools, and the study is from Microsoft Research."
    517     }
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs