scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31951B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
      6     "authors": [
      7       "Peng, S.",
      8       "Kalliamvakou, E.",
      9       "Cihon, P.",
     10       "Demirer, M."
     11     ],
     12     "year": 2023,
     13     "venue": "arXiv",
     14     "arxiv_id": "2302.06590",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims '55.8% faster' and heterogeneous effects favoring less experienced/older developers are supported by results in Table 1 and the main analysis.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims are justified by the RCT design with random assignment to treatment and control groups. This is an appropriate design for causal inference.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Title claims 'Impact of AI on Developer Productivity' broadly. Discussion extrapolates to 4.6M US tech workers and GDP impact from a single JS task with Upwork freelancers. While limitations are noted, the framing significantly overreaches the evidence.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No discussion of alternative explanations such as Hawthorne effect, novelty effect, differential attrition, or selection effects from Upwork recruitment. The discussion section only mentions task generalizability and code quality as limitations.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures task completion time on a single HTTP server task and frames this as 'Impact of AI on Developer Productivity' (title). The discussion extrapolates to 4.6M US tech workers and GDP impact. The gap between completing one standardized task faster (the proxy) and actual developer productivity (the claimed outcome) is enormous — productivity encompasses code quality, maintenance burden, collaboration, debugging, and more. The limitations section notes task specificity but does not acknowledge the proxy-outcome gap in framing.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The Discussion section contains substantive limitations discussion covering task specificity, lack of code quality measurement, and generalization concerns.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Discussion identifies specific threats: standardized task not representative of collaborative projects, productivity benefits may vary across tasks and languages, and code quality effects not examined.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Discussion explicitly states what was NOT tested: collaboration on large projects, effects across different tasks/languages, code quality effects. 'More research is needed to understand how our results generalizes to other tasks.'",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure, acknowledgments section, or grant information in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations clearly listed: Microsoft Research, GitHub Inc., and MIT Sloan. GitHub is the maker of Copilot.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Microsoft Research and GitHub employees are evaluating GitHub Copilot, a Microsoft/GitHub product. The employer has a direct financial interest in positive results.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial disclosure statement in the paper. Two authors are GitHub employees evaluating a GitHub product, which is not explicitly acknowledged as a conflict.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "'Productivity' is used throughout but never formally defined. Paper measures task completion time and claims this reflects 'productivity,' but other definitions (code quality, maintenance burden, learning rate) are possible and not discussed upfront.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Stated explicitly: 'This paper studies the productivity effects of AI tools on software development. We present a controlled trial of GitHub Copilot...' Positioned as 'the first controlled experiment to measure the productivity of AI tools in professional software development.'",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "References relevant prior work on AI capabilities (Zhang et al. 2022), developer perceptions (Nguyen & Nadi 2022, Barke et al. 2022), and economic impacts (Raj & Seamans 2018, Agrawal et al. 2019). Shows how this controlled trial fills gap in productivity research on AI-powered tools.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL, code archive, or data release mentioned anywhere in the paper.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "No dataset released. Survey responses, completion times, and demographic data are not made available.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No environment specifications provided. The task used GitHub Classroom and a Node.js template but no dependency versions or setup details are given.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No reproduction instructions provided. The task description (Figure 4) and job posting (Figure 1) are shown but there is no guide for replicating the experiment.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "95% confidence interval reported for the main result: '[21%, 89%]' for the productivity improvement. Also 95% CI for success rate difference '[-0.11, 0.25]'.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "t-test reported with p=0.0017 for the main completion time difference. Statistical significance at 95% level also noted for willingness-to-pay difference.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "55.8% reduction in completion time reported with baseline context (71.17 min vs 160.89 min). This provides sufficient magnitude context for the reader.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or justification for N=95 (or N=35 completers). No discussion of whether the sample is large enough for the heterogeneous effects analysis.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Standard deviations of completion times are not reported numerically. Distributions are shown in Figure 6 but no numeric spread measures (std dev, IQR) are given in text or tables.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Control group without Copilot serves as the baseline. Control participants could use internet search and Stack Overflow.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The control condition (standard development with internet access) represents the contemporary alternative to AI-assisted coding at the time of the study.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Single intervention (Copilot vs no Copilot). There are no system components to ablate.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two primary metrics: task completion time and task success rate (percentage passing all 12 tests).",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "The evaluation uses an automated test suite with binary pass/fail. Human judgment of output quality is not applicable since the task has objective correctness criteria.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is an RCT measuring human performance, not a model evaluation on a dataset split.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 1 provides heterogeneous treatment effects broken down by programming experience, hours per day, age group, employment status, income, education, and language preference.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "No discussion of cases where Copilot did not help or hurt productivity. The 5 treated participants who did not finish Copilot setup are mentioned but not analyzed.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The success rate difference (7pp higher for treated) is reported as not statistically significant, CI [-0.11, 0.25]. Several heterogeneous effects are reported with non-significant p-values.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Paper says 'GitHub Copilot' powered by 'Codex' without specifying the Codex model version, API version, or snapshot date.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "Copilot is used as a black-box IDE integration. Users do not write prompts; the tool suggests code based on context.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "Copilot is evaluated as a black-box commercial tool. Authors have no control over its hyperparameters.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "Copilot is evaluated as a third-party tool. Authors cannot be expected to describe its internal scaffolding.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The study describes recruitment via Upwork (166 offers, 95 accepted, random assignment), task administration via GitHub Classroom with automated timestamps and test suites, and exit survey collection.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "No raw data (completion times, survey responses, demographic data) is released for independent verification.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Data collected via GitHub Classroom (timestamps, test results) and surveys (entry demographic + exit experience). Time period May 15 - June 20, 2022. Upwork platform with job posting shown in Figure 1.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Recruited through Upwork freelancing platform. Job posting shown in Figure 1 with task description and budget. Contract shown in Figure 2. Compensation structure tied to completion time.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "166 offers sent, 95 accepted, ~35 completed, but no explanation of why 71 offers were not accepted, why ~60 participants dropped out, or whether any data cleaning was performed. Per-group completion numbers are ambiguous.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This is an RCT measuring human productivity, not evaluating model capability on a benchmark.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not a benchmark evaluation. The study measures how fast humans complete a task with/without AI assistance.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Not a benchmark evaluation of model knowledge.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration mentioned. No link to OSF, AsPredicted, or any registry.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "'Before we began recruitment, we received approval for the study from the Microsoft Research Ethics Review Board.'",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Figure 5 presents age, location, income, education level, programming experience, hours coding daily, number of languages used, and employment status.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": false,
    333           "justification": "No explicit inclusion or exclusion criteria stated. The Upwork posting (Figure 1) shows skill requirements but no formal eligibility criteria for the study are described.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": false,
    339           "justification": "Paper states participants were 'randomly split into control and treatment groups' but provides no details on the randomization procedure (method, stratification, tool used).",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": true,
    344           "answer": false,
    345           "justification": "No blinding described or discussed. Treatment group knew they had Copilot (watched tutorial, installed tool). Control group knew they did not have Copilot.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": false,
    351           "justification": "166 offers sent, 95 accepted, ~35 completed — but per-group completion numbers are ambiguous ('Thirty-five developers from both groups'). Five treated participants who didn't finish Copilot setup are mentioned but reasons for ~60 other dropouts are not explained. No intention-to-treat analysis.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "This is an RCT evaluating a commercial tool's productivity impact, not proposing a computational method with inference costs.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "No computational method proposed. The study measures human task performance.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": false,
    372           "answer": false,
    373           "justification": "RCT with human participants — no random seeds in the computational sense.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": false,
    378           "answer": false,
    379           "justification": "Single experiment with human participants. Not a computational experiment with multiple runs.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": false,
    384           "answer": false,
    385           "justification": "No hyperparameters to tune in an RCT design.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": false,
    390           "answer": false,
    391           "justification": "No configuration selection in an RCT.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "Table 1 tests heterogeneous effects across 8 covariates without any correction for multiple comparisons (no Bonferroni, Holm, or similar adjustment mentioned).",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "Microsoft Research and GitHub employees evaluate GitHub Copilot without acknowledging the bias of evaluating their own company's product. No independent evaluation or mitigation strategy discussed.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": false,
    408           "answer": false,
    409           "justification": "Not applicable to an RCT measuring human task performance.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper briefly notes the task is 'standardized' but does not substantively analyze whether completing a single HTTP server task in JavaScript measures 'developer productivity' as claimed in the title.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "Copilot is evaluated as a bundled commercial product (IDE integration + model). The paper does not claim to isolate the model from the tool — it evaluates the complete Copilot experience. The scaffold IS the thing being tested, so this question is NA.",
    422           "source": "opus"
    423         }
    424       }
    425     }
    426   },
    427   "claims": [
    428     {
    429       "claim": "GitHub Copilot increases developer productivity by 55.8% on task completion speed",
    430       "evidence": "Randomized controlled trial (n=35 per group) completing HTTP server implementation. Treated group: 71.17 min average. Control group: 160.89 min average. Difference: 55.8% (95% CI [21%, 89%], p=0.0017).",
    431       "supported": "strong"
    432     },
    433     {
    434       "claim": "Less experienced developers benefit more from Copilot than more experienced developers",
    435       "evidence": "Heterogeneous treatment effects regression (Table 1): programming experience coefficient = 8.23 (p=0.0629). Inverse relationship: lower experience → larger treatment effect. Marginally significant.",
    436       "supported": "moderate"
    437     },
    438     {
    439       "claim": "Developers coding more hours per day benefit more from Copilot",
    440       "evidence": "Table 1: hours of programming per day coefficient = -11.70 (p=0.0168). Higher daily coding load → larger treatment effect. Statistically significant.",
    441       "supported": "moderate"
    442     },
    443     {
    444       "claim": "Developers aged 25-44 benefit more from Copilot than other age groups",
    445       "evidence": "Table 1: age 25-44 coefficient = -74.55 (p=0.0303). Developers in this age range show larger treatment effect.",
    446       "supported": "moderate"
    447     },
    448     {
    449       "claim": "Copilot improves task success rate",
    450       "evidence": "Treated group success rate is 7 percentage points higher than control, but 95% CI [-0.11, 0.25] includes zero. Not statistically significant.",
    451       "supported": "weak"
    452     },
    453     {
    454       "claim": "Developers value Copilot based on willingness-to-pay",
    455       "evidence": "Exit survey: treated group average willingness to pay $27.25/month vs control group $16.91/month. Difference statistically significant at 95% level. Suggests treated group perceived greater value.",
    456       "supported": "moderate"
    457     },
    458     {
    459       "claim": "This is the first controlled experiment measuring AI productivity effects in professional software development",
    460       "evidence": "Paper claims: 'To the best of our knowledge, it is the first controlled experiment to measure the productivity of AI tools in professional software development.' Self-reported novelty claim.",
    461       "supported": "moderate"
    462     },
    463     {
    464       "claim": "Task completion time speedup does not imply code quality improvement",
    465       "evidence": "Paper explicitly states: 'this study does not examine the effects of AI on code quality. AI assistance can increase code quality if it suggests code better than the programmer writes, or it can reduce quality if the programmer pays less attention to code.'",
    466       "supported": "strong"
    467     }
    468   ],
    469   "methodology_tags": [
    470     "rct"
    471   ],
    472   "key_findings": "GitHub Copilot accelerates task completion time by 55.8% (95% CI [21-89%], p=0.0017) in a randomized controlled trial of 35 developers per group implementing an HTTP server in JavaScript. Benefits are heterogeneous: less experienced programmers, those coding more hours daily, and developers aged 25-44 show larger productivity gains. However, the study is limited to a single standardized task, does not examine code quality, and uses an unrepresentative sample (Upwork freelancers, median income $10-19k annually), limiting generalization to real-world professional development.",
    473   "red_flags": [
    474     {
    475       "flag": "Undisclosed conflict of interest",
    476       "detail": "Two of the four authors are employed by GitHub Inc., the company developing Copilot. No explicit conflict-of-interest statement; no discussion of how GitHub's business interest in favorable results was managed. Company employees evaluating their own product."
    477     },
    478     {
    479       "flag": "Narrow task generalization",
    480       "detail": "Single standardized task (HTTP server in JavaScript) may not generalize to real development workflows, language-specific challenges, or collaborative projects. Authors acknowledge this but still frame as broad productivity study."
    481     },
    482     {
    483       "flag": "Unrepresentative sample",
    484       "detail": "95 Upwork freelancers from India/Pakistan with median annual income $10-19k and 6 years average experience. Not representative of typical professional software developers in high-income countries or enterprise settings."
    485     },
    486     {
    487       "flag": "High attrition without analysis",
    488       "detail": "95 recruited, only 35 completed task AND survey (63% dropout). No per-group attrition rates reported. No analysis of why 60 participants started but didn't finish. Attrition patterns could bias results."
    489     },
    490     {
    491       "flag": "Small sample size without justification",
    492       "detail": "n=35 completers (the paper's wording, 'Thirty-five developers from both groups', is ambiguous between 35 in total and 35 per group) with no power analysis or sample size justification. How was the initial sample size of 95 determined? Is 35 adequate for detecting heterogeneous effects across 8 subgroups?"
    493     },
    494     {
    495       "flag": "Code quality not examined",
    496       "detail": "Paper measures only task speed, not code quality, security, maintainability, or correctness beyond passing 12 basic tests. Copilot's speed benefit could come at cost of quality (acknowledged but not investigated)."
    497     },
    498     {
    499       "flag": "Productivity as task speed proxy",
    500       "detail": "Defines 'productivity' as time to complete a single greenfield task. Real developer productivity includes maintenance, debugging, learning, collaboration, and sustained output—not measured."
    501     },
    502     {
    503       "flag": "No pre-registration",
    504       "detail": "Study conducted May-June 2022 with no evidence of pre-registration. Raises concerns about selective hypothesis testing, p-hacking, or unreported outcome measures."
    505     },
    506     {
    507       "flag": "Setup implementation gaps",
    508       "detail": "5 of 45 treated participants (11%) failed to fully configure Copilot before starting task. Not separately analyzed. Were results similar for those without full Copilot access? Unknown."
    509     },
    510     {
    511       "flag": "Outlier concentration",
    512       "detail": "Four completion-time outliers (>300 min) all in control group. While authors claim results robust without outliers, the concentration suggests potential unmeasured confounds (participant skill, task difficulty perception, etc.)."
    513     }
    514   ],
    515   "cited_papers": [
    516     {
    517       "title": "Evaluating large language models trained on code",
    518       "authors": "Chen et al.",
    519       "year": 2021,
    520       "relevance": "Introduces Codex, the foundational model powering GitHub Copilot; core technical baseline"
    521     },
    522     {
    523       "title": "Reading between the lines: Modeling user behavior and costs in ai-assisted programming",
    524       "authors": "Mozannar et al.",
    525       "year": 2022,
    526       "relevance": "Models user behavior and interaction patterns with AI programming assistants; relevant to productivity measurement"
    527     },
    528     {
    529       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    530       "authors": "Vaithilingam et al.",
    531       "year": 2022,
    532       "relevance": "Empirical evaluation of code generation tool usability; directly comparable productivity work"
    533     },
    534     {
    535       "title": "The robots are coming: Exploring the implications of openai codex on introductory programming",
    536       "authors": "Finnie-Ansley et al.",
    537       "year": 2022,
    538       "relevance": "Studies Codex's effects on novice programming; relevant to heterogeneous effects on less experienced developers"
    539     },
    540     {
    541       "title": "Grounded copilot: How programmers interact with code-generating models",
    542       "authors": "Barke et al.",
    543       "year": 2022,
    544       "relevance": "Qualitative study of developer interaction patterns with code suggestion systems; contextualizes productivity mechanisms"
    545     },
    546     {
    547       "title": "An empirical evaluation of github copilot's code suggestions",
    548       "authors": "Nguyen & Nadi",
    549       "year": 2022,
    550       "relevance": "Empirical quality assessment of Copilot suggestions; complements productivity findings with quality analysis"
    551     },
    552     {
    553       "title": "Security implications of large language model code assistants: A user study",
    554       "authors": "Sandoval et al.",
    555       "year": 2022,
    556       "relevance": "User study of code assistant adoption and security concerns; relevant to real-world deployment risks"
    557     },
    558     {
    559       "title": "Productivity assessment of neural code completion",
    560       "authors": "Ziegler et al.",
    561       "year": 2022,
    562       "relevance": "Prior work on productivity measurement for code completion tools; methodological precedent"
    563     },
    564     {
    565       "title": "Artificial intelligence: the ambiguous labor market impact of automating prediction",
    566       "authors": "Agrawal et al.",
    567       "year": 2019,
    568       "relevance": "Theoretical framework for AI's labor market effects; contextualizes potential broader economic impacts"
    569     },
    570     {
    571       "title": "Artificial intelligence, labor, productivity, and the need for firm-level data",
    572       "authors": "Raj & Seamans",
    573       "year": 2018,
    574       "relevance": "Calls for empirical data on AI's productivity effects; motivates this controlled experiment"
    575     }
    576   ],
    577   "engagement_factors": {
    578     "practical_relevance": {
    579       "score": 2,
    580       "justification": "Directly quantifies productivity gains from a widely-used tool (Copilot), giving practitioners evidence to justify adoption."
    581     },
    582     "surprise_contrarian": {
    583       "score": 1,
    584       "justification": "The 55.8% speed gain is larger than most expected but the direction (Copilot helps) confirms conventional wisdom."
    585     },
    586     "fear_safety": {
    587       "score": 0,
    588       "justification": "No safety, security, or risk angle is explored in the paper."
    589     },
    590     "drama_conflict": {
    591       "score": 2,
    592       "justification": "Microsoft/GitHub employees evaluating their own product with a non-representative sample and 63% attrition invites skepticism and 'corporate science' critique."
    593     },
    594     "demo_ability": {
    595       "score": 1,
    596       "justification": "Anyone can try Copilot but the experiment itself is not reproducible without the specific setup and recruitment."
    597     },
    598     "brand_recognition": {
    599       "score": 3,
    600       "justification": "GitHub Copilot is one of the most widely-used AI developer tools, and the study is from Microsoft Research."
    601     }
    602   },
    603   "hn_data": {
    604     "threads": [
    605       {
    606         "hn_id": "44484075",
    607         "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot (2023)",
    608         "points": 6,
    609         "comments": 0,
    610         "url": "https://news.ycombinator.com/item?id=44484075",
    611         "created_at": "2025-07-06T21:09:52Z"
    612       },
    613       {
    614         "hn_id": "35076049",
    615         "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    616         "points": 4,
    617         "comments": 1,
    618         "url": "https://news.ycombinator.com/item?id=35076049",
    619         "created_at": "2023-03-08T23:07:44Z"
    620       },
    621       {
    622         "hn_id": "40706181",
    623         "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    624         "points": 2,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=40706181",
    627         "created_at": "2024-06-17T14:52:08Z"
    628       }
    629     ],
    630     "top_points": 6,
    631     "total_points": 12,
    632     "total_comments": 1
    633   }
    634 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs