scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23320B)
      1 {
      2   "paper": {
      3     "title": "Developer Productivity with GenAI",
      4     "authors": [
      5       "Sadia Afroz",
      6       "Zixuan Feng",
      7       "Katie Kimura",
      8       "Bianca Trinkenreich",
      9       "Igor Steinmacher",
     10       "Anita Sarma"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2510.24265",
     15     "doi": "10.48550/arXiv.2510.24265"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [],
     19   "methodology_tags": [
     20     "observational",
     21     "qualitative"
     22   ],
     23   "key_findings": "A survey of 415 software practitioners using the SPACE framework found that GenAI adoption has not produced substantial productivity changes across any dimension. Frequent AI users reported slightly higher efficiency and satisfaction but no gains in performance, activity, or collaboration. The paper identifies a 'productivity paradox' where developers become faster but do not necessarily create better software or feel more fulfilled.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No repository URL or code archive is provided in the paper. The supplementary material reference [1] points to a Zenodo DOI for the appendix/questionnaire, but no analysis code is released."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No survey response data is released. The supplementary material [1] contains only the questionnaire appendix, not the collected data."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No environment or dependency specifications are provided. The analysis approach (descriptive statistics, stacked bar charts, violin plots) is described but no software environment is specified."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No confidence intervals or error bars are reported. The paper presents only descriptive statistics (medians, percentages) via stacked bar charts and violin plots with no uncertainty quantification."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are used. The paper compares frequent vs. non-frequent AI users using only visual inspection of stacked bar charts and violin plots, with no formal tests despite making comparative claims."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No effect sizes are reported. Differences between groups are described qualitatively ('slightly higher') without quantifying the magnitude."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The sample size of 415 is reported but not justified. No power analysis or rationale for why this N is sufficient is provided."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Violin plots show distribution shapes but no standard deviations, IQRs, or other formal spread measures are reported in text or tables."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The study compares frequent AI users against non-frequent AI users as a baseline comparison group across all SPACE dimensions."
     79       },
     80       "baselines_contemporary": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Not applicable — this is a survey study, not a system evaluation. The comparison groups are contemporary by design."
     84       },
     85       "ablation_study": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — no system with components to ablate."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The study measures productivity across 5 SPACE dimensions with 20 individual items (S1-S4, P1-P3, A1-A7, C1-C4, E1-E2)."
     94       },
     95       "human_evaluation": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Not applicable — this is a survey study measuring perceptions, not evaluating system outputs."
     99       },
    100       "held_out_test_set": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "Not applicable — no ML model or test set involved."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by each SPACE dimension and by individual survey items within each dimension (Figures 1-6)."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper discusses where GenAI fails to help: no improvement in collaboration (Observation 5), no improvement in test success or learning pace (Observation 3), continued developer fatigue (Observation 2)."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The core finding is essentially negative: GenAI adoption has not produced substantial productivity changes. The paper reports limited impact across most SPACE dimensions."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 'limited overall productivity change' and a 'productivity paradox,' both supported by the survey results showing most responses in the neutral/no-change range across SPACE dimensions."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper uses language like 'GenAI adoption affects developer productivity' (RQ) and 'GenAI tools streamline routine coding tasks' (Observation 4), implying causal relationships from a cross-sectional survey that cannot establish causation. Self-selection bias (more productive developers may adopt AI more) is not addressed."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title 'Developer Productivity with GenAI' is broad, but the sample is drawn from 56 OSS communities (not representative of all developers). The paper does not bound its claims to OSS developers or acknowledge this limitation."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations such as self-selection bias (productive developers adopt AI more), novelty effects, or confounding variables like experience level or task complexity driving the observed patterns."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Section 6 explicitly discusses the gap between perceived speed and actual productivity: 'Does faster task completion truly represent progress, or merely shift effort without improving outcomes?' The paper acknowledges it measures perceptions, not objective productivity."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "Not applicable — the study surveys developers about their GenAI usage generally; it does not use specific AI models in experiments."
    153       },
    154       "prompts_provided": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "Not applicable — no prompting is used in this study."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "Not applicable — no AI models are run as part of this study."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "Not applicable — no agentic scaffolding is used."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper states 688 responses were received and 'after removing invalid entries, we kept 415 responses' but does not describe the criteria for determining invalidity or the filtering process."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section (Section 6) acknowledges self-reported data limitations in one sentence but does not provide substantive discussion."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No specific threats to validity are discussed. The paper only mentions 'such perceptions may not fully align with objective productivity outcomes' without addressing specific threats like self-selection bias, response bias, or sample representativeness."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No explicit scope boundaries are stated. The paper does not specify what its results do NOT show or what populations/settings are excluded from its claims."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Raw survey response data is not made available. Only aggregated results are presented in the figures."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 3.1.2 describes the data collection procedure: email invitations to 56 OSS communities, two-week collection window, anonymization per GDPR and IRB approval, $50 raffle incentive."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 3.1.2 describes recruitment from 56 OSS communities including organizational repositories (IBM, Oracle, Google, Adobe), infrastructure projects, AI projects, and data science communities."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The pipeline from 688 to 415 responses is mentioned but the filtering criteria for 'removing invalid entries' are not described. The 273 removed responses (40%) are unexplained."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding sources are disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "All authors' affiliations are clearly listed: Oregon State University, Colorado State University, Northern Arizona University."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not the same as absence of conflict."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — this is a survey study that does not evaluate any pre-trained model on a benchmark."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Not applicable — no model evaluation on benchmarks."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Not applicable — no benchmark evaluation."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry is provided."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 3.1 states 'The protocol was approved by our university's IRB.'"
    263       },
    264       "demographics_reported": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Section 3.1.2 reports demographics: gender (90.6% men), organization size (57.83% large/extra-large), experience (82.17% >5 years), roles (36.87% full-stack, 16.87% backend, 15.42% data/ML)."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No explicit inclusion or exclusion criteria are stated. The paper describes recruiting from OSS communities but does not specify who was eligible or what made an entry 'invalid' (273 responses removed)."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "Not applicable — this is a cross-sectional survey, not an experimental study with randomized assignment."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Not applicable — cross-sectional survey, not an experimental study."
    283       },
    284       "attrition_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "The paper reports 688 responses received and 415 kept after removing invalid entries, documenting attrition from initial responses to final sample."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "Not applicable — this is a survey study with no computational method."
    295       },
    296       "compute_budget_stated": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "Not applicable — survey study."
    300       }
    301     }
    302   },
    303   "claims": [
    304     {
    305       "claim": "GenAI adoption has not produced substantial positive or negative shifts in perceived productivity across SPACE dimensions",
    306       "evidence": "Figure 1 shows all median values remain within the neutral ('no-change') range for both frequent and non-frequent AI user groups. Section 4, Observation 1.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Frequent GenAI use increases perceived coding output volume (72.7% report more lines of code changed per day) but does not improve test success or learning pace",
    311       "evidence": "Figure 3: 72.7% of frequent users reported more LOC changes, but a majority reported no change or worse for test pass rate (P2) and API methods learned (P3). Section 4.1, Observation 3.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "GenAI tools have not reshaped team communication or collaboration",
    316       "evidence": "Figure 5: >75% of developers report no-change or negative across all communication items (C1-C4) regardless of AI usage frequency. Section 4.1, Observation 5.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "GenAI tools streamline routine coding tasks but their impact on evaluative tasks (code review) remains limited",
    321       "evidence": "Figure 4: 84.3% of frequent users indicated AI did not reduce time spent on code reviews (A7). Section 4.1, Observation 4.",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "A 'productivity paradox' exists where developers become faster but do not necessarily create better software or feel more fulfilled",
    326       "evidence": "Section 6 discussion synthesizing results across all SPACE dimensions, noting speed gains without quality or satisfaction improvements.",
    327       "supported": "weak"
    328     }
    329   ],
    330   "red_flags": [
    331     {
    332       "flag": "No statistical tests",
    333       "detail": "The paper makes comparative claims between frequent and non-frequent AI users (e.g., 'slightly higher median scores') without any statistical significance tests. All comparisons are based on visual inspection of bar charts and violin plots."
    334     },
    335     {
    336       "flag": "Large unexplained attrition",
    337       "detail": "40% of responses (273 of 688) were removed as 'invalid entries' with no description of the invalidity criteria. This is a substantial and unexplained data loss that could introduce bias."
    338     },
    339     {
    340       "flag": "Self-selection bias unaddressed",
    341       "detail": "Developers who use GenAI frequently may differ systematically from those who don't (e.g., in skill level, openness to tools, task types). The cross-sectional survey design cannot disentangle these confounds, yet the paper implies causal effects of GenAI on productivity."
    342     },
    343     {
    344       "flag": "Recruitment bias",
    345       "detail": "The sample was drawn from 56 OSS communities, including organizational repositories of major tech companies, and is 90.6% male, with 82.17% of respondents having more than 5 years of experience. Results are presented as general 'developer productivity' findings without bounding to this specific population."
    346     },
    347     {
    348       "flag": "No limitations section",
    349       "detail": "This 6-page survey paper with 415 participants has no dedicated limitations or threats-to-validity section. The one-sentence acknowledgment that 'perceptions may not fully align with objective productivity outcomes' is insufficient."
    350     },
    351     {
    352       "flag": "Publication metadata errors",
    353       "detail": "The ACM reference format shows placeholder text ('Conference acronym XX, June 03-05, 2018, Woodstock, NY') and placeholder DOIs, suggesting the paper may be an early preprint with incomplete preparation."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    359       "authors": [
    360         "Joel Becker",
    361         "Nate Rush",
    362         "Elizabeth Barnes",
    363         "David Rein"
    364       ],
    365       "year": 2025,
    366       "arxiv_id": "2507.09089",
    367       "relevance": "RCT measuring AI impact on developer productivity, finding developers 19% slower with AI."
    368     },
    369     {
    370       "title": "Sea change in software development: Economic and productivity analysis of the ai-powered developer lifecycle",
    371       "authors": [
    372         "Thomas Dohmke",
    373         "Marco Iansiti",
    374         "Greg Richards"
    375       ],
    376       "year": 2023,
    377       "arxiv_id": "2306.15033",
    378       "relevance": "Reports GitHub Copilot completing tasks 55.8% faster; key claim about AI-assisted developer productivity."
    379     },
    380     {
    381       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    382       "authors": [
    383         "Sida Peng",
    384         "Eirini Kalliamvakou",
    385         "Peter Cihon",
    386         "Mert Demirer"
    387       ],
    388       "year": 2023,
    389       "arxiv_id": "2302.06590",
    390       "relevance": "Seminal study on GitHub Copilot's productivity impact with empirical evidence."
    391     },
    392     {
    393       "title": "How much does AI impact development speed? An enterprise RCT",
    394       "authors": [
    395         "Elise Paradis",
    396         "Kate Grey",
    397         "Quinn Madison"
    398       ],
    399       "year": 2025,
    400       "relevance": "Enterprise RCT at Google finding AI assistance reduced coding task time by ~21%."
    401     },
    402     {
    403       "title": "The SPACE of Developer Productivity: There's more to it than you think",
    404       "authors": [
    405         "Nicole Forsgren",
    406         "Margaret-Anne Storey",
    407         "Chandra Maddila",
    408         "Thomas Zimmermann",
    409         "Brian Houck",
    410         "Jenna Butler"
    411       ],
    412       "year": 2021,
    413       "relevance": "The SPACE productivity framework used as the analytical lens in this study; foundational for multidimensional productivity measurement."
    414     },
    415     {
    416       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    417       "authors": [
    418         "Priyan Vaithilingam",
    419         "Tianyi Zhang",
    420         "Elena L Glassman"
    421       ],
    422       "year": 2022,
    423       "relevance": "Found higher task-failure rates and no significant improvement in completion time with LLM code generation tools."
    424     },
    425     {
    426       "title": "Beyond code generation: An observational study of chatgpt usage in software engineering practice",
    427       "authors": [
    428         "Ranim Khojah",
    429         "Mazen Mohamad",
    430         "Philipp Leitner",
    431         "Francisco de Oliveira Neto"
    432       ],
    433       "year": 2024,
    434       "relevance": "Observational study of ChatGPT usage patterns in SE practice."
    435     },
    436     {
    437       "title": "Will I be replaced? Assessing ChatGPT's effect on software development and programmer perceptions of AI tools",
    438       "authors": [
    439         "Mohammad Amin Kuhail",
    440         "Sujith Samuel Mathew",
    441         "Ashraf Khalil",
    442         "Jose Berengueres",
    443         "Syed Jawad Hussain Shah"
    444       ],
    445       "year": 2024,
    446       "relevance": "Found over-reliance on AI may erode developers' coding proficiency and critical thinking."
    447     },
    448     {
    449       "title": "Generative artificial intelligence for software engineering—A research agenda",
    450       "authors": [
    451         "Anh Nguyen-Duc",
    452         "Beatriz Cabrero-Daniel",
    453         "Adam Przybylek"
    454       ],
    455       "year": 2025,
    456       "relevance": "Research agenda for GenAI in software engineering, contextualizing rapid adoption patterns."
    457     },
    458     {
    459       "title": "Productivity assessment of neural code completion",
    460       "authors": [
    461         "Albert Ziegler",
    462         "Eirini Kalliamvakou",
    463         "X Alice Li",
    464         "Andrew Rice"
    465       ],
    466       "year": 2022,
    467       "relevance": "Empirical productivity assessment of neural code completion tools."
    468     }
    469   ],
    470   "engagement_factors": {
    471     "practical_relevance": {
    472       "score": 1,
    473       "justification": "Offers framework-level insights about GenAI productivity but no actionable techniques or tools practitioners can directly apply."
    474     },
    475     "surprise_contrarian": {
    476       "score": 2,
    477       "justification": "The 'productivity paradox' finding that GenAI hasn't meaningfully improved developer productivity contradicts the dominant industry narrative of massive AI-driven gains."
    478     },
    479     "fear_safety": {
    480       "score": 0,
    481       "justification": "No safety, security, or risk angle is addressed in the paper."
    482     },
    483     "drama_conflict": {
    484       "score": 1,
    485       "justification": "Mildly questions the AI productivity hype promoted by GitHub/Microsoft but doesn't directly challenge specific company claims with strong evidence."
    486     },
    487     "demo_ability": {
    488       "score": 0,
    489       "justification": "Survey-based study with no code, tool, or demo to interact with."
    490     },
    491     "brand_recognition": {
    492       "score": 1,
    493       "justification": "Authors are from recognized universities (Oregon State, Colorado State, NAU) but not famous AI labs; the topic touches well-known tools like Copilot but only tangentially."
    494     }
    495   }
    496 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs