scan.json (23331B)
1 { 2 "paper": { 3 "title": "More code, less validation: Risk factors for over-reliance on AI coding tools among scientists", 4 "authors": ["Gabrielle O'Brien", "Alexis Parker", "Nasir U. Eisty", "Jeffrey Carver"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2512.19644", 8 "doi": "10.48550/arXiv.2512.19644" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "Section 4.3 states 'R scripts for processing data from the survey are provided in the accompanying GitHub repository' and a GitHub URL is provided in Section 4.5 (Code Availability)." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "Data Availability section states 'raw survey responses are not made public. Anonymized data is available to peer reviewers at request.' Available upon request is not public release." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No environment specifications, requirements.txt, or library versions mentioned. Only that analyses were conducted in R." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions provided. Scripts are said to be in a GitHub repository but no README or reproduction guide is described." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": false, 39 "justification": "Results report point estimates (e.g., r = -0.14, p = 0.0009) but no confidence intervals or error bars on main results. The linear model in Table 3 reports standard errors on coefficients but no CIs on the main findings." 
40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": true, 44 "justification": "Extensive use of significance tests: ANOVA (F tests with p-values), polyserial/polychoric correlations with p-values, t-tests for group comparisons, chi-squared tests, and linear model F-statistics throughout Section 2." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Correlation coefficients reported throughout (e.g., polyserial r = -0.12, r = 0.31), R² values for models (R² = 0.04, adjusted R² = 0.035), and percentage of variance explained (9.6%). These provide magnitude context." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "No power analysis or sample size justification. The sample of 868 is a convenience sample; no discussion of whether this is sufficient for the analyses conducted." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Standard deviations reported for key variables (e.g., programming experience σ = 9.20, perceived productivity σ = 0.70) and residual standard error in regression models." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The study compares across groups (experienced vs inexperienced, high vs low development practice adoption, adopters vs non-adopters) and references prior work (Ziegler et al.'s GitHub Copilot study) as comparison." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Comparisons reference contemporary studies: Ziegler et al. (2022), Kumar et al. (2025), O'Brien (2025), Chugunova et al. (2025). The SPACE scale is adapted from recent work." 72 }, 73 "ablation_study": { 74 "applies": false, 75 "answer": false, 76 "justification": "This is a survey study, not a system with components to ablate." 
77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Multiple outcome variables analyzed: usage frequency, tool choice, composite perceived productivity score, individual SPACE dimensions, development practice scores, lines of code accepted." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "Human evaluation of system outputs is not applicable to a survey study." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "Not applicable to a survey study — there is no train/test split." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down by research area (Table 1, Figure 2), position (Figure 3), gender (Table B2, Figure A3), tool type (Table 2), and individual development practices (Figure 4)." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper discusses the low explanatory power of their model (R² = 0.04) explicitly, and Section 2.5 presents reasons for non-adoption including failures and frustrations with tools." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Several negative results reported: no significant overall effect of gender (p = 0.069), no significant effect of research area on perceived productivity (p = 0.125), and low R² of the main model acknowledged." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Abstract claims about adoption patterns, tool preferences, interaction of experience and practices, and lines-of-code as strongest predictor are all supported by results in Sections 2.2-2.4." 
114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper uses language like 'factors associated with' and 'predictors' which is appropriately correlational, but the title 'Risk factors for over-reliance' and discussion of 'over-reliance' imply causal mechanisms from cross-sectional survey data. Section 3.2 acknowledges 'cross-sectional study design precludes strong causal claims' but the framing throughout suggests causation." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "Section 3.2 explicitly limits generalizability: 'our survey sample is non-random, mostly academic, and heavily U.S.-centric, limiting generalizability to non-academic researchers and scientists in the global south.' Section 2.1 emphasizes 'this is a non-random, convenience sample.'" 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "Section 3.1 discusses alternative explanations: users accepting many lines may still modify code substantially; perceived productivity may genuinely reflect benefit for novices, not just automation bias; the SPACE scale may have imperfect reliability. Section 2.4.3 offers an 'ad hoc explanation' for the interaction effect." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper consistently distinguishes 'perceived productivity' from actual productivity. Section 3.1 states 'our survey lacks direct measures of code quality or developer activity' and discusses the proxy gap extensively. The title itself flags 'over-reliance' as the real concern behind perceived productivity gains." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": false, 139 "answer": false, 140 "justification": "This is a survey study; no AI models were used by the researchers in the methodology." 
141 }, 142 "prompts_provided": { 143 "applies": false, 144 "answer": false, 145 "justification": "No prompting used in the methodology. The paper studies others' use of AI tools." 146 }, 147 "hyperparameters_reported": { 148 "applies": false, 149 "answer": false, 150 "justification": "No AI models or hyperparameters involved in the study methodology." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding used in this survey study." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 4.3 documents data handling: inclusion criteria filtering (1272 → 868), log transformations, ordered factor conversion, standardized dictionary for job title recoding. Section 4.2.1 details each exclusion step with counts (283 incomplete, 8 no consent, 102 non-programmers, 9 non-research, 1 quality control)." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3.2 is titled 'Limitations' and provides substantive discussion of study limitations." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3.2 discusses specific threats: cross-sectional design precluding causal inference, non-random U.S.-centric academic sample, recruitment through RSE communities underrepresenting casual programmers, self-report divergence from actual productivity (citing specific studies showing this gap)." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 3.2 explicitly states what results do NOT show: 'limiting generalizability to non-academic researchers and scientists in the global south.' 
Section 3.1 states 'our survey lacks direct measures of code quality or developer activity, we cannot definitively determine how perceived productivity relates to these constructs.'" 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "Raw data not publicly available. 'Anonymized data is available to peer reviewers at request' — not public access." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 4.1 describes the survey instrument in detail (seven sections, Qualtrics platform, 10-15 minute estimated completion). Section 4.2 describes recruitment channels. The full survey is provided as supplementary material." 190 }, 191 "recruitment_methods_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 4.2 describes recruitment in detail: US-RSE Slack and newsletter, pyOpenSci community, University of Michigan mailing list (18,851 addresses), Alfred P. Sloan Foundation newsletter, Learning Engineering Google group. Incentivization via cash drawing ($100 × 5)." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.2.1 documents the full pipeline: 1272 collected → 283 incomplete removed → 8 no consent → 102 non-programmers screened → 1 quality control → 9 non-research screened → 868 final. Section 4.3 documents all transformations." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Acknowledgments section states: 'This work is supported by a grant from the Alfred P. Sloan foundation to G.O.'" 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "All author affiliations are clearly listed: University of Michigan, University of Tennessee, University of Alabama. No commercial affiliations that would create conflicts." 
212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": true, 216 "justification": "Alfred P. Sloan Foundation is a philanthropic organization with no financial stake in whether genAI tools help or harm scientific programming." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": true, 221 "justification": "Ethics declarations section states: 'The authors declare no competing interests.'" 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "This is a survey study, not evaluating a pre-trained model on any benchmark." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "No pre-trained model evaluated on benchmarks." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "No pre-trained model evaluated on benchmarks." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": true, 244 "answer": false, 245 "justification": "No pre-registration mentioned. The study was reviewed by IRB but no pre-registration link (OSF, AsPredicted, etc.) is provided." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 4 states: 'the survey and study plan were reviewed by the University of Michigan Institutional Review Board and deemed to be of minimal risk to participants.'" 251 }, 252 "demographics_reported": { 253 "applies": true, 254 "answer": true, 255 "justification": "Extensive demographics: career stage (Section 2.1), programming experience (median 7 years, µ=9.50, σ=9.20), gender (441 men, 384 women, 24 non-binary), research area (Table 1), institution type, geographic distribution." 
256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": true, 259 "answer": true, 260 "justification": "Section 4.2.1 details inclusion criteria: must program at least sometimes, 18+, informed consent, research group publishes peer-reviewed works. Exclusion counts documented for each criterion." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "This is a cross-sectional survey, not an experimental study with treatment/control conditions. Randomization is not applicable." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "Not applicable to a cross-sectional survey study." 271 }, 272 "attrition_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section 4.2.1 reports: 1272 total responses, 283 incomplete removed, 8 no consent, 102 non-programmers, 9 non-research, 1 quality control → 868 final. Additionally, per-question response counts are reported throughout (e.g., n=760 for usage frequency, n=609 for tool choice)." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "Survey paper — no AI inference costs apply." 283 }, 284 "compute_budget_stated": { 285 "applies": false, 286 "answer": false, 287 "justification": "Survey paper — no significant compute involved." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "GenAI adoption for programming is highest among students and less experienced programmers.", 294 "evidence": "Section 2.2: Student research assistants showed highest adoption (Figure 3). 
Log-years of programming experience inversely related to usage frequency (polyserial r = -0.12, p = 0.001).", 295 "supported": "strong" 296 }, 297 { 298 "claim": "Scientific programmers overwhelmingly prefer general-purpose conversational tools (ChatGPT) over developer-specific tools.", 299 "evidence": "Section 2.3: 472/609 (77.5%) selected general-purpose tools vs 88/609 (14.4%) developer-specific. ChatGPT was primary tool for 391/609 (64.2%). Table 2.", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Both inexperience and limited development practices are associated with greater perceived productivity, and these factors interact.", 304 "evidence": "Table 3: Main effects of development practice score (β = -0.217, p < 0.01) and log-years experience (β = -0.554, p < 0.01) with significant interaction (β = 0.207, p < 0.01). However, overall R² = 0.04.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "The strongest predictor of perceived productivity is the number of lines of generated code typically accepted at once.", 309 "evidence": "Section 2.4.3: polyserial r = 0.31, p = 3.12e-17, explaining 9.6% of variance. Robustness check excluding activity item: r = 0.28, p = 1.01e-13.", 310 "supported": "strong" 311 }, 312 { 313 "claim": "Women tend to report less genAI usage for programming than men.", 314 "evidence": "Section 2.2: β = -0.20778, SE = 0.09006, p = 0.0213. Omnibus effect of gender did not meet significance threshold (p = 0.069). Described as exploratory.", 315 "supported": "moderate" 316 }, 317 { 318 "claim": "Scientific programmers using genAI may gauge productivity by code generation rather than validation.", 319 "evidence": "Section 3.1: Lines accepted negatively correlates with all development practice measures (Figure 5). Users accepting >100 lines report highest productivity but have lowest practice adoption. 
However, this is inferential — the study has no direct measures of code quality.", 320 "supported": "moderate" 321 } 322 ], 323 "methodology_tags": ["observational", "qualitative"], 324 "key_findings": "Survey of 868 scientists who program reveals genAI adoption is highest among students and less experienced programmers, with ChatGPT dominating (64.2%) over developer-specific tools (14.4%). Both programming inexperience and limited use of development practices (testing, code review, version control) are associated with greater perceived productivity, with a significant interaction suggesting practices partially compensate for inexperience. The strongest single predictor of perceived productivity is the volume of code accepted at once (r = 0.31), raising concerns that scientific programmers equate code generation with productivity rather than validation. About 25% of respondents rejected genAI tools, citing concerns about skill development, ethics, no perceived need, inefficiency, and accuracy problems.", 325 "red_flags": [ 326 { 327 "flag": "Non-random convenience sample", 328 "detail": "The sample is heavily skewed toward U.S. academic researchers recruited through RSE mailing lists and one university's employee list (18,851 addresses at University of Michigan). 843/868 respondents are from the U.S. The paper acknowledges this but the title and abstract do not bound the generalization to this population." 329 }, 330 { 331 "flag": "Low explanatory power of main model", 332 "detail": "The main hypothesis-testing model explains only 4% of variance in perceived productivity (R² = 0.04). While the paper acknowledges this, it still frames the results as supporting its hypotheses about risk factors for over-reliance." 333 }, 334 { 335 "flag": "Self-report measures only", 336 "detail": "All measures are self-reported with no behavioral validation. 
The paper frames findings as 'risk factors for over-reliance' but has no measure of actual over-reliance, code quality, or real productivity. The leap from survey correlations to 'over-reliance' in the title is inferential." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "Productivity assessment of neural code completion", 342 "authors": ["Albert Ziegler"], 343 "year": 2022, 344 "relevance": "Foundation for the SPACE perceived productivity scale used in this study; large study of GitHub Copilot users linking experience to perceived productivity." 345 }, 346 { 347 "title": "Intuition to Evidence: Measuring AI's True Impact on Developer Productivity", 348 "authors": ["Aman Kumar"], 349 "year": 2025, 350 "arxiv_id": "2509.19708", 351 "relevance": "Study linking perceived productivity with genAI tools to measurable increases in objective programming activity." 352 }, 353 { 354 "title": "The Widening Gap: The Benefits and Harms of Generative AI for Novice Programmers", 355 "authors": ["James Prather"], 356 "year": 2024, 357 "relevance": "Describes the 'illusion of competence' where novice programmers overestimate their understanding of AI-generated code." 358 }, 359 { 360 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 361 "authors": ["Arghavan Moradi Dakhel"], 362 "year": 2023, 363 "relevance": "Empirical evaluation of Copilot's code quality and risks of uncritical acceptance." 364 }, 365 { 366 "title": "Speed at the Cost of Quality? The Impact of LLM Agent Assistance on Software Development", 367 "authors": ["Hao He", "Courtney Miller", "Shyam Agarwal", "Christian Kästner", "Bogdan Vasilescu"], 368 "year": 2025, 369 "relevance": "Study finding that Cursor-associated projects show initial activity bursts but declining quality and maintainability." 
370 }, 371 { 372 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 373 "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"], 374 "relevance": "Study that failed to find meaningful productivity increases with AI tools, providing counterevidence to productivity claims." 375 }, 376 { 377 "title": "Developer Productivity With and Without GitHub Copilot: A Longitudinal Mixed-Methods Case Study", 378 "authors": ["Viktoria Stray"], 379 "year": 2025, 380 "arxiv_id": "2509.20353", 381 "relevance": "Longitudinal case study examining Copilot's impact on developer productivity with mixed-methods approach." 382 }, 383 { 384 "title": "How Scientists Use Large Language Models to Program", 385 "authors": ["Gabrielle O'Brien"], 386 "year": 2025, 387 "relevance": "Prior study by first author on scientific programmers' use of LLMs, informing the survey design." 388 }, 389 { 390 "title": "Vibe Coding in Practice: Motivations, Challenges, and a Future Outlook - a Grey Literature Review", 391 "authors": ["Alaa Fawzy", "Amjed Tahir", "Kelly Blincoe"], 392 "year": 2025, 393 "relevance": "Grey literature review on vibe coding practices with genAI tools, relevant to how users interact with AI coding assistants." 394 }, 395 { 396 "title": "Self-Admitted GenAI Usage in Open-Source Software", 397 "authors": ["Tao Xiao"], 398 "year": 2025, 399 "arxiv_id": "2507.10422", 400 "relevance": "Study on identifying genAI-originated code in open-source, relevant to detecting AI tool usage patterns." 401 }, 402 { 403 "title": "A survey of the state of the practice for research software in the United States", 404 "authors": ["Jeffrey C. Carver", "Nic Weber", "Karthik Ram", "Sandra Gesing", "Daniel S. Katz"], 405 "year": 2022, 406 "relevance": "Foundation survey instrument for development practice adoption among scientific programmers, directly adapted in this study." 407 } 408 ] 409 }