scan.json (22571B)
1 { 2 "paper": { 3 "title": "Significant Productivity Gains through Programming with Large Language Models", 4 "authors": ["Thomas Weber", "Maximilian Brandmaier", "Albrecht Schmidt", "Sven Mayer"], 5 "year": 2024, 6 "venue": "Proc. ACM Hum.-Comput. Interact. (EICS)", 7 "doi": "10.1145/3661145" 8 }, 9 "scan_version": 2, 10 "active_modules": [], 11 "methodology_tags": ["rct", "qualitative"], 12 "key_findings": "In a within-subjects study (N=24), both auto-complete (GitHub Copilot) and conversational (GPT-3) AI assistants significantly increased developer productivity (~65% more requirements implemented per minute) compared to browser-only baseline. Code quality (correctness, maintainability index) did not differ significantly across conditions. Distinct usage patterns emerged: auto-complete favored many small code snippets while conversational systems produced fewer but larger code blocks, essentially replacing browser search.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides an OSF link (https://osf.io/a3vxc/) containing experimental setup, datasets, and analysis scripts under 'Open Science' section." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states 'The underlying anonymized data is available as part of the supplementary material' and links to the OSF repository." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The VM setup is described (10 vCores, 18GB RAM, Windows, VSC, Chrome) but no environment specification files (requirements.txt, etc.) for reproducing the analysis are mentioned in the paper." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "While the OSF repository is provided, the paper itself does not include step-by-step reproduction instructions for the analysis pipeline." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Results are reported as means and medians with boxplots showing distributions, but no confidence intervals or error bars (±) are provided for the main results." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "Extensive statistical testing: Friedman tests with Wilcoxon signed-rank post-hoc tests or repeated measures ANOVAs with t-tests, Bonferroni-corrected p-values throughout (Section 4, Table 2)." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Effect sizes reported: Kendall's W for Friedman tests (e.g., W=0.447 for AttrakDiff) and η² for ANOVAs (Table 2). Also concrete magnitudes like '65% more requirements'; the '55.8% faster' figure is the result of Peng et al. (2023) cited as related work, not an effect measured in this study." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No power analysis or justification for N=24. The sample size is not discussed as potentially insufficient for detecting effects." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Standard deviations reported (e.g., age SD: 3.6), distributions shown via boxplots with outliers, and spread measures visible in figures throughout Section 4." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Three conditions compared: Baseline (browser only), Auto-complete (Copilot), and Conversational (GPT-3). The browser-only baseline represents traditional development." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "GitHub Copilot and GPT-3 were contemporary AI assistants at the time of the study. The browser baseline is the standard practice being compared against." 
73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "This is a comparative user study, not a system with modular components to ablate." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics across all five SPACE dimensions: AttrakDiff satisfaction, code correctness, maintainability index, character output per minute, requirements per minute, code snippet counts, and more (Table 2)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "The entire study is a human evaluation — participants completed programming tasks and provided subjective assessments via AttrakDiff and Likert scales." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is a user study, not a machine learning evaluation requiring train/test splits." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results broken down across all five SPACE dimensions (Satisfaction, Performance, Activity, Communication, Efficiency) with sub-metrics in each (Figures 5-10, Table 2)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 4.7 discusses negative qualitative feedback: struggles understanding generated code (5 participants), incorrect code generation (2), context-switching overhead (6), information overload (3). Section 4.8 analyzes individual outlier cases." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Multiple negative findings: no significant code quality improvement from AI (Section 4.2), overtrust in AI-generated code observed, conversational system's context-switching overhead criticized, some participants performed worse with AI." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims of 'significantly increasing productivity metrics' and 'distinctive usage patterns' are supported by statistical tests in Section 4.5 and usage pattern analysis in Section 4.4/4.8." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The within-subjects design with Latin Square counterbalancing for order and task-condition assignment supports causal claims about AI assistants affecting productivity. Randomization described in Section 3.3." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "Title claims 'Significant Productivity Gains through Programming with Large Language Models' broadly, but results are from 24 participants doing 3 Python tasks in 15 minutes. Section 5.4 discusses some limitations but the title and abstract overreach." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 5.4 discusses several alternative explanations: novelty effect, sample skewing young, task scope limitations, model-specific effects, and that different models were used across conditions." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper explicitly uses the SPACE framework and discusses multiple dimensions of productivity rather than equating a single metric with 'productivity'. Section 2.2 extensively discusses what productivity means and why multiple measures are needed." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper says 'GPT-3' and 'GitHub Copilot' without specifying exact model versions, API snapshot dates, or Codex version used by Copilot." 
142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "The study evaluates human participants using AI tools — the prompts are generated by participants during the study, not by the researchers. The researchers did not design prompts for the AI systems." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "No mention of temperature, top-p, or other generation parameters for either GPT-3 or Copilot." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper evaluates Copilot and GPT-3 as third-party black-box tools integrated via VSC extensions. Authors cannot describe internal scaffolding." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3.5 describes how metrics were collected: character counting excluding comments, normalization by time, snippet counting differentiated by source (browser vs AI). Data pipeline from screen recordings and interaction logs is documented." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 5.4 'Limitations' provides a dedicated, substantive discussion of study limitations spanning over a page." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5.4 discusses specific threats: young participant sample, constrained task scope vs real-world development, probabilistic nature of LLMs meaning different models could produce different results, tasks not requiring creative decomposition." 
174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 5.4 explicitly states limitations: tasks were limited in scope, did not test group dynamics, did not test long-term usage, deferred task decomposition to future work, and acknowledges results are 'a snapshot of the current time.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "Anonymized raw data available at OSF repository (https://osf.io/a3vxc/) as stated in the paper and 'Open Science' section." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3 describes the full data collection procedure: VM setup, screen recording, automatic interaction logging, survey instruments (AttrakDiff), and supervised sessions." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 3.1: recruited via social media, mailing lists of three CS/SE institutions, and direct contact with industry professionals. Selection bias acknowledged by noting young-skewing sample." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 3.5 documents the full pipeline: interaction recording → character counting → normalization by time, code snippet extraction differentiated by source, unit testing for correctness, static analysis for maintainability. 5 exclusions explained (Section 3.1)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding or acknowledgments section mentioning grants or sponsors is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All authors are listed with LMU Munich affiliations. 
No evaluated product is their own — they evaluate third-party tools (GitHub Copilot, GPT-3)." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This is a user study evaluating human productivity with AI tools, not a benchmark evaluation of model capability. Contamination is not relevant." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not a benchmark evaluation of model capability." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not a benchmark evaluation of model capability." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": false, 246 "justification": "No mention of pre-registration. The OSF link is for data/materials sharing, not a pre-registration." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": false, 251 "justification": "No mention of IRB or ethics board approval despite collecting data from 29 human participants." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": true, 256 "justification": "Section 3.1: age (mean 26.8, SD 3.6), gender (4 female, 20 male), education levels (5 Master's, 16 Bachelor's, 3 current students), 9 professional developers, self-assessed experience levels (Figure 2)." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": true, 261 "justification": "Section 3.1: required basic Python knowledge, AI tool experience explicitly not required. 
5 participants excluded for not completing tasks or violating study constraints (using ChatGPT in baseline)." 262 }, 263 "randomization_described": { 264 "applies": true, 265 "answer": true, 266 "justification": "Latin Square method used for counterbalancing task-condition combinations and order (Section 3.3, 3.4). This is a within-subjects design so all participants experienced all conditions." 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": false, 271 "justification": "Participants necessarily knew which condition they were in (browser only vs. Copilot vs. chatbot). No discussion of blinding or its infeasibility." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 3.1: 29 started, 5 excluded (reasons given: didn't complete tasks or violated constraints by using ChatGPT in baseline condition), 24 analyzed." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is a user study evaluating human productivity, not proposing a method with inference costs." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "This is a user study, not a computational method. VM specs described for study apparatus but not as a compute budget." 
289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "Both AI assistants significantly increased the number of requirements implemented per minute compared to the browser-only baseline (~65% improvement).", 295 "evidence": "Section 4.5, Figure 9: Friedman test significant (χ²=6.583, p=0.037), pairwise Wilcoxon tests significant for both AI conditions vs baseline (p<0.001).", 296 "supported": "strong" 297 }, 298 { 299 "claim": "AI assistants did not significantly affect code quality (correctness or maintainability index).", 300 "evidence": "Section 4.2: Friedman test not significant for correctness (χ²=0.636, p=0.728) or maintainability index (χ²=0.065, p=0.967). Kolmogorov-Smirnov tests likewise detected no significant difference between the distributions (Table 3), which indicates no detectable difference rather than proven equivalence.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Auto-complete favored many small code snippets while conversational systems produced fewer but larger snippets.", 305 "evidence": "Section 4.4: Auto-complete mean 12.6 snippets (median 41.8 chars each) vs conversational mean 5.0 snippets (median 110.9 chars each), p<0.001 for both comparisons.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "User satisfaction was significantly higher for both AI conditions compared to baseline.", 310 "evidence": "Section 4.1, Figure 5: AttrakDiff significant differences for hedonic quality (p<0.001 both), attractiveness (p=0.001, p=0.003). 
Kendall's W=0.447.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "Participants showed overtrust in AI-generated code, rating AI-assisted code quality higher despite no objective quality difference.", 315 "evidence": "Section 4.6 (self-assessment) vs Section 4.2 (objective measures): subjective performance ratings significantly higher for AI conditions (Table 4, p<0.001) while objective metrics showed no difference.", 316 "supported": "moderate" 317 } 318 ], 319 "red_flags": [ 320 { 321 "flag": "Small sample size for subgroup claims", 322 "detail": "N=24 is adequate for main within-subjects comparisons but the individual differences analysis (Section 4.8) draws conclusions from 6 outlier participants without statistical power for these subgroup claims." 323 }, 324 { 325 "flag": "No IRB/ethics approval mentioned", 326 "detail": "A study with 29 human participants collecting behavioral and demographic data does not mention ethics board approval." 327 }, 328 { 329 "flag": "No blinding discussion", 330 "detail": "Participants knew which condition they were in, which could affect effort and engagement. The novelty/excitement of AI tools may inflate satisfaction and efficiency measures." 331 }, 332 { 333 "flag": "Short task duration limits ecological validity", 334 "detail": "15-minute tasks with predefined subtasks do not reflect real software development where AI tools might have different effects on larger, more complex, and less structured problems. The paper acknowledges this." 335 }, 336 { 337 "flag": "Different underlying models across conditions", 338 "detail": "GPT-3 (conversational) and Codex (Copilot auto-complete) are related but different models, confounding the interaction-style comparison with model-capability differences. Acknowledged in Section 5.4." 
339 } 340 ], 341 "cited_papers": [ 342 { 343 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 344 "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], 345 "year": 2023, 346 "doi": "10.48550/ARXIV.2302.06590", 347 "relevance": "RCT measuring Copilot's impact on productivity, finding 55.8% faster task completion — directly comparable methodology." 348 }, 349 { 350 "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models", 351 "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"], 352 "year": 2022, 353 "doi": "10.1145/3491101.3519665", 354 "relevance": "Within-subjects study finding Copilot did not significantly improve task completion time due to overreliance and debugging issues." 355 }, 356 { 357 "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models", 358 "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"], 359 "year": 2022, 360 "doi": "10.48550/ARXIV.2206.15000", 361 "relevance": "Qualitative study identifying acceleration vs exploration usage modes for Copilot." 362 }, 363 { 364 "title": "Reading Between the Lines: Modeling User Behavior and Costs in AI-Assisted Programming", 365 "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"], 366 "year": 2022, 367 "doi": "10.48550/ARXIV.2210.14306", 368 "relevance": "Observational study modeling developer behavior with Copilot, finding verification dominates time." 369 }, 370 { 371 "title": "The Programmer's Assistant: Conversational Interaction with a Large Language Model for Software Development", 372 "authors": ["Steven I. Ross", "Fernando Martinez", "Stephanie Houde", "Michael Muller", "Justin D. Weisz"], 373 "year": 2023, 374 "doi": "10.1145/3581641.3584037", 375 "relevance": "User study of conversational LLM coding assistant, finding complementary support to auto-complete and search." 
376 }, 377 { 378 "title": "Productivity Assessment of Neural Code Completion", 379 "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X. Alice Li"], 380 "year": 2022, 381 "doi": "10.1145/3520312.3534864", 382 "relevance": "Analysis of 2047 Copilot users showing acceptance rate predicts perceived productivity." 383 }, 384 { 385 "title": "Taking Flight with Copilot: Early Insights and Opportunities of AI-Powered Pair-Programming Tools", 386 "authors": ["Christian Bird", "Denae Ford", "Thomas Zimmermann", "Nicole Forsgren"], 387 "year": 2023, 388 "doi": "10.1145/3582083", 389 "relevance": "Mixed-methods study of early Copilot adoption finding improved perceived productivity." 390 }, 391 { 392 "title": "The SPACE of Developer Productivity: There's more to it than you think", 393 "authors": ["Nicole Forsgren", "Margaret-Anne D. Storey", "Chandra Shekhar Maddila", "Thomas Zimmermann"], 394 "year": 2021, 395 "doi": "10.1145/3454122.3454124", 396 "relevance": "Defines the SPACE framework for holistic productivity measurement used as the theoretical basis in this study." 397 }, 398 { 399 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 400 "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam", "Foutse Khomh"], 401 "year": 2023, 402 "doi": "10.1016/j.jss.2023.111734", 403 "relevance": "Evaluates Copilot code quality, finding it produces code at least on par with human developers." 404 }, 405 { 406 "title": "Evaluating Large Language Models Trained on Code", 407 "authors": ["Mark Chen", "Jerry Tworek"], 408 "year": 2021, 409 "relevance": "Introduces Codex (the model behind Copilot) and HumanEval benchmark for code generation evaluation." 410 } 411 ] 412 }