scan.json (20649B)
1 { 2 "paper": { 3 "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence", 4 "authors": ["Shakked Noy", "Whitney Zhang"], 5 "year": 2023, 6 "venue": "Working Paper (MIT)", 7 "doi": "10.1126/science.adh2586" 8 }, 9 "scan_version": 2, 10 "active_modules": [], 11 "methodology_tags": ["rct"], 12 "key_findings": "In a preregistered RCT with 444 college-educated professionals, ChatGPT access reduced task completion time by 0.8 SDs and increased output quality by 0.4 SDs. ChatGPT compressed the productivity distribution, benefiting low-ability workers more and halving the correlation between first-task and second-task grades. ChatGPT primarily substituted for worker effort rather than complementing skills, with 68% of treated participants submitting unedited ChatGPT output.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, code release, or data archive mentioned in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No dataset download link or data release mentioned. The paper references an Online Appendix but does not provide a data archive." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No computational environment or software dependency details provided. The experiment is an online survey, but no platform or analysis software details are given." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions provided. The experimental design is described, and materials are referenced in an Online Appendix, but no replication package is offered." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": true, 40 "justification": "95% confidence intervals are reported for main treatment effects (e.g., Figure 1: 'Treatment Effect: -0.83 SDs, 95% CI: [-0.63, -1.03]') and error bars shown on figures." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "P-values reported for main results (e.g., time: p=0.000, grades: p=0.000, inequality slope difference: p=0.004, worry: p=0.006, excitement: p=0.000, optimism: p=0.037)." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Effect sizes reported in standard deviations throughout (e.g., -0.83 SDs for time, 0.45 SDs for grades, 0.40 SDs for job satisfaction). Also reports 37% reduction in time and absolute grade differences." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No power analysis or justification for the sample size of 444 participants. The number appears to be determined by recruitment capacity rather than statistical planning." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Standard deviations reported in Table 1 for all descriptive statistics. Standard errors reported for regression slopes (e.g., Figure 2: 'Slope: 0.491 (SE 0.053)')." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Control group serves as the baseline. Also compares pre-treatment and post-treatment performance within subjects. Raw ChatGPT output is evaluated as an additional comparison point." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The control group is the appropriate baseline for an RCT. ChatGPT was the most prominent generative AI tool at the time." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Multiple supplementary arms probe specific mechanisms: a fixed-time arm (15 minutes, isolating quality effects from time savings), a revision arm (showing first-task output and allowing editing), and comparison of linear vs. convex incentive schemes." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics: time taken, overall grades, writing quality, content quality, originality, earnings per minute, job satisfaction, self-efficacy, automation beliefs." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Blinded experienced professionals in the same occupations evaluated outputs. Each piece received three evaluations with average cross-evaluator correlation of 0.44. Evaluators were incentivized." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "Not applicable — this is a human subjects RCT, not a machine learning benchmark evaluation." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results broken down by grade distribution (Figure 2), by incentive scheme (linear vs. convex), by task component (brainstorming, rough-drafting, editing), by writing skill terciles, and by pre-treatment ability level." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Discusses cases where ChatGPT doesn't help: the complementarity hypothesis finds no evidence (human editing doesn't improve ChatGPT output), and follow-up survey reveals participants not using ChatGPT because it lacks context-specific knowledge." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Several null/negative results reported: no evidence of human-machine complementarity, no heterogeneity by relative writing skills, no effect on real job satisfaction at two-week follow-up, fixed-time arm imprecisely estimated (p=0.13)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims (0.8 SD time decrease, 0.4 SD quality increase, inequality compression, substitution rather than complementarity, job satisfaction/self-efficacy effects) are all supported by corresponding results sections with statistics." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "Causal claims justified by RCT design with random assignment. Pre-registered at AEA RCT Registry (AEARCTR-0010882). Balance tests reported in Table 1. Lee bounds and robustness checks for selective attrition reported." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "Discussion section explicitly enumerates limitations: tasks are short and self-contained, lack context-specific knowledge, only capture direct/immediate effects, and results may vary by occupation/task/skill level. Acknowledges experiment inflates ChatGPT's usefulness." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Discusses selective attrition (10% vs 5%) with Lee bounds, potential control group contamination (10-20% used ChatGPT, making estimates lower bounds), novelty effects, and placebo effects from the Overleaf control condition." 
130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper defines productivity as 'earnings per minute' and acknowledges that short, self-contained tasks may inflate estimates of ChatGPT's usefulness. Discussion notes tasks lack context-specific knowledge that real work requires, and follow-up survey confirms participants find ChatGPT less useful for real tasks (3.65/5 vs 4.4/5)." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper refers only to 'ChatGPT' without specifying the version (GPT-3.5 vs GPT-4) or snapshot date. Given the March 2023 date, this was likely GPT-3.5, but it is not stated." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "Participants freely prompted ChatGPT themselves — there are no researcher-designed prompts to report. The task prompts given to participants are referenced in the Online Appendix." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "Participants used ChatGPT through its default web interface. No hyperparameter tuning by researchers." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding used. Participants used ChatGPT's web interface directly." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Data pipeline described: participant output collected with minute-by-minute snapshots, three blinded evaluators per piece, cross-evaluator correlation reported (0.44), objective time measure constructed from snapshots, control group ChatGPT usage detected." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Discussion section (Section 3) contains substantive limitations discussion spanning multiple paragraphs with specific limitations enumerated." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Specific threats discussed: tasks are short and self-contained, lack context-specific knowledge (which may inflate estimates), differential attrition (10% vs 5%), control group contamination. Follow-up survey data used to test external validity." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Explicitly states: 'an experiment, by its nature, captures only direct, immediate effects on the selected occupations. There will be many indirect, reinforcing, or counteracting general-equilibrium effects.' Also notes variation by occupation, task, and skill level." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw data or data archive provided. Only aggregated statistics and figures in the paper." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Detailed description: online experiment with 444 college-educated professionals across six occupations, occupation-specific writing tasks, 20-30 minute assignments, high-powered bonus incentives, minute-by-minute output snapshots, three blinded evaluators per output." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "The paper does not describe how the 444 professionals were recruited. It states they are 'college-educated professionals' but does not describe the recruitment platform, channels, or process." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Pipeline documented: recruitment → random assignment → first task → treatment intervention (ChatGPT signup vs. Overleaf signup) → second task → evaluation by three blinded professionals → follow-up survey at two weeks. Attrition rates reported by group." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Funding disclosed: Emergent Ventures grant, George and Obie Shultz Fund, NSF Graduate Research Fellowship (Grant No. 1745302)." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Both authors listed as MIT affiliates. No product affiliation conflict — they are academic researchers, not employees of OpenAI." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "Funders (Emergent Ventures, NSF, Shultz Fund) provide general research grants and have no commercial stake in ChatGPT's performance." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement included in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This is an RCT studying productivity effects of ChatGPT use, not evaluating model capability on a benchmark. Contamination is not applicable." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not a benchmark evaluation — no train/test overlap concern." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not a benchmark evaluation — contamination not applicable." 
240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": true, 246 "justification": "Pre-registered at AEA RCT Registry (AEARCTR-0010882), explicitly stated in the acknowledgments." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": true, 251 "justification": "Approved by MIT Committee on the Use of Humans as Experimental Subjects, stated in the acknowledgments." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": true, 256 "justification": "Table 1 reports: annual salary, years of tenure, employment status, college degree status, occupation breakdown (HR, consultant, data analyst, grant writer, manager, marketer)." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": true, 261 "justification": "Participants are college-educated professionals in specified occupations (marketers, grant writers, consultants, data analysts, HR professionals, managers). The paper states 'experienced, college-educated professionals.'" 262 }, 263 "randomization_described": { 264 "applies": true, 265 "answer": true, 266 "justification": "Random assignment described: 'A randomly-selected 50% of our participants—the treatment group—are instructed to sign up for ChatGPT.' Balance tests on 13 pre-treatment characteristics reported in Table 1." 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": true, 271 "justification": "Evaluators are described as 'blinded' — they do not know which condition produced the output they are grading. Participant blinding addressed by the Overleaf control (active control)." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": true, 276 "justification": "Attrition rates reported: 5% in control, 10% in treatment. Lee (2009) bounds computed to address differential attrition. Follow-up survey response rate: 82% with no differential response by treatment status." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is an RCT studying human productivity, not proposing a computational method. Inference cost is not relevant." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Purely a human subjects experiment; no computational budget to report." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "ChatGPT reduces time taken on writing tasks by 0.83 SDs (37%, ~10 minutes)", 295 "evidence": "Figure 1 Panel (a): treatment effect -0.83 SDs, 95% CI [-0.63, -1.03], p=0.000. Control mean 27 minutes, treatment mean 17 minutes.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "ChatGPT increases average output quality by 0.45 SDs", 300 "evidence": "Figure 1 Panel (b): treatment effect 0.45 SDs, 95% CI [0.27, 0.63], p=0.000. Similar increases for writing quality, content quality, and originality.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "ChatGPT compresses productivity inequality by benefiting low-ability workers more", 305 "evidence": "Figure 2: correlation between first-task and second-task grades drops from 0.49 (control) to 0.25 (treatment), p=0.004 on difference in slopes.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "ChatGPT substitutes for worker effort rather than complementing worker skills", 310 "evidence": "68% of treated participants submit ChatGPT output without editing; average only 3 minutes active after pasting; no correlation between editing time and grade; treated participants do not receive higher grades than raw ChatGPT output.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "ChatGPT increases job satisfaction by 0.40 SDs", 315 "evidence": "Figure 4 Panel (a): treatment effect 0.5 SDs, 95% CI [0.32, 0.68], p=0.000.", 316 "supported": "strong" 317 }, 318 { 319 "claim": "Exposure to ChatGPT increases both worry about automation and 
excitement about AI", 320 "evidence": "Figure 4 Panel (c): worry +0.26 SDs (p=0.006), excitement +0.39 SDs (p=0.000), net optimism +0.20 SDs (p=0.037).", 321 "supported": "strong" 322 } 323 ], 324 "red_flags": [ 325 { 326 "flag": "Differential attrition", 327 "detail": "Attrition is 10% in treatment vs 5% in control. While the paper addresses this with Lee bounds and robustness checks, differential attrition in an RCT is a concern. Two of 13 balance variables show significant differences." 328 }, 329 { 330 "flag": "Short, artificial tasks may inflate effects", 331 "detail": "The paper acknowledges this limitation: 20-30 minute self-contained tasks without context-specific knowledge may overstate ChatGPT's real-world usefulness. The follow-up survey confirms lower usefulness ratings for real tasks (3.65/5 vs 4.4/5)." 332 }, 333 { 334 "flag": "Control group contamination", 335 "detail": "10-20% of control group participants used ChatGPT on the tasks, which the authors acknowledge makes their estimates lower bounds. However, this means the clean treatment-control contrast is muddied." 336 }, 337 { 338 "flag": "Recruitment methods not described", 339 "detail": "The paper does not describe how the 444 professionals were recruited, making it impossible to assess selection bias or generalizability of the sample." 340 } 341 ], 342 "cited_papers": [ 343 { 344 "title": "Robots and Jobs: Evidence from US Labor Markets", 345 "authors": ["Daron Acemoglu", "Pascual Restrepo"], 346 "year": 2020, 347 "relevance": "Foundational work on automation's labor market effects, relevant context for AI productivity studies." 348 }, 349 { 350 "title": "The Race between Man and Machine: Implications of Technology for Growth, Factor Shares, and Employment", 351 "authors": ["Daron Acemoglu", "Pascual Restrepo"], 352 "year": 2018, 353 "relevance": "Theoretical framework on automation displacing vs complementing workers, directly applicable to this paper's analysis of whether ChatGPT substitutes for or complements worker skills." 
354 }, 355 { 356 "title": "AI, Skill, and Productivity: The Case of Taxi Drivers", 357 "authors": ["Kyogo Kanazawa", "Daiji Kawaguchi", "Hitoshi Shigeoka", "Yasutora Watanabe"], 358 "year": 2022, 359 "relevance": "Empirical study of AI's productivity effects in a non-writing domain, useful comparison for AI productivity claims." 360 }, 361 { 362 "title": "Artificial Intelligence: The Ambiguous Labor Market Impact of Automating Prediction", 363 "authors": ["Ajay Agrawal", "Joshua S Gans", "Avi Goldfarb"], 364 "year": 2019, 365 "relevance": "Framework on AI complementing vs substituting for human prediction tasks." 366 }, 367 { 368 "title": "Human Decisions and Machine Predictions", 369 "authors": ["Jon Kleinberg", "Himabindu Lakkaraju", "Jure Leskovec", "Jens Ludwig", "Sendhil Mullainathan"], 370 "year": 2018, 371 "relevance": "Empirical study of ML predictions complementing human judgment in bail decisions." 372 } 373 ] 374 }