scan-v4.json (27542B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence", 6 "authors": [ 7 "Shakked Noy", 8 "Whitney Zhang" 9 ], 10 "year": 2023, 11 "venue": "MIT Working Paper", 12 "arxiv_id": null, 13 "doi": "10.1126/science.adh2586" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "Abstract claims (0.8 SD time decrease, 0.4 SD quality increase, inequality compression, substitution rather than complementarity, job satisfaction/self-efficacy effects) are all supported by corresponding results sections with statistics.", 21 "source": "opus" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "Causal claims justified by RCT design with random assignment. Pre-registered at AEA RCT Registry (AEARCTR-0010882). Balance tests reported in Table 1. Lee bounds and robustness checks for selective attrition reported.", 27 "source": "opus" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "Discussion section explicitly enumerates limitations: tasks are short and self-contained, lack context-specific knowledge, only capture direct/immediate effects, and results may vary by occupation/task/skill level. 
Acknowledges experiment inflates ChatGPT's usefulness.", 33 "source": "opus" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": true, 38 "justification": "Discusses selective attrition (10% vs 5%) with Lee bounds, potential control group contamination (10-20% used ChatGPT, making estimates lower bounds), novelty effects, and placebo effects from the Overleaf control condition.", 39 "source": "opus" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper defines productivity as 'earnings per minute' and acknowledges that short, self-contained tasks may inflate estimates of ChatGPT's usefulness. Discussion notes tasks lack context-specific knowledge that real work requires, and follow-up survey confirms participants find ChatGPT less useful for real tasks (3.65/5 vs 4.4/5).", 45 "source": "opus" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Discussion section (Section 3) contains substantive limitations discussion spanning multiple paragraphs with specific limitations enumerated.", 53 "source": "opus" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Specific threats discussed: tasks are short and self-contained, lack context-specific knowledge (which may inflate estimates), differential attrition (10% vs 5%), control group contamination. Follow-up survey data used to test external validity.", 59 "source": "opus" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "Explicitly states: 'an experiment, by its nature, captures only direct, immediate effects on the selected occupations. There will be many indirect, reinforcing, or counteracting general-equilibrium effects.' 
Also notes variation by occupation, task, and skill level.", 65 "source": "opus" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": true, 72 "justification": "Funding disclosed: Emergent Ventures grant, George and Obie Shultz Fund, NSF Graduate Research Fellowship (Grant No. 1745302).", 73 "source": "opus" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Both authors listed as MIT affiliates. No product affiliation conflict — they are academic researchers, not employees of OpenAI.", 79 "source": "opus" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "Funders (Emergent Ventures, NSF, Shultz Fund) are general research grants with no commercial stake in ChatGPT's performance.", 85 "source": "opus" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests or financial interests statement included in the paper.", 91 "source": "opus" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "'Productivity' defined as 'earnings per minute.' 'ChatGPT' identified as 'generative AI assistive chatbot.' 'Mid-level professional writing tasks' exemplified (press releases, short reports, emails). Tasks specified as 20-30 minute assignments.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Paper explicitly states 'takes the first step towards answering' questions about ChatGPT's effect on productivity, substitution vs. complementarity, differential effects on ability levels, and worker satisfaction. 
Novel because prior literature lacks generative-task studies (footnote 1).", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Engages with automation literature (Autor, Acemoglu & Restrepo, historical displacement vs. complementarity debate). Positions this as different from routine-task automation and notes nascent generative-AI labor literature, clearly showing contribution.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": false, 121 "justification": "No repository URL, code release, or data archive mentioned in the paper.", 122 "source": "opus" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "No dataset download link or data release mentioned. The paper references an Online Appendix but does not provide a data archive.", 128 "source": "opus" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "No computational environment or software dependency details provided. The experiment is an online survey, but no platform or analysis software details are given.", 134 "source": "opus" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "No step-by-step reproduction instructions provided. 
The experimental design is described, and materials are referenced in an Online Appendix, but no replication package is offered.", 140 "source": "opus" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": true, 147 "justification": "95% confidence intervals are reported for main treatment effects (e.g., Figure 1: 'Treatment Effect: -0.83 SDs, 95% CI: [-0.63, -1.03]') and error bars shown on figures.", 148 "source": "opus" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": true, 153 "justification": "P-values reported for main results (e.g., time: p=0.000, grades: p=0.000, inequality slope difference: p=0.004, worry: p=0.006, excitement: p=0.000, optimism: p=0.037).", 154 "source": "opus" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Effect sizes reported in standard deviations throughout (e.g., -0.83 SDs for time, 0.45 SDs for grades, 0.40 SDs for job satisfaction). Also reports 37% reduction in time and absolute grade differences.", 160 "source": "opus" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "No power analysis or justification for the sample size of 444 participants. The number appears to be determined by recruitment capacity rather than statistical planning.", 166 "source": "opus" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": true, 171 "justification": "Standard deviations reported in Table 1 for all descriptive statistics. Standard errors reported for regression slopes (e.g., Figure 2: 'Slope: 0.491 (SE 0.053)').", 172 "source": "opus" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Control group serves as the baseline. Also compares pre-treatment and post-treatment performance within subjects. 
Raw ChatGPT output is evaluated as an additional comparison point.", 180 "source": "opus" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "The control group is the appropriate baseline for an RCT. ChatGPT was the most prominent generative AI tool at the time.", 186 "source": "opus" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": true, 191 "justification": "Multiple supplementary arms probe specific mechanisms: a fixed-time arm (15 minutes, isolating quality effects from time savings), a revision arm (showing first-task output and allowing editing), and comparison of linear vs. convex incentive schemes.", 192 "source": "opus" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "Multiple metrics: time taken, overall grades, writing quality, content quality, originality, earnings per minute, job satisfaction, self-efficacy, automation beliefs.", 198 "source": "opus" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": true, 203 "justification": "Blinded experienced professionals in the same occupations evaluated outputs. Each piece received three evaluations with average cross-evaluator correlation of 0.44. Evaluators were incentivized.", 204 "source": "opus" 205 }, 206 "held_out_test_set": { 207 "applies": false, 208 "answer": false, 209 "justification": "Not applicable — this is a human subjects RCT, not a machine learning benchmark evaluation.", 210 "source": "opus" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Results broken down by grade distribution (Figure 2), by incentive scheme (linear vs. 
convex), by task component (brainstorming, rough-drafting, editing), by writing skill terciles, and by pre-treatment ability level.", 216 "source": "opus" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Discusses cases where ChatGPT doesn't help: the complementarity hypothesis finds no evidence (human editing doesn't improve ChatGPT output), and follow-up survey reveals participants not using ChatGPT because it lacks context-specific knowledge.", 222 "source": "opus" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "Several null/negative results reported: no evidence of human-machine complementarity, no heterogeneity by relative writing skills, no effect on real job satisfaction at two-week follow-up, fixed-time arm imprecisely estimated (p=0.13).", 228 "source": "opus" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper refers only to 'ChatGPT' without specifying the version (GPT-3.5 vs GPT-4) or snapshot date. Given the March 2023 date, this was likely GPT-3.5, but it is not stated.", 236 "source": "opus" 237 }, 238 "prompts_provided": { 239 "applies": false, 240 "answer": false, 241 "justification": "Participants freely prompted ChatGPT themselves — there are no researcher-designed prompts to report. The task prompts given to participants are referenced in the Online Appendix.", 242 "source": "opus" 243 }, 244 "hyperparameters_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "Participants used ChatGPT through its default web interface. No hyperparameter tuning by researchers.", 248 "source": "opus" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "No agentic scaffolding used. 
Participants used ChatGPT's web interface directly.", 254 "source": "opus" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Data pipeline described: participant output collected with minute-by-minute snapshots, three blinded evaluators per piece, cross-evaluator correlation reported (0.44), objective time measure constructed from snapshots, control group ChatGPT usage detected.", 260 "source": "opus" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": false, 267 "justification": "No raw data or data archive provided. Only aggregated statistics and figures in the paper.", 268 "source": "opus" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Detailed description: online experiment with 444 college-educated professionals across six occupations, occupation-specific writing tasks, 20-30 minute assignments, high-powered bonus incentives, minute-by-minute output snapshots, three blinded evaluators per output.", 274 "source": "opus" 275 }, 276 "recruitment_methods_described": { 277 "applies": true, 278 "answer": false, 279 "justification": "The paper does not describe how the 444 professionals were recruited. It states they are 'college-educated professionals' but does not describe the recruitment platform, channels, or process.", 280 "source": "opus" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "Pipeline documented: recruitment → random assignment → first task → treatment intervention (ChatGPT signup vs. Overleaf signup) → second task → evaluation by three blinded professionals → follow-up survey at two weeks. 
Attrition rates reported by group.", 286 "source": "opus" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "This is an RCT studying productivity effects of ChatGPT use, not evaluating model capability on a benchmark. Contamination is not applicable.", 294 "source": "opus" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": false, 298 "answer": false, 299 "justification": "Not a benchmark evaluation — no train/test overlap concern.", 300 "source": "opus" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": false, 304 "answer": false, 305 "justification": "Not a benchmark evaluation — contamination not applicable.", 306 "source": "opus" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": true, 312 "answer": true, 313 "justification": "Pre-registered at AEA RCT Registry (AEARCTR-0010882), explicitly stated in the acknowledgments.", 314 "source": "opus" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": true, 318 "answer": true, 319 "justification": "Approved by MIT Committee on the Use of Humans as Experimental Subjects, stated in the acknowledgments.", 320 "source": "opus" 321 }, 322 "demographics_reported": { 323 "applies": true, 324 "answer": true, 325 "justification": "Table 1 reports: annual salary, years of tenure, employment status, college degree status, occupation breakdown (HR, consultant, data analyst, grant writer, manager, marketer).", 326 "source": "opus" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": true, 330 "answer": true, 331 "justification": "Participants are college-educated professionals in specified occupations (marketers, grant writers, consultants, data analysts, HR professionals, managers). 
The paper states 'experienced, college-educated professionals.'", 332 "source": "opus" 333 }, 334 "randomization_described": { 335 "applies": true, 336 "answer": true, 337 "justification": "Random assignment described: 'A randomly-selected 50% of our participants—the treatment group—are instructed to sign up for ChatGPT.' Balance tests on 13 pre-treatment characteristics reported in Table 1.", 338 "source": "opus" 339 }, 340 "blinding_described": { 341 "applies": true, 342 "answer": true, 343 "justification": "Evaluators are described as 'blinded' — they do not know which condition produced the output they are grading. Participant blinding addressed by the Overleaf control (active control).", 344 "source": "opus" 345 }, 346 "attrition_reported": { 347 "applies": true, 348 "answer": true, 349 "justification": "Attrition rates reported: 5% in control, 10% in treatment. Lee (2009) bounds computed to address differential attrition. Follow-up survey response rate: 82% with no differential response by treatment status.", 350 "source": "opus" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": false, 356 "answer": false, 357 "justification": "This is an RCT studying human productivity, not proposing a computational method. Inference cost is not relevant.", 358 "source": "opus" 359 }, 360 "compute_budget_stated": { 361 "applies": false, 362 "answer": false, 363 "justification": "Purely a human subjects experiment; no computational budget to report.", 364 "source": "opus" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "ChatGPT substantially increases productivity on professional writing tasks (time -0.8 SDs, quality +0.4 SDs)", 372 "evidence": "RCT with 444 college-educated professionals; treatment group (ChatGPT) vs. 
control (Overleaf); time drop 10 min (37% of 27-min baseline), grade increase 0.45 SDs on 1-7 scale; 95% CIs [-0.63, -1.03] and [0.27, 0.63]", 373 "supported": "strong" 374 }, 375 { 376 "claim": "ChatGPT reduces productivity inequality between workers (correlation drops from 0.49 to 0.25)", 377 "evidence": "Within-person comparison: correlation between task 1 and task 2 grades is 0.49 in control, 0.25 in treatment; p=0.004 for difference in slopes; Figure 2 shows larger treatment effect for low-ability workers", 378 "supported": "strong" 379 }, 380 { 381 "claim": "ChatGPT primarily substitutes for worker effort, not complementing skills", 382 "evidence": "68% submit ChatGPT output without editing; no correlation between post-paste time and grade gains; treated outputs not higher quality than raw ChatGPT, even with convex incentives", 383 "supported": "strong" 384 }, 385 { 386 "claim": "ChatGPT restructures tasks away from rough-drafting (50%→20%) toward editing (25%→50%)", 387 "evidence": "Self-reported time allocation across task components; Figure 3 Panel A shows post-treatment shift in time spending", 388 "supported": "strong" 389 }, 390 { 391 "claim": "ChatGPT increases job satisfaction (0.4 SD, p<0.001) despite substitution mechanism", 392 "evidence": "1-10 Likert scale pre-post difference; Figure 4 Panel A shows 0.5 SD treatment effect (typo in abstract vs. text)", 393 "supported": "strong" 394 }, 395 { 396 "claim": "ChatGPT does NOT differentially benefit poor writers vs. strong writers", 397 "evidence": "Willingness-to-pay and grade gains flat across writing skill terciles (Figure 3B); no clear heterogeneity by relative writing ability", 398 "supported": "strong" 399 }, 400 { 401 "claim": "Treated participants use ChatGPT more in real jobs post-experiment (33% vs. 18%, p=0.048)", 402 "evidence": "2-week follow-up survey with 82% response rate; among prior non-users, 26% treatment vs. 
9% control using ChatGPT (p=0.048)", 403 "supported": "moderate" 404 }, 405 { 406 "claim": "Real-world adoption is limited by lack of context-specific knowledge in ChatGPT's training", 407 "evidence": "Qualitative feedback from non-users: 'very specifically tailored to [customers],' 'real time information,' 'unique to [company products]' (Section 2.9)", 408 "supported": "moderate" 409 } 410 ], 411 "methodology_tags": [ 412 "rct", 413 "human_evaluation" 414 ], 415 "key_findings": "A preregistered RCT of 444 college-educated professionals found that access to ChatGPT increased productivity on professional writing tasks by 37% (time) and 0.45 SDs (quality), with effects driven by task substitution (rough-drafting automation) rather than skill complementarity. ChatGPT compressed the productivity distribution, reducing inequality—low-ability workers gained more in quality while high-ability workers gained more in speed. Job satisfaction increased despite substitution, though real-world adoption at 2-week follow-up revealed context-specific knowledge limitations, suggesting effects may not generalize to complex, organization-specific writing tasks.", 416 "red_flags": [ 417 { 418 "flag": "Working paper, unvetted", 419 "detail": "Published as MIT working paper (not peer-reviewed). Methods and conclusions unreviewed by external referees at time of submission." 420 }, 421 { 422 "flag": "Limited task scope", 423 "detail": "Tasks are short (20-30 min), self-contained, and lack context-specific knowledge. Authors acknowledge this inflates ChatGPT usefulness; real jobs are longer, context-heavy, and require organization-specific knowledge." 424 }, 425 { 426 "flag": "Model version not specified", 427 "detail": "ChatGPT version/snapshot not documented. Published March 2023 (likely GPT-3.5), but the lack of API version, temperature, or other hyperparameter details prevents exact reproducibility." 
428 }, 429 { 430 "flag": "No public data or code", 431 "detail": "Analysis code and raw data (participant essays, evaluator responses) not released. Preregistration available but replication requires private dataset and code." 432 }, 433 { 434 "flag": "Control group contamination", 435 "detail": "10-20% of control group used ChatGPT despite assignment to Overleaf, biasing effects downward. Authors acknowledge this makes estimates lower bounds." 436 }, 437 { 438 "flag": "Demographic incompleteness", 439 "detail": "Age and gender not reported. Sample skewed toward 6 specific occupations and college-educated professionals; generalizability to other demographics unclear." 440 }, 441 { 442 "flag": "Follow-up survey incomplete at publication", 443 "detail": "2-week follow-up survey still in progress when paper published (82% response rate). Real-world adoption and job satisfaction effects based on incomplete data." 444 }, 445 { 446 "flag": "Real-world adoption low and context-limited", 447 "detail": "Non-users cite lack of context-specific knowledge; 33% treatment adoption in real jobs is modest. Core experimental advantages (short, generic tasks) may not exist in practice." 448 } 449 ], 450 "cited_papers": [ 451 { 452 "title": "The Race between Man and Machine: Implications of Technology for Growth, Factor Shares, and Employment", 453 "authors": "Acemoglu & Restrepo", 454 "year": 2018, 455 "relevance": "Foundational displacement vs. complementarity framework; economic framework for AI labor impact." 456 }, 457 { 458 "title": "Robots and Jobs: Evidence from US Labor Markets", 459 "authors": "Acemoglu & Restrepo", 460 "year": 2020, 461 "relevance": "Empirical evidence on automation's employment effects; prior evidence in capital-augmenting automation." 
462 }, 463 { 464 "title": "The Growth of Low-Skill Service Jobs and the Polarization of the US Labor Market", 465 "authors": "Autor & Dorn", 466 "year": 2013, 467 "relevance": "Historical routine-task automation and inequality; contrasts with generative AI impact on creative tasks." 468 }, 469 { 470 "title": "Why Are There Still So Many Jobs? The History and Future of Workplace Automation", 471 "authors": "Autor", 472 "year": 2015, 473 "relevance": "Challenges automation pessimism; historical pattern of job creation despite automation." 474 }, 475 { 476 "title": "Artificial Intelligence and the Ambiguous Labor Market Impact of Automating Prediction", 477 "authors": "Agrawal, Gans & Goldfarb", 478 "year": 2019, 479 "relevance": "Labor economics framework for AI; distinguishes prediction tasks (prior AI focus) from generative writing." 480 }, 481 { 482 "title": "AI, Skill, and Productivity: The Case of Taxi Drivers", 483 "authors": "Kanazawa et al.", 484 "year": 2022, 485 "relevance": "Skill-complementarity and AI adoption in labor markets; prior evidence on heterogeneous effects by ability." 486 } 487 ], 488 "engagement_factors": { 489 "practical_relevance": { 490 "score": 3, 491 "justification": "Immediate applicability: grant writers, marketers, consultants, and analysts can adopt ChatGPT today for professional writing. 33% of treated participants already use it on real jobs post-experiment." 492 }, 493 "surprise_contrarian": { 494 "score": 2, 495 "justification": "Productivity boost itself unsurprising (expected AI benefit), but substitution>complementarity finding challenges hopeful skill-augmentation narrative. Contrarian: no heterogeneity by writing skill." 496 }, 497 "fear_safety": { 498 "score": 2, 499 "justification": "Raises job displacement concerns ('substitute for worker effort') and automation anxiety. Labor economics bent, not AI safety focus. Both worry and excitement increase post-exposure." 
500 }, 501 "drama_conflict": { 502 "score": 2, 503 "justification": "Tension between productivity gains and job security. Inequality reduction is positive frame, but displacement risk is negative. Automation anxiety documented in experiment." 504 }, 505 "demo_ability": { 506 "score": 3, 507 "justification": "Highly reproducible: anyone can sign up for ChatGPT and attempt similar professional writing tasks (press releases, emails, reports). Experimental tasks designed to be realistic." 508 }, 509 "brand_recognition": { 510 "score": 3, 511 "justification": "MIT authorship (Noy, Zhang). ChatGPT is the most-discussed AI tool of 2023. Massive media coverage; paper widely cited in AI productivity debate." 512 } 513 }, 514 "hn_data": { 515 "threads": [], 516 "top_points": 0, 517 "total_points": 0, 518 "total_comments": 0 519 } 520 }