scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31502B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Experimental evidence on the productivity effects of generative artificial intelligence",
      6     "authors": [
      7       "Shakked Noy",
      8       "Whitney Zhang"
      9     ],
     10     "year": 2023,
     11     "venue": "Science (scan based on the working-paper version)",
     12     "arxiv_id": null,
     13     "doi": "10.1126/science.adh2586"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All major abstract claims are directly supported: ChatGPT reduces time (10 min/37% decrease, p=0.000), improves quality (0.45 SDs, p=0.000), reduces inequality (correlation drops from 0.49 to 0.25), substitutes for effort (68% submit unedited), and restructures tasks (Figure 3a shows time reallocation).",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Preregistered RCT with random assignment, within-person design controlling for baseline ability, regression with clustering at worker level, and supplementary interventions (fixed-time arm). Design is adequate for causal inference despite 10-20% control group contamination (authors acknowledge the estimates are therefore lower bounds).",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Paper explicitly bounds results to college-educated professionals in specified occupations (marketing, grant writing, consulting, HR, data analysis, management) performing 20-30 minute writing tasks. Discussion acknowledges effects may vary by occupation, task, and skill level, and that context-specific knowledge limitations inflate estimates.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 2.4 directly tests substitution vs. complementarity and presents evidence against complementarity (no correlation between editing time and grade, treated essays don't exceed raw ChatGPT output quality). Acknowledges control group ChatGPT usage and discusses skill-demand hypothesis (Section 2.6), finding no clear evidence.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Productivity explicitly defined as earnings per minute (combining time + quality). Grades come from professional evaluators assessing writing quality, content quality, and originality separately. Measures match what is claimed (productivity improvements across multiple quality dimensions).",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 3 titled 'Discussion' contains dedicated paragraph: 'The experiment has several important limitations worth enumerating.' Lists task characteristics, measurement scope, and general equilibrium effects.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats discussed: (1) task context-dependency inflates estimates, (2) job satisfaction reflects small task not whole job (no 2-week followup difference), (3) experiment captures only direct immediate effects not GE adaptations, (4) effects likely vary by occupation/task/skill. Each threat is concrete, not boilerplate.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Explicit scope: college-educated professionals in 6 occupations, 20-30 min tasks (press releases, reports, emails, analysis plans), tasks lack context-specific knowledge beyond prompts. 2-week followup shows real-world limitations: participants report needing context-specific knowledge their writing requires.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments state: 'financial support from an Emergent Ventures grant, the George and Obie Shultz Fund, and the National Science Foundation Graduate Research Fellowship under Grant No. 1745302.'",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors listed as MIT. Research approved by 'MIT Committee on the Use of Humans as Experimental Subjects.' No disclosed affiliation with OpenAI/ChatGPT; testing external commercial product.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Emergent Ventures, Shultz Fund, and NSF are independent of ChatGPT productivity outcomes. None have financial stake in ChatGPT success or failure.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement found. Paper does not explicitly state 'Authors declare no competing interests' or list financial interests/patents/equity/consulting arrangements.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Productivity defined as earnings per minute (time + quality). Generative AI defined as systems that 'can be prompted to create novel text or visual outputs from large amounts of training data.' Mid-level professional writing tasks exemplified with specific occupations and task types.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Paper explicitly states: 'This paper takes the first step towards answering these questions' about ChatGPT's productivity effects, substitution vs. complementarity, and differential effects on worker ability. Positions contribution as first empirical evidence on generative AI in creative tasks (vs. prior predictive task literature).",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Introduction situates work in 40+ year history of automation literature (Autor, Acemoglu, etc.), contrasts generative AI (creative tasks) with prior automation (routine tasks), discusses displacement vs. complementarity debate. Not just a citations list but shows how this work relates to and differs from existing contributions.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No mention of released analysis code. Paper is a working paper and does not state code will be available.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No explicit statement that participant data or task outputs are released or will be released. Paper mentions Online Appendix but availability not confirmed.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": false,
    132           "answer": false,
    133           "justification": "Not applicable: this is an online survey experiment, not a computational/software artifact requiring environment specification.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Design described and preregistered (AEARCTR-0010882), but full reproduction would require recruiting professional evaluators and participants. Actual task prompts are in Online Appendix (not main paper). Step-by-step instructions for independent research team are not provided.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Figure 1 shows 95% CIs for all main effects. Example: time treatment effect -0.83 SDs [95% CI: -1.03, -0.63]. Grade effect 0.45 SDs [95% CI: 0.27, 0.63].",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Significance tests throughout: main productivity effects p=0.000, inequality reduction p=0.004 (difference in slopes), job satisfaction p=0.000, automation worry p=0.006, excitement p=0.000. Fixed-time arm treatment effect p=0.13.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Effects reported in both standardized (SDs) and raw units: time -0.83 SDs / -10 minutes (37% of 27-min control average), quality +0.45 SDs, job satisfaction +0.40 SDs. Comparisons include baseline context (e.g., control mean 27 minutes).",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "N=444 recruited but no power analysis or justification provided. Paper does not cite a target effect size or power calculation that determined the sample size.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Figure 1 panels (c)-(d) show full outcome distributions (not just means). Table 1 reports SDs for baseline characteristics. Inter-evaluator agreement reported: 'average within-essay cross-evaluator correlation of 0.44.'",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Active control group assigned to LaTeX (Overleaf) training rather than ChatGPT. Treated group given ChatGPT access. Control group provides comparison; 10-20% of control participants used ChatGPT anyway, so the estimates are acknowledged to be lower bounds.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Overleaf is contemporary tool (exists as of 2023). Control condition (no ChatGPT access) is appropriate baseline for estimating ChatGPT effect.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Two supplementary interventions probe mechanisms: (1) fixed-time arm holds effort constant to isolate pure ChatGPT effect on quality (treatment +0.39 SDs), (2) edit arm allows editing pre-task output with ChatGPT (23% replace, 25% edit), testing complementarity.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Multiple outcome measures: time taken, overall grade, writing quality grade, content quality grade, originality grade, job satisfaction, self-efficacy, automation beliefs (worry/excitement/optimism), downstream usage (2-week followup). Metrics span productivity, inequality, subjective outcomes, and real-world takeup.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "All task outputs graded by blinded professional evaluators in same occupations as participants. Each output evaluated by 3 raters. Evaluators incentivized to grade carefully. Ratings on overall, writing quality, content quality, originality dimensions.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "Not applicable: no predictive model being evaluated on held-out data. Task 2 is held-out from Task 1 (within-person design) but this is not a standard held-out test set for a trained model.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results broken down by: (1) quality dimension (writing/content/originality separate from overall), (2) incentive scheme (linear vs. convex), (3) ability level (Figure 2 shows treatment effects across pre-task grade distribution), (4) occupation (balance tests in Table 1).",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": false,
    221           "justification": "No explicit analysis of failure cases or errors by ChatGPT. Paper notes 68% submit unedited output (could be good or poor) but does not analyze specific instances where ChatGPT produced low-quality outputs that were nonetheless submitted.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Null finding on skill heterogeneity: 'We find no clear evidence for the aforementioned hypothesis' about differential benefits by writing skill (Figure 3b flat slopes). Self-efficacy effect is small and imprecisely estimated (p=0.060). 2-week followup shows lower usefulness in real work (3.65 vs 4.4/5).",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "Paper says 'ChatGPT' but no version/snapshot date specified. Working paper dated March 2, 2023 implies ChatGPT around that date, but exact model version (e.g., GPT-3.5-turbo) not stated. Makes replication difficult.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Actual task prompts not provided in main paper. Paper states 'A copy of relevant survey questionnaires...are included in the Online Appendix' but full prompts not reproduced. Only task examples given (press releases, emails, etc.).",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": false,
    246           "answer": false,
    247           "justification": "Not applicable: ChatGPT is a third-party tool; experimenters did not control temperature, top-p, or other sampling parameters. Not a parameter-tuning experiment.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Treatment participants 'instructed to sign up for ChatGPT...are walked through how to use it, and are told they are permitted to use it on the second task if they find it useful.' Content of walkthrough not detailed but procedure is described. Minimal scaffolding applied.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Data processing reasonably documented: grades are from three evaluators (within-essay correlation 0.44 reported), outcomes are person-evaluator-level (clustering at worker), pre-treatment outcomes control for baseline ability. Some details deferred to Online Appendix/supplementary materials.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "No statement that raw participant data, task outputs, or evaluator grades are available for independent verification. Working paper; data release status unclear.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Detailed collection: 444 professionals assigned two occupation-specific writing tasks (~20-30 min each), outputs graded by 3 professional evaluators (blinded), time tracking via minute-by-minute snapshots, survey responses on satisfaction/beliefs. Incentive structure specified (linear: $1/point, convex: +$3 for grades 6-7).",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": true,
    278           "answer": false,
    279           "justification": "Recruitment method vague: 'online experiment' mentioned, survey 'mostly active only after 5pm EST' (to ensure ChatGPT availability), but platform not named (MTurk? Prolific? Other?). How occupations were targeted not explained.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Pipeline described: collect survey responses → assign tasks → collect outputs + minute-level snapshots → send to evaluators → collect grades + rankings → record time/satisfaction/self-efficacy → estimate treatment effects via person-evaluator OLS. Some details in Online Appendix.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "Not applicable: paper does not evaluate whether ChatGPT was trained on benchmark data. Tests ChatGPT's real-time productivity on novel writing tasks.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "Not applicable: not a benchmark evaluation, so train/test overlap not relevant.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "Not applicable: custom tasks created for experiment, not existing benchmarks.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": true,
    312           "answer": true,
    313           "justification": "Explicitly preregistered: 'preregistered at the AEA RCT Registry (AEARCTR-0010882).'",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": true,
    318           "answer": true,
    319           "justification": "'The research described in this article was approved by the MIT Committee on the Use of Humans as Experimental Subjects.'",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": true,
    324           "answer": true,
    325           "justification": "Table 1 reports: annual salary ($71.8K control / $76.3K treatment), tenure in occupation (~10 yrs both), employment rate (90% control / 96% treatment), college degree (100% both), occupational distribution (managers 41-42%, grant writers 16-17%, consultants 11-13%, data analysts/marketers ~10%, HR 6-11%).",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": true,
    330           "answer": true,
    331           "justification": "Stated: college-educated professionals in specified occupations. Criteria somewhat specified but could be more explicit (e.g., minimum experience, exclusion rules).",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": true,
    336           "answer": true,
    337           "justification": "'randomly expose half of them to ChatGPT' and 'A randomly-selected 50% of our participants' indicate the assignment method, though specific randomization procedure (simple, blocked, stratified) not detailed.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": true,
    342           "answer": true,
    343           "justification": "Evaluators are blinded: 'Quality is assessed by (blinded) experienced professionals.' Participants cannot be blinded to ChatGPT access (they know if they sign up). Blinding of relevant party (outcome assessor) achieved.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": true,
    348           "answer": true,
    349           "justification": "Attrition clearly reported: '5% in the control group and 10% in the treatment group.' Balance/attrition tests referenced in Online Appendix. Lee (2009) bounds applied to check robustness.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "ChatGPT's inference cost not reported (commercial product, not researchers' system). Willingness-to-pay elicited (0.5% of salary/month) but this indicates perceived value, not actual compute cost.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "Not applicable: study is online experiment. Participant payments mentioned ($1/point + incentives, 2-week followup) but total computational/operational budget not stated.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "ChatGPT reduces time to complete writing tasks by 0.83 SDs (approximately 10 minutes or 37%)",
    372       "evidence": "Figure 1a: control mean 27 minutes, treatment mean 17 minutes, p=0.000. Treatment effect coefficient -0.83 SDs with 95% CI [-1.03, -0.63].",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "ChatGPT increases output quality by 0.45 SDs on evaluator grades",
    377       "evidence": "Figure 1b: control mean grade 3.789, treatment mean 4.54 (on 1-7 scale), p=0.000. Similar effect sizes for writing quality, content quality, and originality separately.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "ChatGPT reduces productivity inequality between workers",
    382       "evidence": "Figure 2a: Grade correlation drops from 0.491 (control) to 0.248 (treatment), change in slope -0.243 with 95% CI [-0.41, -0.08], p=0.004. Effect larger for lower-ability workers.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "ChatGPT acts primarily as a substitute for worker effort rather than complementing skills",
    387       "evidence": "Section 2.4: 68% of participants submit ChatGPT output unedited, only 3 minutes active after pasting, no correlation between editing time and grade, treated essays don't exceed raw ChatGPT output quality, no higher grades despite convex incentives to edit.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "ChatGPT restructures task workflow away from rough-drafting and toward brainstorming/editing",
    392       "evidence": "Figure 3a: rough-drafting time share falls from ~50% to ~25%, editing time more than doubles from ~25% to ~55%, brainstorming stable at ~25%.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Benefits of ChatGPT do not significantly vary by baseline writing skill",
    397       "evidence": "Figure 3b, Section 2.6: willingness to pay and grade gains are flat across thirds of relative writing skill (both self-rated and evaluator-measured). 'We find no clear evidence for the aforementioned hypothesis.'",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "ChatGPT increases job satisfaction by 0.40 SDs",
    402       "evidence": "Figure 4a: treatment effect +0.40 SDs (p=0.000) with 95% CI [0.32, 0.68] on enjoyment of task (1-10 scale).",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Exposure to ChatGPT increases both optimism and worry about future automation",
    407       "evidence": "Figure 4c: worry increases 0.26 SDs (p=0.006), excitement 0.39 SDs (p=0.000), net optimism 0.20 SDs (p=0.037) on 1-10 scales.",
    408       "supported": "strong"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "rct",
    413     "human_evaluation",
    414     "observational"
    415   ],
    416   "key_findings": "ChatGPT substantially increases productivity on mid-level professional writing tasks—reducing time by 37% (0.83 SDs) while improving quality by 0.45 SDs—in a preregistered RCT with 444 college-educated professionals. The tool reduces productivity inequality by benefiting lower-ability workers more (grade correlation drops from 0.49 to 0.25), and operates primarily as a labor-saving substitution (68% submit unedited output) rather than complementing human skills. Tasks restructure toward brainstorming and editing away from rough-drafting. Despite these gains, real-world usage declines when context-specific knowledge requirements increase.",
    417   "red_flags": [
    418     {
    419       "flag": "Narrow task domain",
    420       "detail": "Tasks are 20-30 minute self-contained writing tasks lacking context-specific knowledge. Authors acknowledge this 'may inflate our estimates of ChatGPT's usefulness.' 2-week followup shows usefulness rating drops from 4.4/5 to 3.65/5 in real work."
    421     },
    422     {
    423       "flag": "ChatGPT version not specified",
    424       "detail": "Paper says 'ChatGPT' with March 2023 date but no model version (GPT-3.5-turbo?), snapshot date, or API parameters (temperature, top-p) specified. Reduces reproducibility."
    425     },
    426     {
    427       "flag": "Prompts not provided",
    428       "detail": "Actual task prompts relegated to Online Appendix, not in main text. Required for full reproduction and validation."
    429     },
    430     {
    431       "flag": "No power analysis",
    432       "detail": "N=444 chosen without stated justification, power calculation, or target effect size. Sample size appears adequate but rationale missing."
    433     },
    434     {
    435       "flag": "Modest inter-evaluator agreement",
    436       "detail": "Average within-essay cross-evaluator correlation of 0.44 indicates substantial disagreement on quality grades. Individual grades depend heavily on evaluator identity."
    437     },
    438     {
    439       "flag": "Control group contamination",
    440       "detail": "10-20% of control group used ChatGPT anyway. Authors acknowledge 'estimates provide lower bounds on the effects of ChatGPT usage,' implying true effects could be larger."
    441     },
    442     {
    443       "flag": "Differential attrition",
    444       "detail": "Control attrition 5%, treatment attrition 10% (2x higher). Lee bounds applied but imbalance suggests potential bias."
    445     },
    446     {
    447       "flag": "Data/code not released",
    448       "detail": "Working paper; no statement that raw data, task outputs, or analysis code will be made available. Limits independent verification."
    449     },
    450     {
    451       "flag": "Recruitment method vague",
    452       "detail": "Platform not specified ('online experiment' only; 5pm EST timing mentioned for ChatGPT availability but recruitment source unclear)."
    453     },
    454     {
    455       "flag": "No editing-only control",
    456       "detail": "No arm that allows participants to edit ChatGPT output before submission (only 23% replace, 25% edit in voluntary edit arm). Hard to distinguish productive editing from mere labor substitution."
    457     }
    458   ],
    459   "cited_papers": [
    460     {
    461       "title": "The Race between Man and Machine: Implications of Technology for Growth, Factor Shares, and Employment",
    462       "authors": "Acemoglu, Daron and Pascual Restrepo",
    463       "year": 2018,
    464       "venue": "American Economic Review",
    465       "relevance": "Foundational framework on displacement vs. complementarity effects of automation; establishes conceptual ground for interpreting ChatGPT's labor market impact."
    466     },
    467     {
    468       "title": "Robots and Jobs: Evidence from US Labor Markets",
    469       "authors": "Acemoglu, Daron and Pascual Restrepo",
    470       "year": 2020,
    471       "venue": "Journal of Political Economy",
    472       "relevance": "Empirical evidence on how automation technologies affect employment and productivity; directly relevant baseline for generative AI comparisons."
    473     },
    474     {
    475       "title": "Why Are There Still So Many Jobs? The History and Future of Workplace Automation",
    476       "authors": "Autor, David",
    477       "year": 2015,
    478       "venue": "Journal of Economic Perspectives",
    479       "relevance": "Historical perspective on how routine vs. creative tasks respond to automation; frames generative AI as qualitatively different."
    480     },
    481     {
    482       "title": "The Growth of Low-Skill Service Jobs and the Polarization of the US Labor Market",
    483       "authors": "Autor, David and David Dorn",
    484       "year": 2013,
    485       "venue": "American Economic Review",
    486       "relevance": "Task-based model of labor market effects; establishes distributional consequences framework applicable to AI productivity effects."
    487     },
    488     {
    489       "title": "Artificial Intelligence: The Ambiguous Labor Market Impact of Automating Prediction",
    490       "authors": "Agrawal, Ajay, Joshua S. Gans, and Avi Goldfarb",
    491       "year": 2019,
    492       "venue": "Journal of Economic Perspectives",
    493       "relevance": "Analyzes labor market impacts of AI prediction automation; conceptual scaffold for understanding generative AI's different task domain."
    494     },
    495     {
    496       "title": "Automation After the Assembly Line: Computerized Machine Tools, Employment and Productivity in the United States",
    497       "authors": "Boustan, Leah Platt, Jiwon Choi, and David Clingingsmith",
    498       "year": 2022,
    499       "venue": "NBER Working Paper",
    500       "relevance": "Recent historical evidence on how technology adoption affects productivity distribution and worker heterogeneity."
    501     },
    502     {
    503       "title": "Automation, Workers' Skills and Job Satisfaction",
    504       "authors": "Schwabe, Henrik and Fulvio Castellacci",
    505       "year": 2020,
    506       "venue": "PLOS One",
    507       "relevance": "Examines subjective worker outcomes (satisfaction, efficacy) in response to automation; directly parallels paper's measurement of job satisfaction and self-efficacy."
    508     }
    509   ],
    510   "engagement_factors": {
    511     "practical_relevance": {
    512       "score": 3,
    513       "justification": "ChatGPT is publicly available and commercially deployed; 33% of treated participants use it in real jobs within 2 weeks. Findings directly applicable to professionals making adoption decisions."
    514     },
    515     "surprise_contrarian": {
    516       "score": 2,
    517       "justification": "Positive productivity effects largely expected; the contrarian finding is that benefits don't vary by writing skill (intuitive differences don't emerge) and that substitution dominates complementarity."
    518     },
    519     "fear_safety": {
    520       "score": 2,
    521       "justification": "Raises automation concerns (worry increases 0.26 SDs) but positioned as balanced with optimism (excitement increases 0.39 SDs). Substitution finding does suggest displacement risk but paper lacks AI safety framing."
    522     },
    523     "drama_conflict": {
    524       "score": 1,
    525       "justification": "Straightforward positive productivity findings without significant controversy. No heated debate, only measured evidence. 2-week follow-up limitations are candid but not dramatic."
    526     },
    527     "demo_ability": {
    528       "score": 3,
    529       "justification": "Anyone can download ChatGPT and try it on professional writing tasks immediately. Results are directly testable by practitioners and a general audience."
    530     },
    531     "brand_recognition": {
    532       "score": 3,
    533       "justification": "MIT authors (Noy, Zhang), cites Acemoglu and other prominent economists, ChatGPT is the most-discussed AI tool of 2023, directly relevant to labor economics debates."
    534     }
    535   },
    536   "hn_data": {
    537     "threads": [],
    538     "top_points": 0,
    539     "total_points": 0,
    540     "total_comments": 0
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs