scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20649B)
      1 {
      2   "paper": {
      3     "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence",
      4     "authors": ["Shakked Noy", "Whitney Zhang"],
      5     "year": 2023,
      6     "venue": "Working Paper (MIT); published version in Science",
      7     "doi": "10.1126/science.adh2586"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["rct"],
     12   "key_findings": "In a preregistered RCT with 444 college-educated professionals, ChatGPT access reduced task completion time by 0.8 SDs and increased output quality by 0.4 SDs. ChatGPT compressed the productivity distribution, benefiting low-ability workers more and halving the correlation between first-task and second-task grades. ChatGPT primarily substituted for worker effort rather than complementing skills, with 68% of treated participants submitting unedited ChatGPT output.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code release, or data archive mentioned in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset download link or data release mentioned. The paper references an Online Appendix but does not provide a data archive."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No computational environment or software dependency details provided. The experiment is an online survey, but no platform or analysis software details are given."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions provided. The experimental design is described, and materials are referenced in an Online Appendix, but no replication package is offered."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "95% confidence intervals are reported for main treatment effects (e.g., Figure 1: 'Treatment Effect: -0.83 SDs, 95% CI: [-0.63, -1.03]') and error bars shown on figures."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "P-values reported for main results (e.g., time: p=0.000, grades: p=0.000, inequality slope difference: p=0.004, worry: p=0.006, excitement: p=0.000, optimism: p=0.037)."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes reported in standard deviations throughout (e.g., -0.83 SDs for time, 0.45 SDs for grades, 0.40 SDs for job satisfaction). Also reports 37% reduction in time and absolute grade differences."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No power analysis or justification for the sample size of 444 participants. The number appears to be determined by recruitment capacity rather than statistical planning."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Standard deviations reported in Table 1 for all descriptive statistics. Standard errors reported for regression slopes (e.g., Figure 2: 'Slope: 0.491 (SE 0.053)')."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Control group serves as the baseline. Also compares pre-treatment and post-treatment performance within subjects. Raw ChatGPT output is evaluated as an additional comparison point."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The control group is the appropriate baseline for an RCT. ChatGPT was the most prominent generative AI tool at the time."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple supplementary arms probe specific mechanisms: a fixed-time arm (15 minutes, isolating quality effects from time savings), a revision arm (showing first-task output and allowing editing), and comparison of linear vs. convex incentive schemes."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics: time taken, overall grades, writing quality, content quality, originality, earnings per minute, job satisfaction, self-efficacy, automation beliefs."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Blinded experienced professionals in the same occupations evaluated outputs. Each piece received three evaluations with average cross-evaluator correlation of 0.44. Evaluators were incentivized."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Not applicable — this is a human subjects RCT, not a machine learning benchmark evaluation."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results broken down by grade distribution (Figure 2), by incentive scheme (linear vs. convex), by task component (brainstorming, rough-drafting, editing), by writing skill terciles, and by pre-treatment ability level."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Discusses cases where ChatGPT doesn't help: the paper finds no evidence for the complementarity hypothesis (human editing doesn't improve ChatGPT output), and the follow-up survey reveals participants not using ChatGPT because it lacks context-specific knowledge."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several null/negative results reported: no evidence of human-machine complementarity, no heterogeneity by relative writing skills, no effect on real job satisfaction at two-week follow-up, fixed-time arm imprecisely estimated (p=0.13)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims (0.8 SD time decrease, 0.4 SD quality increase, inequality compression, substitution rather than complementarity, job satisfaction/self-efficacy effects) are all supported by corresponding results sections with statistics."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims justified by RCT design with random assignment. Pre-registered at AEA RCT Registry (AEARCTR-0010882). Balance tests reported in Table 1. Lee bounds and robustness checks for selective attrition reported."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Discussion section explicitly enumerates limitations: tasks are short and self-contained, lack context-specific knowledge, only capture direct/immediate effects, and results may vary by occupation/task/skill level. Acknowledges experiment inflates ChatGPT's usefulness."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Discusses selective attrition (10% vs 5%) with Lee bounds, potential control group contamination (10-20% used ChatGPT, making estimates lower bounds), novelty effects, and placebo effects from the Overleaf control condition."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper defines productivity as 'earnings per minute' and acknowledges that short, self-contained tasks may inflate estimates of ChatGPT's usefulness. Discussion notes tasks lack context-specific knowledge that real work requires, and follow-up survey confirms participants find ChatGPT less useful for real tasks (3.65/5 vs 4.4/5)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper refers only to 'ChatGPT' without specifying the version (GPT-3.5 vs GPT-4) or snapshot date. Given the March 2023 date, this was likely GPT-3.5, but it is not stated."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Participants freely prompted ChatGPT themselves — there are no researcher-designed prompts to report. The task prompts given to participants are referenced in the Online Appendix."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Participants used ChatGPT through its default web interface. No hyperparameter tuning by researchers."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used. Participants used ChatGPT's web interface directly."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Data pipeline described: participant output collected with minute-by-minute snapshots, three blinded evaluators per piece, cross-evaluator correlation reported (0.44), objective time measure constructed from snapshots, control group ChatGPT usage detected."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Discussion section (Section 3) contains substantive limitations discussion spanning multiple paragraphs with specific limitations enumerated."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: tasks are short and self-contained, lack context-specific knowledge (which may inflate estimates), differential attrition (10% vs 5%), control group contamination. Follow-up survey data used to test external validity."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Explicitly states: 'an experiment, by its nature, captures only direct, immediate effects on the selected occupations. There will be many indirect, reinforcing, or counteracting general-equilibrium effects.' Also notes variation by occupation, task, and skill level."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data or data archive provided. Only aggregated statistics and figures in the paper."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Detailed description: online experiment with 444 college-educated professionals across six occupations, occupation-specific writing tasks, 20-30 minute assignments, high-powered bonus incentives, minute-by-minute output snapshots, three blinded evaluators per output."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper does not describe how the 444 professionals were recruited. It states they are 'college-educated professionals' but does not describe the recruitment platform, channels, or process."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Pipeline documented: recruitment → random assignment → first task → treatment intervention (ChatGPT signup vs. Overleaf signup) → second task → evaluation by three blinded professionals → follow-up survey at two weeks. Attrition rates reported by group."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Funding disclosed: Emergent Ventures grant, George and Obie Shultz Fund, NSF Graduate Research Fellowship (Grant No. 1745302)."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Both authors listed as MIT affiliates. No product affiliation conflict — they are academic researchers, not employees of OpenAI."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Funders (Emergent Ventures, NSF, Shultz Fund) are general research grants with no commercial stake in ChatGPT's performance."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This is an RCT studying productivity effects of ChatGPT use, not evaluating model capability on a benchmark. Contamination is not applicable."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not a benchmark evaluation — no train/test overlap concern."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not a benchmark evaluation — contamination not applicable."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Pre-registered at AEA RCT Registry (AEARCTR-0010882), explicitly stated in the acknowledgments."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "Approved by MIT Committee on the Use of Humans as Experimental Subjects, stated in the acknowledgments."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Table 1 reports: annual salary, years of tenure, employment status, college degree status, occupation breakdown (HR, consultant, data analyst, grant writer, manager, marketer)."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": true,
    261         "justification": "Participants are college-educated professionals in specified occupations (marketers, grant writers, consultants, data analysts, HR professionals, managers). The paper states 'experienced, college-educated professionals.'"
    262       },
    263       "randomization_described": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Random assignment described: 'A randomly-selected 50% of our participants—the treatment group—are instructed to sign up for ChatGPT.' Balance tests on 13 pre-treatment characteristics reported in Table 1."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": true,
    271         "justification": "Evaluators are described as 'blinded' — they do not know which condition produced the output they are grading. Participant blinding addressed by the Overleaf control (active control)."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": true,
    276         "justification": "Attrition rates reported: 5% in control, 10% in treatment. Lee (2009) bounds computed to address differential attrition. Follow-up survey response rate: 82% with no differential response by treatment status."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "This is an RCT studying human productivity, not proposing a computational method. Inference cost is not relevant."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Purely a human subjects experiment; no computational budget to report."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "ChatGPT reduces time taken on writing tasks by 0.83 SDs (37%, ~10 minutes)",
    295       "evidence": "Figure 1 Panel (a): treatment effect -0.83 SDs, 95% CI [-0.63, -1.03], p=0.000. Control mean 27 minutes, treatment mean 17 minutes.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "ChatGPT increases average output quality by 0.45 SDs",
    300       "evidence": "Figure 1 Panel (b): treatment effect 0.45 SDs, 95% CI [0.27, 0.63], p=0.000. Similar increases for writing quality, content quality, and originality.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "ChatGPT compresses productivity inequality by benefiting low-ability workers more",
    305       "evidence": "Figure 2: correlation between first-task and second-task grades drops from 0.49 (control) to 0.25 (treatment), p=0.004 on difference in slopes.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "ChatGPT substitutes for worker effort rather than complementing worker skills",
    310       "evidence": "68% of treated participants submit ChatGPT output without editing; average only 3 minutes active after pasting; no correlation between editing time and grade; treated participants do not receive higher grades than raw ChatGPT output.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "ChatGPT increases job satisfaction by 0.40 SDs",
    315       "evidence": "Figure 4 Panel (a): treatment effect 0.5 SDs, 95% CI [0.32, 0.68], p=0.000.",
    316       "supported": "strong"
    317     },
    318     {
    319       "claim": "Exposure to ChatGPT increases both worry about automation and excitement about AI",
    320       "evidence": "Figure 4 Panel (c): worry +0.26 SDs (p=0.006), excitement +0.39 SDs (p=0.000), net optimism +0.20 SDs (p=0.037).",
    321       "supported": "strong"
    322     }
    323   ],
    324   "red_flags": [
    325     {
    326       "flag": "Differential attrition",
    327       "detail": "Attrition is 10% in treatment vs 5% in control. While the paper addresses this with Lee bounds and robustness checks, differential attrition in an RCT is a concern. Two of 13 balance variables show significant differences."
    328     },
    329     {
    330       "flag": "Short, artificial tasks may inflate effects",
    331       "detail": "The paper acknowledges this limitation: 20-30 minute self-contained tasks without context-specific knowledge may overstate ChatGPT's real-world usefulness. The follow-up survey confirms lower usefulness ratings for real tasks (3.65/5 vs 4.4/5)."
    332     },
    333     {
    334       "flag": "Control group contamination",
    335       "detail": "10-20% of control group participants used ChatGPT on the tasks, which the authors acknowledge makes their estimates lower bounds. However, this means the clean treatment-control contrast is muddied."
    336     },
    337     {
    338       "flag": "Recruitment methods not described",
    339       "detail": "The paper does not describe how the 444 professionals were recruited, making it impossible to assess selection bias or generalizability of the sample."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Robots and Jobs: Evidence from US Labor Markets",
    345       "authors": ["Daron Acemoglu", "Pascual Restrepo"],
    346       "year": 2020,
    347       "relevance": "Foundational work on automation's labor market effects, relevant context for AI productivity studies."
    348     },
    349     {
    350       "title": "The Race between Man and Machine: Implications of Technology for Growth, Factor Shares, and Employment",
    351       "authors": ["Daron Acemoglu", "Pascual Restrepo"],
    352       "year": 2018,
    353       "relevance": "Theoretical framework on automation displacing vs complementing workers, directly applicable to whether generative AI tools such as ChatGPT substitute for or complement worker skills."
    354     },
    355     {
    356       "title": "AI, Skill, and Productivity: The Case of Taxi Drivers",
    357       "authors": ["Kyogo Kanazawa", "Daiji Kawaguchi", "Hitoshi Shigeoka", "Yasutora Watanabe"],
    358       "year": 2022,
    359       "relevance": "Empirical study of AI's productivity effects in a non-writing domain, useful comparison for AI productivity claims."
    360     },
    361     {
    362       "title": "Artificial Intelligence: The Ambiguous Labor Market Impact of Automating Prediction",
    363       "authors": ["Ajay Agrawal", "Joshua S Gans", "Avi Goldfarb"],
    364       "year": 2019,
    365       "relevance": "Framework on AI complementing vs substituting for human prediction tasks."
    366     },
    367     {
    368       "title": "Human Decisions and Machine Predictions",
    369       "authors": ["Jon Kleinberg", "Himabindu Lakkaraju", "Jure Leskovec", "Jens Ludwig", "Sendhil Mullainathan"],
    370       "year": 2018,
    371       "relevance": "Empirical study of ML predictions complementing human judgment in bail decisions."
    372     }
    373   ]
    374 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs