scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23542B)
      1 {
      2   "paper": {
      3     "title": "Scaling Laws for Economic Productivity: Experimental Evidence in LLM-Assisted Translation",
      4     "authors": ["Ali Merali"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2409.02391"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No repository URL, code archive, or mention of code release found anywhere in the paper."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No dataset download link or data repository is mentioned. The experimental data from the 300 translators is not released."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No environment specifications, software versions, or dependency lists are provided."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No reproduction instructions or scripts are provided. The experimental design is described but there are no step-by-step instructions for replication."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Standard errors are reported in parentheses for all regression coefficients in Tables 1-5 in Appendix A."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "P-values are reported for main results (e.g., p=0.001 for time reduction, p=0.000 for quality improvement, p=0.017 for skill heterogeneity). Regression tables use significance stars (*** p<0.01, ** p<0.05, * p<0.1)."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Effect sizes are reported with context: 12.3% quicker per 10x compute, 0.18 standard deviations grade improvement, 16.1% earnings increase per 10x compute, and 0.14 SD for any-AI quality increase."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The sample size of 300 translators and 1,800 tasks is stated but no power analysis or justification for why 300 participants was sufficient is provided."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Standard errors are reported for all regression coefficients. R-squared and adjusted R-squared values are provided for all models."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "A control group with no AI assistance serves as the baseline, with results compared in Section 3.1 and Table 2."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper compares 13 LLMs of varying compute sizes including models up to Claude 3.5 Sonnet (contemporary at time of writing) against a no-AI control."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is an RCT measuring the effect of model compute on productivity, not a system with components to ablate."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Three metrics are used: time taken, quality grade (7-point scale), and earnings per minute (inclusive of bonuses). These are analyzed separately in Sections 3.2-3.4."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Three experienced professional translators (5+ years experience each) graded each task on a 7-point scale, with incentives for consistent grading."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is an RCT, not a model evaluation on a dataset split. There is no train/test split to assess."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are broken down by skill level (high-skill vs low-skill in Section 3.5), by language (Arabic, Hindi, Spanish in regression tables), and by task (tasks 2-5 in regression controls)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No discussion of failure cases, tasks where AI assistance hurt performance, or individual translator difficulties. Only aggregate positive trends are shown."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The paper does report that the quality improvement from any AI usage was not statistically significant (p=0.148), but it discusses no other negative results or failed approaches, and this non-significant quality finding is somewhat buried rather than highlighted."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims (12.3% speed improvement, 0.18 SD grade improvement, 16.1% earnings increase per 10x compute, 4x larger gains for lower-skilled) are all supported by regression results in Section 3 and Appendix A."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "This is a pre-registered RCT with random assignment to treatment conditions (different LLMs or control). The study design is adequate for the causal claims made about model compute effects on productivity."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The paper tests only translation tasks, but the abstract and Section 4 extrapolate to 'U.S. productivity' and 'aggregate productivity gains from AI over the next decade.' The title says 'Economic Productivity' broadly. Section 5 acknowledges that the study is limited to translation and short tasks, but the 6.9% aggregate estimate in Section 4 extends well beyond translation."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "No substantive discussion of confounds such as: participants might use AI differently based on familiarity, the short task format may not reflect real professional translation, learning effects across the 5 tasks, or whether productivity gains reflect genuine quality or surface-level fluency. Section 5 mentions limitations but does not discuss alternative explanations for the observed scaling relationship."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper measures task completion time, a 7-point grade from 3 evaluators, and earnings per minute, then frames these as 'productivity' and 'economic scaling laws.' No discussion of whether short online translation tasks proxy real professional translation productivity, or whether earnings per minute in an experimental setting maps onto actual economic productivity."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper mentions 13 LLMs including 'Claude-3.5 Sonnet' and references GPT-2/GPT-4 but does not provide specific version IDs or API snapshot dates for the 13 models used. The models are referenced primarily by their training compute, not by exact versions."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "Participants used LLMs as translation aids in a free-form manner; the experiment did not use specific prompts as part of the methodology. The participants chose how to interact with the models."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No hyperparameters (temperature, top-p, etc.) are reported for the 13 LLMs used in the experiment."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. Participants interact directly with LLMs for translation assistance."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The paper does not describe data cleaning or preprocessing steps between raw experimental data and the regression analyses. No mention of how outliers were handled or how the final sample of 1,392-1,500 observations was derived from 300 participants × 6 tasks = 1,800 potential observations."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 5 (Discussion) includes explicit limitations: 'There are many limitations to this study. This paper focused on a single professional skill (translation) and only tested participants on short tasks. Further, the results were only derived on a range of just over two orders of magnitude of compute.'"
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The limitations are specific to this study: single skill (translation), short tasks only, limited compute range (two orders of magnitude), and uncertainty about generalization to other domains. These are not generic boilerplate."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "While Section 5 mentions limitations, the paper does not explicitly state what the results do NOT show. The 6.9% aggregate productivity estimate in Section 4 extends well beyond what the data supports, and the scope boundaries around this extrapolation are not clearly drawn."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No raw data is made available. The individual translator responses, timing data, grades, and model interactions are not released."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 2 describes the experimental design: 300 translators recruited via Freelancer and Fiverr, completing 6 tasks each (1 baseline + 5 experimental), with detailed task descriptions in Appendix B, grading by 3 professionals, and payment structure."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 2 states participants were recruited through Freelancer and Fiverr, split evenly across Spanish/Hindi/Arabic, with screening criteria (1+ year experience, paid tasks in past year, language standards, consent to monitoring)."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "There is no documentation of the pipeline from raw experimental data to final regression tables. The observation counts vary between tables (1,500 vs 1,392 vs 1,391) with no explanation of why observations were dropped at each stage."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper states: 'I gratefully acknowledge financial support from Open Philanthropy to conduct this research.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliation is disclosed as Yale University, Department of Economics. The paper does not evaluate a product made by Yale or Open Philanthropy."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Open Philanthropy is a major funder of AI safety and AI-related research with a known position that AI will be transformative. The paper's finding that AI scaling produces large productivity gains aligns with the funder's worldview. While not a direct financial conflict, the funder is not fully independent of the outcome."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement is present in the paper. There is no declaration of whether the author has any financial interests related to the findings."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This paper does not evaluate pre-trained model capability on a benchmark. It is an RCT measuring human productivity when assisted by LLMs. Contamination of LLM training data is not relevant to the experimental design."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Not a benchmark evaluation. The paper measures human performance on novel translation tasks, not model performance on a test set."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not a benchmark evaluation. The tasks are original translation passages, not drawn from a pre-existing benchmark."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "The paper states it 'was preregistered at the AEA RCT Registry (AEARCTR-0013743)' in the abstract footnote."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "The paper states: 'The research described in this article was approved by the Yale Human Research Protection Program.'"
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Section 2 reports: 70%+ had 3+ years experience, ~33% had 5+ years, self-reported translation ability 4.52/5, professional fluency 4.85/5, AI familiarity 4.15/5, AI ability 4.18/5. Languages split evenly across Spanish, Hindi, Arabic."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 2 states: at least one year professional experience, completed paid translation tasks in past year, met language ability standards, comfortable with AI usage monitoring. Baseline task served as quality screen."
    260       },
    261       "randomization_described": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "The paper states participants were 'randomly assigned to either treatment groups where they could utilize one of thirteen LLMs... or to a control group' but does not describe the randomization procedure (stratification, tool used, block randomization, etc.)."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "No mention of blinding. It is unclear whether participants knew which model they received, whether graders were blind to treatment condition, or whether the analyst was blinded during analysis."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper mentions 300 participants and 1,800 tasks, but regression tables show varying N (1,500, 1,392, 1,391) with no explanation of attrition or why observations were excluded."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Section 4 footnote states: 'the inference costs for all 300 translators completing nearly 1,400 tasks with AI assistance combined came to significantly less than a dollar.' Also mentions $1 per 100,000 syllables for leading models."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget for the experiment is stated. While inference costs are mentioned, the total experimental cost (participant payments, grader payments, infrastructure) is not reported."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "A 10x increase in model training compute improves task completion speed by 12.3%",
    293       "evidence": "Section 3.2, Table 3: regression coefficient of -51.35 seconds per log10 compute unit (p=0.001), corresponding to 12.3% reduction in time.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "A 10x increase in model compute improves grades by 0.18 standard deviations (0.25 points on 7-point scale)",
    298       "evidence": "Section 3.3, Table 4: regression coefficient of 0.2584 grade points per log10 compute (p=0.000).",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "A 10x increase in model compute increases earnings per minute by 16.1%",
    303       "evidence": "Section 3.4, Table 5: regression coefficient of 0.1921 earnings per minute per log10 compute (p=0.001).",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "Productivity gains are 4x larger for lower-skilled workers (21.1% vs 4.9% time reduction per 10x compute)",
    308       "evidence": "Section 3.5, Table 1: interaction term Skill×logmodelcompute = -69.399 (p=0.017). High-skill 4.9% reduction, low-skill 21.1% reduction.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Continued model scaling could boost U.S. productivity by at least 6.9% over the next decade",
    313       "evidence": "Section 4: calculation using 19.9% task exposure × 61.2% productivity effect × 57% labor share ≈ 6.9% (0.199 × 0.612 × 0.57 = 0.0694). Based on Hulten's theorem framework from Acemoglu (2024).",
    314       "supported": "weak"
    315     }
    316   ],
    317   "methodology_tags": ["rct"],
    318   "key_findings": "In a pre-registered RCT with 300 professional translators and 13 LLMs, a 10x increase in model training compute reduced task completion time by 12.3%, improved quality grades by 0.18 SD, and increased earnings per minute by 16.1%. Gains were 4x larger for lower-skilled workers. The paper extrapolates from these translation-specific findings to estimate 6.9% U.S. productivity growth over the next decade, an order of magnitude higher than Acemoglu (2024).",
    319   "red_flags": [
    320     {
    321       "flag": "Extrapolation far beyond tested domain",
    322       "detail": "The 6.9% aggregate U.S. productivity estimate in Section 4 extrapolates from a single task domain (short translation tasks) to the entire economy. The paper acknowledges translation may be the most AI-exposed occupation, yet uses the same scaling ratio for all tasks. The claim of 'at least 6.9%' (framed as a lower bound) rests on strong assumptions about scaling law generalizability."
    323     },
    324     {
    325       "flag": "Unexplained observation attrition",
    326       "detail": "300 participants × 6 tasks = 1,800 potential observations, but Table 2 shows 1,500 (any-AI analysis) and Tables 3-5 show 1,392/1,391 (scaling analysis). The drop from 1,800 to 1,500 would be consistent with excluding the baseline task (300 participants × 5 experimental tasks = 1,500), but the paper does not say so, and the further reduction to 1,392/1,391 is never explained. Missing data could bias results."
    327     },
    328     {
    329       "flag": "Short artificial tasks may not represent professional work",
    330       "detail": "Tasks averaged ~10 minutes each. Participants rated similarity to professional work at only 3.53/5. Real translation involves longer documents, domain expertise, client interactions, and iterative revision — the experimental setup may overstate productivity gains for simple tasks."
    331     },
    332     {
    333       "flag": "Funder alignment with findings",
    334       "detail": "Open Philanthropy, which funded the research, is a major funder of AI-related research and has organizational positions on AI's transformative potential. The paper's core finding (large productivity gains from AI scaling) aligns with the funder's worldview."
    335     },
    336     {
    337       "flag": "Recruitment via gig platforms",
    338       "detail": "Participants were recruited from Freelancer and Fiverr. Gig-platform translators may differ systematically from translators working in agencies or in-house roles in terms of skill level, AI familiarity, and task approach. The paper does not discuss this selection bias."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "The Simple Macroeconomics of AI",
    344       "authors": ["Daron Acemoglu"],
    345       "year": 2024,
    346       "relevance": "Key framework paper for estimating aggregate productivity gains from AI; this paper directly extends and critiques its estimates."
    347     },
    348     {
    349       "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence",
    350       "authors": ["Shakked Noy", "Whitney Zhang"],
    351       "year": 2023,
    352       "doi": "10.1126/science.adh2586",
    353       "relevance": "RCT finding 37% productivity improvements from ChatGPT for professional writing tasks; used as a key parameter in the aggregate estimate."
    354     },
    355     {
    356       "title": "Generative AI at Work",
    357       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey R. Raymond"],
    358       "year": 2023,
    359       "relevance": "Real-world case study of AI productivity effects in call centers (14% improvement); used as a key parameter in the aggregate estimate."
    360     },
    361     {
    362       "title": "Navigating the Jagged Technological Frontier: Field Experimental Evidence of the Effects of AI on Knowledge Worker Productivity and Quality",
    363       "authors": ["Fabrizio Dell'Acqua"],
    364       "year": 2023,
    365       "relevance": "BCG consultant field experiment showing heterogeneous productivity effects of GPT-4 across task types."
    366     },
    367     {
    368       "title": "Scaling Laws for Neural Language Models",
    369       "authors": ["Jared Kaplan"],
    370       "year": 2020,
    371       "arxiv_id": "2001.08361",
    372       "relevance": "Foundational scaling laws paper showing consistent relationship between compute and cross-entropy loss; this paper extends scaling laws to economic outcomes."
    373     },
    374     {
    375       "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models",
    376       "authors": ["Tyna Eloundou"],
    377       "year": 2023,
    378       "arxiv_id": "2303.10130",
    379       "relevance": "Estimates 19.9% of tasks exposed to AI, directly used as a parameter in the aggregate productivity calculation."
    380     },
    381     {
    382       "title": "Evidence of a Log Scaling Law for Political Persuasion with Large Language Models",
    383       "authors": ["Kobi Hackenburg"],
    384       "year": 2024,
    385       "arxiv_id": "2406.14508",
    386       "relevance": "Derives scaling laws for LLM persuasion performance, the most closely related prior work on economic/social scaling laws."
    387     },
    388     {
    389       "title": "Beyond AI Exposure: Which Tasks are Cost-Effective to Automate with Computer Vision?",
    390       "authors": ["Maja Svanberg", "Wensu Li", "Martin Fleming", "Brian Goehring", "Neil Thompson"],
    391       "year": 2024,
    392       "relevance": "Estimates 23% of automatable tasks are economically feasible; this paper argues the estimate is too conservative for LLM tasks."
    393     },
    394     {
    395       "title": "Productivity Assessment of Neural Code Completion",
    396       "authors": ["Albert Ziegler"],
    397       "year": 2022,
    398       "relevance": "GitHub Copilot productivity assessment showing double-digit code completion improvements; one of the key AI productivity studies referenced."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs