scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22753B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models",
      6     "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
      7     "year": 2023,
      8     "venue": "arXiv",
      9     "arxiv_id": "2303.10130"
     10   },
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No code repository or URL is provided in the paper. The exposure rubric is described but no analysis code is released."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The paper uses public O*NET and BLS data but does not release its own annotated exposure dataset. The human and GPT-4 annotations that form the core contribution are not made publicly available."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No environment specifications, dependencies, or software versions are provided."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No reproduction instructions are provided. The rubric is given in Appendix A.1 but there are no instructions for replicating the annotation process or analysis pipeline."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Main results (Table 3, Figures 3-5) report means and standard deviations of exposure scores but no confidence intervals or error bars on the key exposure estimates."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Table 9 reports OLS regressions with standard errors and significance levels (*, **, ***) for validation against prior measures. Table 5 similarly reports significance for skill regressions."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports effect sizes in context throughout — e.g., '15% of all worker tasks could be completed significantly faster,' '80% of workers have at least 10% of tasks affected,' with baseline percentages across exposure levels (α, β, ζ) in Tables 3, 6, 7, 10."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper uses 1,016 occupations from O*NET and 19,265 tasks but does not justify why these sample sizes are adequate for the claims made. No power analysis is discussed."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Table 3 reports standard deviations for all exposure measures. Table 4 identifies occupations with highest variance. Table 8 provides std dev for all prior measures."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 5 and Table 9 compare the new LLM exposure measure against multiple prior efforts: Frey & Osborne, Webb, Brynjolfsson et al. (SML), Felten et al., and Acemoglu & Autor routine task measures."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Baselines include Felten et al. (2023) and Brynjolfsson et al. (2023), which are contemporary. Older measures like Frey & Osborne (2017) are included for historical comparison with justification."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper effectively ablates by comparing three exposure levels (α = LLM only, β = weighted, ζ = LLM + software) and comparing human vs. GPT-4 annotations, and two different GPT-4 rubric prompts (Table 2)."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper uses three exposure measures (α, β, ζ), agreement rates, and Pearson correlations (Table 2). Regressions use R² and coefficient significance."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Human annotators labeled DWAs and tasks using the exposure rubric (Section 3.3). The authors and experienced annotators from OpenAI's alignment work provided human ratings."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is not a predictive modeling paper — it produces descriptive exposure measures, not predictions to be tested on held-out data."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are broken down by occupation (Table 4), Job Zone (Table 6, Figure 5), education level (Table 10), on-the-job training (Table 7), skill importance (Table 5), and industry (Figures 6-7)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 3.4.3 discusses sources of disagreement, including tasks where annotators got 'stuck.' Table 11 lists 34 occupations with zero exposure. The paper acknowledges discrepancies between human and GPT-4 labels."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper reports that AI Occupational Exposure Score from Felten et al. is 'not correlated' with their measure in most specifications (Table 9). Also reports that LLM exposure appears uncorrelated with recent productivity growth."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims about 80% of workforce having 10% of tasks affected, 19% having 50% affected, ~15% direct exposure, 47-56% with software — all supported by Table 3 and summary statistics in Section 4.1."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper is careful to frame findings as 'exposure' and 'potential impact' rather than causal effects. Section 3.3 explicitly states exposure is 'a proxy for potential economic impact without distinguishing between labor-augmenting or labor-displacing effects.' The paper avoids causal language."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Section 6.3 explicitly limits to the US, acknowledges the forward-looking nature of predictions, and notes that 'our focus on the United States restricts the generalizability of our findings to other nations.' The abstract says 'U.S. labor market' not labor markets generally."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Section 3.4 discusses multiple limitations including annotator subjectivity (3.4.1), sensitivity to rubric wording (3.4.2), validity of the task-based framework, and sources of disagreement (3.4.3). Section 6.1 discusses adoption bottlenecks and confounds."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 3.3 explicitly states 'We define exposure as a proxy for potential economic impact without distinguishing between labor-augmenting or labor-displacing effects.' The paper repeatedly distinguishes technical exposure from actual adoption, productivity, or automation outcomes."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper says 'an early version of GPT-4' was used for classification (Section 3.3) without specifying the exact version, snapshot date, or API endpoint. Section 1 notes 'GPT-3.5 family but not in the GPT-4 family' for the rubric definition but not the classifier version."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The full exposure rubric/prompt used for annotation is provided in Appendix A.1, including detailed examples and category definitions. This is the prompt used for GPT-4 classification (with acknowledged slight modifications, Section 3.4.2)."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for the GPT-4 classification runs."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. GPT-4 is used as a direct classifier via prompting."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3.1 describes the O*NET data (19,265 tasks, 2,087 DWAs). Section 3.2 describes BLS data linkage via crosswalk. Section 3.3 describes the aggregation from DWA to task to occupation level with core/supplemental weighting."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.4 'Limitations of our methodology' contains three substantive subsections: subjective judgments (3.4.1), measuring with GPT-4 (3.4.2), additional weaknesses (3.4.3). Section 6.3 adds further limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3.4.1 discusses annotator homogeneity ('not occupationally diverse'). Section 3.4.2 discusses prompt sensitivity. Section 3.4.3 identifies specific issues: validity of task-based framework, annotator lack of occupation expertise, and forward-looking uncertainty."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6.3 states 'our focus on the United States restricts the generalizability.' The paper explicitly says it does not predict adoption timelines (abstract) and 'does not consider total factor productivity or capital input potential' (Section 4). It notes exposure ≠ automation."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The annotated exposure labels are not released. O*NET and BLS data are public but the paper's core contribution — the human and GPT-4 annotations — is not available for verification."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Sections 3.1-3.3 describe data sources (O*NET 27.2, BLS 2020-2021), annotation procedure (authors + contracted annotators), and the exposure rubric. The crosswalk method is specified."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 3.3 describes annotators: 'The authors personally labeled a large sample' and 'enlisted experienced human annotators who have reviewed GPT-3, GPT-3.5 and GPT-4 outputs as part of OpenAI's alignment work.' Acknowledgments name specific annotators."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The pipeline from O*NET DWAs → task aggregation → occupation aggregation with core/supplemental weighting is described. Three measures (α, β, ζ) are defined. Human vs. GPT-4 annotation tracks are described. BLS crosswalk is specified."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding sources are disclosed. Acknowledgments thank individuals for feedback but do not mention grants or funding."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are prominently listed: three of the four authors are from OpenAI (one of them also affiliated with OpenResearch) and one is from University of Pennsylvania. The OpenAI affiliation is clear."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Three of four authors are OpenAI employees. OpenAI has a direct financial interest in demonstrating the broad economic impact of LLMs/GPTs. The funder (effectively OpenAI as employer) is not independent of the outcome."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement is present. OpenAI employees evaluating the economic impact potential of OpenAI's products represents a clear potential conflict that is not explicitly declared."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. GPT-4 is used as a classifier/annotator, not tested on its knowledge."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "No benchmark evaluation of model capability is performed. GPT-4 is used as a labeling tool, not evaluated on a benchmark."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No benchmark evaluation is performed. The paper uses GPT-4 as an annotator, not as a test subject."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants in the experimental sense. Human annotators labeled tasks but were not study participants. This is a labor market analysis, not a human subjects study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants. Annotators were paid workers performing a labeling task, not research subjects."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study. Annotator characteristics are briefly described (familiar with LLM capabilities, OpenAI alignment annotators) but this is not a human subjects study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "The paper used GPT-4 to classify 19,265 tasks but does not report the API cost or compute time required."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No computational budget is stated for the GPT-4 annotation runs or the statistical analysis."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "About 80% of the U.S. workforce could have at least 10% of their work tasks affected by LLMs",
    293       "evidence": "Table 3 and Section 4.1: based on β exposure values aggregated to occupation level with core/supplemental task weighting.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "Approximately 19% of workers may see at least 50% of their tasks impacted by LLMs and LLM-powered software",
    298       "evidence": "Section 4.1: '19% of workers are in an occupation where over half of its tasks are labeled as exposed' based on β values.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "About 15% of all worker tasks could be completed significantly faster with LLM alone",
    303       "evidence": "Table 3: mean α values of 0.14-0.15 from both human and GPT-4 annotations.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "With LLM-powered software, 47-56% of all tasks could be completed significantly faster",
    308       "evidence": "Table 3: mean ζ values of 0.47 (human) and 0.56 (GPT-4) at occupation level.",
    309       "supported": "weak"
    310     },
    311     {
    312       "claim": "Higher-income jobs face greater exposure to LLM capabilities",
    313       "evidence": "Figure 4 binscatter, Table 6 Job Zone analysis, Table 10 education-based analysis all show increasing exposure with wage/education level.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "LLMs exhibit traits of general-purpose technologies",
    318       "evidence": "Section 6.1 argues LLMs meet three GPT criteria: improvement over time (literature review), pervasiveness (exposure analysis), complementary innovations (α vs ζ gap). Evidence is conceptual rather than rigorous testing of GPT criteria.",
    319       "supported": "moderate"
    320     }
    321   ],
    322   "methodology_tags": ["observational", "qualitative"],
    323   "key_findings": "Using human and GPT-4 annotations on O*NET occupation data, the paper estimates ~15% of US worker tasks are directly exposed to LLMs, rising to 47-56% with LLM-powered software. Higher-wage occupations show greater exposure, reversing patterns seen with prior automation technologies. The exposure measure correlates positively with prior software/AI exposure scores but negatively with manual/robotics measures, with 28-40% unexplained variance. The authors argue LLMs qualify as general-purpose technologies based on their pervasiveness and complementary innovation potential.",
    324   "red_flags": [
    325     {
    326       "flag": "Company evaluating its own product",
    327       "detail": "Three of four authors are OpenAI employees. The paper argues for the broad economic impact of GPTs/LLMs — OpenAI's core product. While the paper frames results as 'exposure' rather than benefits, the framing still serves OpenAI's narrative that GPTs are transformative general-purpose technologies."
    328     },
    329     {
    330       "flag": "Annotator pool not occupationally diverse",
    331       "detail": "The paper acknowledges in Section 3.4.1 that annotators 'are not occupationally diverse, potentially leading to biased judgments regarding LLMs' reliability and effectiveness in performing tasks within unfamiliar occupations.' Annotators were OpenAI alignment workers, likely biased toward overestimating LLM capabilities."
    332     },
    333     {
    334       "flag": "Forward-looking exposure estimates presented as findings",
    335       "detail": "The ζ measure (LLM + software) requires imagining future software that doesn't yet exist. This makes the 47-56% exposure figure speculative by design, yet it is the headline number in the abstract. The β measure weights this at 50%."
    336     },
    337     {
    338       "flag": "No release of annotation data",
    339       "detail": "The core contribution — human and GPT-4 exposure labels for 19,265 tasks — is not released, preventing independent verification of the coding decisions that drive all results."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Evaluating large language models trained on code",
    345       "authors": ["Chen, M.", "Tworek, J.", "Jun, H."],
    346       "year": 2021,
    347       "arxiv_id": "2107.03374",
    348       "relevance": "Foundational code generation benchmark (Codex/HumanEval) widely used in LLM programming evaluations."
    349     },
    350     {
    351       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    352       "authors": ["Peng, S.", "Kalliamvakou, E.", "Cihon, P.", "Demirer, M."],
    353       "year": 2023,
    354       "arxiv_id": "2302.06590",
    355       "relevance": "RCT on GitHub Copilot productivity effects — directly relevant to LLM programming productivity claims."
    356     },
    357     {
    358       "title": "Experimental evidence on the productivity effects of generative artificial intelligence",
    359       "authors": ["Noy, S.", "Zhang, W."],
    360       "year": 2023,
    361       "relevance": "Experimental evidence on generative AI productivity effects on writing tasks."
    362     },
    363     {
    364       "title": "On the opportunities and risks of foundation models",
    365       "authors": ["Bommasani, R.", "Hudson, D. A.", "Adeli, E."],
    366       "year": 2021,
    367       "arxiv_id": "2108.07258",
    368       "relevance": "Comprehensive survey of foundation model capabilities, risks, and societal implications."
    369     },
    370     {
    371       "title": "Augmented language models: a survey",
    372       "authors": ["Mialon, G.", "Dessì, R.", "Lomeli, M."],
    373       "year": 2023,
    374       "arxiv_id": "2302.07842",
    375       "relevance": "Survey of augmented LLMs including tool use and retrieval — relevant to LLM-powered software capabilities."
    376     },
    377     {
    378       "title": "Toolformer: Language models can teach themselves to use tools",
    379       "authors": ["Schick, T.", "Dwivedi-Yu, J.", "Dessì, R."],
    380       "year": 2023,
    381       "arxiv_id": "2302.04761",
    382       "relevance": "Key paper on LLM tool use capabilities that enable the LLM-powered software discussed in this paper."
    383     },
    384     {
    385       "title": "Training language models to follow instructions with human feedback",
    386       "authors": ["Ouyang, L.", "Wu, J.", "Jiang, X."],
    387       "year": 2022,
    388       "arxiv_id": "2203.02155",
    389       "relevance": "RLHF paper fundamental to the instruction-following capabilities that make LLMs useful for task completion."
    390     },
    391     {
    392       "title": "Language models are few-shot learners",
    393       "authors": ["Brown, T.", "Mann, B.", "Ryder, N."],
    394       "year": 2020,
    395       "relevance": "GPT-3 paper establishing large-scale language model capabilities for diverse tasks."
    396     },
    397     {
    398       "title": "The future of employment: How susceptible are jobs to computerisation?",
    399       "authors": ["Frey, C. B.", "Osborne, M. A."],
    400       "year": 2017,
    401       "relevance": "Seminal automation exposure study that this paper directly builds upon and compares against."
    402     },
    403     {
    404       "title": "What can machines learn, and what does it mean for occupations and the economy?",
    405       "authors": ["Brynjolfsson, E.", "Mitchell, T.", "Rock, D."],
    406       "year": 2018,
    407       "relevance": "Suitability for Machine Learning (SML) rubric that this paper's methodology directly extends to LLMs."
    408     }
    409   ]
    410 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs