scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31657B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models",
      6     "authors": [
      7       "Eloundou, T.",
      8       "Manning, S.",
      9       "Mishkin, P.",
     10       "Rock, D."
     11     ],
     12     "year": 2023,
     13     "venue": "arXiv",
     14     "arxiv_id": "2303.10130",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All headline figures in the abstract (80%/19%/15%/47-56%) are supported by Table 3 and Section 4.1 from both human and GPT-4 annotations; the GPT-as-GPT general-purpose technology argument is developed in Section 6.1.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Comparative claims (higher wages correlate with more exposure, writing/programming skills positively associated) are framed as correlational and supported by OLS regressions; the paper explicitly states exposure is a proxy that does not imply causal displacement.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper explicitly limits findings to U.S. workers (O*NET/BLS data), acknowledges non-generalizability to other nations in Section 6.3, and repeatedly distinguishes technical exposure from actual economic outcomes.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 3.4.2 discusses sources of annotator disagreement; Section 6.1 considers alternative adoption trajectories (augmentation before automation); the unexpected higher-wage exposure pattern is discussed relative to prior automation literature.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly defines exposure as 'a proxy for potential economic impact' and states that 'social, economic, regulatory, and other determinants imply that technical feasibility does not guarantee labor productivity or automation outcomes' (Section 3.3).",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 3.4 'Limitations of our methodology' has three dedicated subsections; Section 6.3 'Limitations and Future Work' provides additional substantive discussion beyond boilerplate.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats enumerated include: annotators not occupationally diverse (Section 3.4.1), GPT-4 output sensitivity to prompt wording and examples (Section 3.4.2), task-based framework may omit tacit skills, and annotators were unaware of occupational context during labeling (Section 3.4.3).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicit boundaries stated: U.S. only; LLMs limited to text/code (vision excluded from direct exposure α); exposure ≠ adoption or displacement; no adoption timeline predictions made.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure statement appears anywhere in the paper. The acknowledgments thank individuals but do not identify a funding source.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed on the title page: Eloundou, Manning, and Mishkin at OpenAI; Manning also at OpenResearch; Rock at University of Pennsylvania.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Three of four authors are OpenAI employees, and the paper evaluates the labor market impact potential of GPT-family models — OpenAI's own commercial products. OpenAI's revenue and reputation are directly tied to findings about GPT capabilities.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, equity disclosures, or financial interest declarations appear anywhere in the paper despite authors' employment at OpenAI.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Exposure levels (E0/E1/E2/E3), DWA vs. task distinction, LLM vs. LLM-powered software vs. generative AI, and general-purpose technology criteria are all formally defined in Section 3.3 and Appendix A.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper states: 'This paper's primary contributions are to provide a set of measurements of LLM impact potential and to demonstrate the use case of applying LLMs to develop such measurements efficiently and at scale' (Section 2).",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 provides a thorough literature review; Section 5 directly compares the new measures to six prior exposure measures with OLS regression tables showing R² of 60-73%, demonstrating quantitative engagement rather than mere citation.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No code repository, GitHub link, or data release is mentioned. The exposure rubric is provided in Appendix A.1 but no processing or analysis code is released.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "O*NET is publicly available but the human and GPT-4 annotation data and derived occupation-level exposure scores are not released; no data repository or supplemental data files are provided.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No software environment, API version, Python dependencies, or system specifications are provided for the GPT-4 annotation pipeline or statistical analysis.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided. The rubric in Appendix A.1 describes the labeling task but the annotation data required to reproduce the results is not available.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "The widely cited headline figures (80% of workers, 19% with ≥50% tasks exposed) are presented as point estimates without confidence intervals. Regression coefficients have standard errors, but the primary descriptive claims lack uncertainty quantification.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "OLS regression tables (Tables 5 and 9) report statistical significance at 1%, 5%, and 10% levels with standard errors for all coefficients.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Exposure proportions are reported in original units (e.g., mean α = 0.14), regression coefficients are in interpretable original scales, and the variance explained (R² 60-73%) is reported for validation regressions.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or justification is provided for the number of human annotations collected; the paper does not state total number of DWA/task labels assigned by human raters versus GPT-4.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Table 3 reports standard deviations for occupation-level and task-level exposure across α, β, and ζ measures for both human and GPT-4 annotations.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Section 5 directly compares new measures to six prior baselines: Frey & Osborne automation, Brynjolfsson SML, Webb software/robot/AI patent scores, Felten AI Occupational Exposure Score, and Acemoglu & Autor routine task scores.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Comparisons include Brynjolfsson et al. (2023) and Felten et al. (2023), both contemporaneous with this paper; older measures (Frey & Osborne 2017) represent established field standards rather than weak baselines.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "The paper tests three exposure thresholds (α/β/ζ), two GPT-4 prompts for robustness (Rubric 1 and 2), both human and model annotations, and multiple aggregation weighting schemes (equal weight vs. core/supplemental weights).",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics used: three exposure levels, percent agreement, Pearson correlation (Table 2), R² (Table 9), and occupation-level breakdowns by wage, education, job zone, industry, and skill type.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Human annotators (authors plus contracted workers with LLM alignment experience) applied the exposure rubric to O*NET DWAs and a subset of tasks, serving as the paper's primary results source.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "NA — this is an annotation and measurement study, not a predictive modeling task; the concept of a held-out test set does not apply.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results broken down by job zone (Table 6), education level (Table 10), on-the-job training (Table 7), skill type (Table 5), industry (Appendix C), and individual occupation with top/bottom lists (Table 4).",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Appendix D lists 34 occupations with no exposed tasks; Section 3.4.2 systematically discusses categories of tasks where annotators consistently disagreed (meetings, regulated tasks, already-automated activities).",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper explicitly reports that recent industry productivity growth is uncorrelated with LLM exposure (Appendix C) — a notable null result with direct policy implications discussed in relation to Baumol's cost disease.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "GPT-4 is described only as 'an early version of GPT-4' with no snapshot date, API version identifier, or model checkpoint — insufficient for replication.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix A.1 provides the full exposure rubric used as a GPT-4 prompt, including definitions for all four exposure levels, decision criteria, and annotated examples.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "No temperature, top-p, frequency penalty, or other generation hyperparameters are reported for the GPT-4 annotation pipeline.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "NA — GPT-4 is used as a zero-shot classifier with a text prompt; there is no agentic scaffolding, tool use, or multi-step orchestration.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Aggregation methodology is documented: DWA-level labels aggregated to task level, then to occupation level; core tasks weighted 2× supplemental tasks; O*NET-BLS crosswalk procedure described in Sections 3.1-3.2.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "O*NET is publicly available but the derived human and GPT-4 exposure annotations are not released; no data repository or DOI for annotation data is provided.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.3 describes the two-stage annotation process: authors labeled physically demanding DWAs, contracted alignment workers labeled remaining activities, with additional task labels for disagreement cases.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": false,
    281           "justification": "Contractors described as 'experienced human annotators who have reviewed GPT-3, GPT-3.5 and GPT-4 outputs as part of OpenAI's alignment work' — but no count of annotators, formal selection criteria, or compensation details are provided.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from O*NET database selection → DWA/task annotation → task-level aggregation → occupation-level aggregation → regression analysis is documented in Sections 3.1-3.3 with explicit weighting rules.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "NA — GPT-4 is used as an annotation instrument rather than evaluated on a benchmark; standard benchmark contamination analysis does not apply.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "NA — GPT-4 is used as an annotation instrument rather than evaluated on a benchmark; O*NET task descriptions in GPT-4's training corpus could bias annotations, but the paper does not evaluate model benchmark performance, where this would be a standard concern.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "NA — no benchmark evaluation of model capabilities is performed; GPT-4 is used as a classification tool generating novel judgments.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "NA — human annotators serve as measurement instruments for data collection rather than as research subjects in a human subjects study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "NA — paid annotation workers are not research subjects in the clinical/behavioral sense requiring IRB review.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
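    327           "justification": "NA — human annotators serve as measurement instruments for data collection rather than as research subjects in a human subjects study, so participant demographics are not applicable.",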
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
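    333           "justification": "NA — human annotators serve as measurement instruments for data collection rather than as research subjects in a human subjects study, so participant inclusion/exclusion criteria are not applicable.",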
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
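    339           "justification": "NA — human annotators serve as measurement instruments for data collection rather than as research subjects in a human subjects study, so participant randomization is not applicable.",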
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
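    345           "justification": "NA — human annotators serve as measurement instruments for data collection rather than as research subjects in a human subjects study, so participant blinding is not applicable.",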
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
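    351           "justification": "NA — human annotators serve as measurement instruments for data collection rather than as research subjects in a human subjects study, so participant attrition is not applicable.",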
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API costs, token counts, or latency figures are reported for running GPT-4 across 19,265 task/occupation pairs.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total compute budget, GPU hours, annotation labor costs, or API expenditure is stated anywhere in the paper.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Approximately 80% of U.S. workers could have at least 10% of their work tasks affected by the introduction of LLMs.",
    374       "evidence": "Table 3 and Section 4.1: β-measure shows 80% of workers belong to an occupation with at least 10% of tasks exposed, consistent across human and GPT-4 annotations.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Approximately 19% of workers may see at least 50% of their tasks impacted by LLMs.",
    379       "evidence": "Table 3 and Section 4.1: both human (19%) and GPT-4 (21.6%) β-measures indicate that this share of workers belongs to occupations with ≥50% task exposure.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "About 15% of all U.S. worker tasks could be completed significantly faster with LLM access alone, without additional software.",
    384       "evidence": "Table 3: mean α-measure (direct LLM exposure only) is 0.15 for human and 0.14 for GPT-4 annotations at the task level.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "With LLM-powered software, 47-56% of all U.S. worker tasks could be significantly accelerated.",
    389       "evidence": "Table 3: ζ-measure (full software stack) shows 0.47 (human) to 0.56 (GPT-4) mean task-level exposure — the 47-56% range maps directly to these two estimates.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Higher-income workers face greater LLM exposure than lower-income workers, contrary to prior automation findings.",
    394       "evidence": "Figure 4 binscatter shows positive correlation between log wage and β-exposure; Table 6 shows exposure increasing from Job Zone 1 ($30K median, 6% β-exposure) to Job Zone 4 ($77K, 47-51%).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "LLMs exhibit characteristics of general-purpose technologies: pervasiveness, continuous improvement, and complementary innovation spawning.",
    399       "evidence": "Section 6.1 argues pervasiveness from this paper's data, improvement from prior literature, and complementary software potential from the α-to-ζ gap of 0.32-0.42. However, this is an interpretive argument applied to a technology still in early deployment, not a direct empirical test of GPT adoption diffusion.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "observational",
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "Using a novel exposure rubric applied by human annotators and GPT-4 to O*NET occupational task data, the paper finds that approximately 80% of U.S. workers could have at least 10% of their tasks affected by LLMs, with 19% facing ≥50% task exposure. Critically, higher-wage knowledge workers face greater LLM exposure than lower-wage workers — contrary to prior automation research that found the opposite. When accounting for LLM-powered software (tools built on top of LLMs), the share of affected tasks rises from ~15% (LLM alone) to 47-56%, suggesting complementary software innovations will drive far more economic impact than raw model capabilities. The paper argues these characteristics qualify LLMs as general-purpose technologies. Significant methodological concerns: three of four authors are OpenAI employees assessing their own product, GPT-4 is used as a self-referential annotator, no annotation data or code is released, and headline figures lack confidence intervals.",
    408   "red_flags": [
    409     {
    410       "flag": "OpenAI employees evaluating own product",
    411       "detail": "Three of four authors are OpenAI employees assessing the labor market impact potential of GPT-family models — OpenAI's commercial products. No competing interests are declared despite this direct financial and reputational interest in favorable findings."
    412     },
    413     {
    414       "flag": "GPT-4 as self-referential annotator",
    415       "detail": "GPT-4 is used both as the annotator judging which tasks it could accelerate and as the technology being assessed. This circularity means the model's self-assessment of its own capabilities directly shapes the study's conclusions."
    416     },
    417     {
    418       "flag": "No data or code released",
    419       "detail": "Human and GPT-4 annotation data, derived exposure scores, and analysis code are not released. Only the rubric prompt is provided in the appendix, making independent verification or replication effectively impossible."
    420     },
    421     {
    422       "flag": "Annotators not occupationally diverse",
    423       "detail": "Annotators were OpenAI alignment workers unfamiliar with most occupations they labeled. The paper acknowledges this leads to 'potentially biased judgments regarding LLMs' reliability and effectiveness in performing tasks within unfamiliar occupations.'"
    424     },
    425     {
    426       "flag": "Headline figures lack confidence intervals",
    427       "detail": "The widely cited 80%/19% figures are presented without confidence intervals or uncertainty bounds. The 50% time-reduction threshold defining 'exposure' is acknowledged as 'somewhat arbitrary.'"
    428     },
    429     {
    430       "flag": "Model version unspecified",
    431       "detail": "GPT-4 is described only as 'an early version of GPT-4' with no snapshot date, API version, or generation hyperparameters — the specific model used cannot be identified or reproduced."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "The Future of Employment: How Susceptible Are Jobs to Computerisation?",
    437       "relevance": "Frey & Osborne (2017) is the foundational prior work this paper directly compares against; their automation potential measure shows negative correlation with LLM exposure for physical tasks, situating LLMs in the broader automation literature."
    438     },
    439     {
    440       "title": "What Can Machines Learn, and What Does It Mean for Occupations and the Economy?",
    441       "relevance": "Brynjolfsson, Mitchell & Rock's Suitability for Machine Learning (SML) rubric directly inspired this paper's methodology; the SML measure is the strongest correlate of their new LLM exposure measure in Table 9."
    442     },
    443     {
    444       "title": "How Will Language Models Like ChatGPT Affect Occupations and Industries?",
    445       "relevance": "Contemporaneous work by Felten et al. (2023) using AI occupational exposure scores; directly compared in the validation section as the closest parallel independent effort."
    446     },
    447     {
    448       "title": "The Impact of Artificial Intelligence on the Labor Market",
    449       "relevance": "Webb (2020) patent-based automation measures are used as a primary baseline comparison; software patent exposure shows the strongest positive correlation with LLM exposure among prior measures."
    450     },
    451     {
    452       "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence",
    453       "relevance": "Noy & Zhang (2023) provides actual RCT-based productivity evidence complementing this paper's exposure analysis; cited as empirical validation that LLM exposure translates to real productivity gains."
    454     },
    455     {
    456       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    457       "relevance": "Peng et al. (2023) provides domain-specific empirical evidence (55% faster task completion for developers) that grounds the paper's theoretical exposure claims in real-world productivity measurement."
    458     },
    459     {
    460       "title": "On the Opportunities and Risks of Foundation Models",
    461       "relevance": "Bommasani et al. (2021) provides the broad context on LLM capabilities, risks, and societal implications that motivates this labor market analysis."
    462     },
    463     {
    464       "title": "Could Machine Learning Be a General Purpose Technology? A Comparison of Emerging Technologies Using Data from Online Job Postings",
    465       "relevance": "Goldfarb et al. (2023) is directly challenged: this paper argues LLMs independently qualify as general-purpose technologies even within the broader ML category."
    466     },
    467     {
    468       "title": "Skills, Tasks and Technologies: Implications for Employment and Earnings",
    469       "relevance": "Acemoglu & Autor (2011) task-based framework is the theoretical foundation for the occupation-as-task-bundle approach; their routine cognitive task scores serve as a validation baseline."
    470     },
    471     {
    472       "title": "New Frontiers: The Origins and Content of New Work, 1940-2018",
    473       "relevance": "Autor et al. (2022) on job creation and task reinstatement provides the framework for understanding LLMs as potentially creating new types of work in addition to displacing existing tasks."
    474     }
    475   ],
    476   "engagement_factors": {
    477     "practical_relevance": {
    478       "score": 3,
    479       "justification": "Directly actionable for workers, employers, and policymakers — the occupation-level exposure breakdown names specific jobs and their percentage exposure, enabling immediate career and policy planning."
    480     },
    481     "surprise_contrarian": {
    482       "score": 2,
    483       "justification": "The finding that higher-wage knowledge workers face MORE LLM exposure than lower-wage workers directly inverts the dominant narrative that AI primarily threatens blue-collar and routine jobs."
    484     },
    485     "fear_safety": {
    486       "score": 2,
    487       "justification": "Claims that 80% of U.S. workers face meaningful LLM task exposure raise significant labor displacement concerns, though carefully framed as potential exposure rather than certain displacement."
    488     },
    489     "drama_conflict": {
    490       "score": 2,
    491       "justification": "OpenAI employees publishing findings about GPT-4's potential to transform labor markets — using GPT-4 as an annotator for that analysis — is an inherent conflict-of-interest angle that attracted substantial public commentary."
    492     },
    493     "demo_ability": {
    494       "score": 1,
    495       "justification": "The rubric is provided and could be applied manually, but no interactive tool, public dataset, or API enables practitioners to explore occupation-level results themselves."
    496     },
    497     "brand_recognition": {
    498       "score": 3,
    499       "justification": "Published by OpenAI — the organization behind ChatGPT and GPT-4 — at peak public interest in LLMs in early 2023, maximizing both credibility and media reach."
    500     }
    501   },
    502   "hn_data": {
    503     "threads": [
    504       {
    505         "hn_id": "35226364",
    506         "title": "GPTs Are GPTs: An Early Look at the Labor Market Impact Potential of LLMs",
    507         "points": 190,
    508         "comments": 230,
    509         "url": "https://news.ycombinator.com/item?id=35226364",
    510         "created_at": "2023-03-20T02:04:37Z"
    511       },
    512       {
    513         "hn_id": "34600232",
    514         "title": "DetectGPT: Zero-Shot Machine-Generated Text Detection",
    515         "points": 64,
    516         "comments": 56,
    517         "url": "https://news.ycombinator.com/item?id=34600232",
    518         "created_at": "2023-01-31T19:12:56Z"
    519       },
    520       {
    521         "hn_id": "37847563",
    522         "title": "Grande: Gradient-Based Decision Tree Ensembles",
    523         "points": 25,
    524         "comments": 1,
    525         "url": "https://news.ycombinator.com/item?id=37847563",
    526         "created_at": "2023-10-11T17:32:38Z"
    527       },
    528       {
    529         "hn_id": "34543315",
    530         "title": "DetectGPT: Detecting if a passage was written by a language model",
    531         "points": 5,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=34543315",
    534         "created_at": "2023-01-27T06:42:32Z"
    535       },
    536       {
    537         "hn_id": "43556236",
    538         "title": "HiRAG: RAG with Hierarchical Knowledge",
    539         "points": 3,
    540         "comments": 1,
    541         "url": "https://news.ycombinator.com/item?id=43556236",
    542         "created_at": "2025-04-02T13:06:12Z"
    543       },
    544       {
    545         "hn_id": "39745657",
    546         "title": "Raft: Adapting Language Model to Domain Specific RAG",
    547         "points": 3,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=39745657",
    550         "created_at": "2024-03-18T15:12:18Z"
    551       },
    552       {
    553         "hn_id": "37640509",
    554         "title": "Clustering Compact RISC-V-Based Vector Units to Maximize Computing Efficiency",
    555         "points": 3,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=37640509",
    558         "created_at": "2023-09-25T07:14:34Z"
    559       },
    560       {
    561         "hn_id": "47400835",
    562         "title": "An early look at the labor market impact potential of LLMs (2023)",
    563         "points": 2,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=47400835",
    566         "created_at": "2026-03-16T16:05:22Z"
    567       },
    568       {
    569         "hn_id": "35236455",
    570         "title": "A Recipe for Watermarking Diffusion Models",
    571         "points": 1,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=35236455",
    574         "created_at": "2023-03-20T18:38:07Z"
    575       },
    576       {
    577         "hn_id": "37667449",
    578         "title": "Baichuan 2: Open Large-Scale Language Models. (ArXiv:2309.10305v1 [Cs.cl])",
    579         "points": 1,
    580         "comments": 0,
    581         "url": "https://news.ycombinator.com/item?id=37667449",
    582         "created_at": "2023-09-26T23:38:48Z"
    583       }
    584     ],
    585     "top_points": 190,
    586     "total_points": 297,
    587     "total_comments": 288
    588   }
    589 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs