scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28753B)
      1 {
      2   "paper": {
      3     "title": "Beyond Automation: Redesigning Jobs with LLMs to Enhance Productivity",
      4     "authors": [
      5       "Andrew Ledingham",
      6       "Michael Hollins",
      7       "Matthew Lyon",
      8       "David Gillespie",
      9       "Umar Yunis-Guerra",
     10       "Jamie Siviter",
     11       "David Duncan",
     12       "Oliver P. Hauser"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2512.05659",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The authors released an open-source Python package called LLMBo (Large Language Model Batch Operations) under MIT license on PyPI, with documentation at https://co-cddo.github.io/gds-idea-llmbo/. However, the full analysis pipeline code is not explicitly stated as released."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The GRID dataset of UK Civil Service job vacancies is described as not publicly available — 'historic data is not publicly available: adverts are removed once applications close, and the Government does not publish an archive or database of past job adverts.' No dataset download is provided."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or detailed environment specifications are provided. The paper mentions Python and specific model names but does not provide dependency or environment details sufficient to recreate the setup."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided. The methodology is described in detail, but there are no README-style instructions or scripts to replicate the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Main results are reported as point estimates (e.g., '£5.2bn productivity gains', '18% of UKCS jobs fully automatable'). No confidence intervals or error bars are provided for the exposure scores or savings estimates."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes comparative claims (e.g., lower grades more exposed than higher grades, productivity gains outweigh cost reductions) but no statistical significance tests (p-values, t-tests, etc.) are reported."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports effect sizes in context throughout, e.g., '£5.2bn in productivity gains vs £1.1bn in cost reductions', 'productivity gains are four times higher than potential cost reductions' at θ=0.8, and specific time changes per week (e.g., '97 minutes less on data analysis', '91 minutes more on stakeholder engagement')."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The sample of 193,497 vacancies is described in detail but its adequacy is not formally justified. No power analysis is discussed. The paper acknowledges representativeness issues (over-representation of higher grades) but does not justify sample size for statistical claims."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Standard deviations are reported for exposure scores (mean 0.53, SD 0.18), salary data (SD £13,622), tasks per role (SD 3.36), and interquartile ranges for FTE shares freed up (lower quartile 0.17, upper quartile 0.50). The K-means clustering is based on mean and standard deviation of exposure scores."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares its bottom-up GRID approach against O*NET and ISCO-08 baseline datasets (Section 3.2.2, Figure 3), showing the distribution of economist exposure scores across all three. It also compares equal weighting versus decay-weighted approaches."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The baselines include O*NET 29.3 and ISCO-08, which are the standard databases used in contemporary literature (Felten et al. 2021, 2023; Gmyrek et al. 2023; Eloundou et al. 2024). These are the most current datasets available for this type of analysis."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper includes multiple robustness checks that function as ablations: varying the decay rate parameter (δ = 0.5, 0.75, 1.0), varying the automation threshold (θ from 0 to 1), comparing single focus task vs. task reordering vs. new task creation, and comparing across income deciles."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses multiple metrics: mean and standard deviation of exposure scores, proportion of time on high-exposure tasks, four exposure clusters, estimated cost reductions vs. productivity gains, Spearman's rank correlation and Pearson correlation for validation, and Krippendorff's Alpha for inter-rater reliability."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section B.2 describes a validation exercise where four AI experts independently assigned exposure scores to a sample of 100 tasks, compared against the LLM's scores using Spearman's rank correlation (mean 0.673) and Krippendorff's Alpha (0.526). Manual task extraction comparison is also described in Section B.1."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "This is not a machine learning benchmark evaluation paper. The study applies LLMs as tools for scoring and redesigning jobs, not training or testing predictive models on held-out data."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive breakdowns are provided: by grade (AA/AO through SCS), by department (37 departments), by profession (28+), by exposure cluster (Low, Augmentation, Adaptation, Automation), by task category (10 categories with 3 subcategories each), and by income decile."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses where the LLM approach fails or has limitations: poor inter-rater agreement among human experts (Krippendorff's Alpha 0.526), LLM tendency to assign higher exposure scores than humans, the risk management over-emphasis in new task creation, and job descriptions being 'promotional products' that may not fully represent actual tasks."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports that Krippendorff's Alpha among human experts was only 0.526 (indicating poor agreement), that the LLM tends to assign higher scores than humans, and that the new-task robustness check likely overstates risk management tasks. The paper also notes that the task reordering robustness check shows 'little difference' to the main focus task approach."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims about heterogeneous AI exposure, focus on strategic leadership/complex problem resolution/stakeholder management post-redesign, and productivity gains outweighing displacement are all supported by results in Sections 3.2-3.4 with specific figures (e.g., 26% strategic leadership, £5.2bn vs £1.1bn)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes predictive causal claims ('automation and augmentation are expected to have nuanced effects', 'most economic value of AI is expected to arise from productivity gains') but the study design is a simulation using LLM-generated exposure scores, not an actual deployment. The paper says AI 'will automate' tasks and 'leads to' various outcomes without adequate causal identification strategies."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper explicitly bounds its findings to the UK Civil Service context while arguing the framework is transferable: 'While the investigation in this paper focused on the impact of AI in the UKCS, we believe our approach can be used for any organisation with job descriptions.' The Discussion section notes context-dependence and limitations of the UKCS sample."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 4 (Discussion) discusses alternative explanations: job descriptions may be 'promotional products' that understate routine tasks, the LLM-led redesign may overstate risk management, the decay rate assumption could bias results, and actual time allocation may differ from assumed weights. Multiple robustness checks address alternative parameter choices."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 2.1 specifies: 'Claude Sonnet 3.5 v2 (\"anthropic.claude-3-5-sonnet-20241022-v2:0\")'. Foundation model comparison in Section B.3 also lists: Claude Haiku 3.5 ('anthropic.claude-3-5-haiku-20241022-v1:0'), Llama 3.3 70B Instruct ('meta.llama3-3-70b-instruct-v1:0'), and Mistral Large ('mistral.mistral-large-2402-v1:0')."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix C provides the full prompts used for task extraction (C.1), exposure scoring for O*NET/ISCO-08 (C.2), task clustering (C.3), and all five focus task/redesign prompts (C.4, Prompts 1-5), including the complete Pydantic tool schemas."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, or other LLM sampling hyperparameters are reported. The paper specifies model versions and the use of structured output via tool calling, but omits sampling parameters which significantly affect output."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The LLM pipeline is described in detail: tool calling for structured output, batch inference via AWS Bedrock, the LLMBo Python package, retry logic for swapped job summary/description fields, and the multi-stage workflow (Figure 1: task extraction → exposure scoring → clustering → redesign)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Sections 1, 2.2, and A.1 document preprocessing: filtering to departments with 1,000+ vacancies and UKCSS matching data, removing processing errors and jobs with fewer than two tasks (212,048 → 193,260), removing 'Other'/'Industrial' grades, and NLP preprocessing for clustering (removing digits, punctuation, and stopwords; lowercasing; tokenizing; lemmatizing)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4 (Discussion) contains extensive limitations discussion spanning multiple paragraphs, covering task weighting assumptions, job description representativeness, LLM reliability, the automation threshold choice, and workforce transition challenges."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper discusses specific threats: 'job descriptions are often promotional products aimed at attracting applicants' which may understate routine tasks; the Krippendorff's Alpha of 0.526 among human raters indicates poor baseline agreement; the LLM's tendency to assign higher exposure scores than humans; the GRID data over-represents higher grades; and the risk management bias in new task creation."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope boundaries: it focuses on the UK Civil Service only, does not consider new job creation from technological shocks (footnote 2), does not include employment costs beyond salary (footnote 15), acknowledges the dataset does not cover all UKCS roles, and notes that results 'will depend on the specific job role and context.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The GRID data is not publicly available — it is a backend feed from Civil Service Jobs with no public archive. The authors 'obtained an extract' for their analysis but do not provide it for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section A.1 describes data collection: 'an extract of all roles published on Civil Service Jobs between 16th January 2019 and 3rd December 2024', covering the GRID backend feed. Table 4 lists all 11 variables. Section A.2 describes the UKCSS join using Tables 21 and 25."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "For the AI expert validation (Section B.2), recruitment is described: 'a group of four AI experts' including 'an academic, with research focussing on the impact of AI, and three UKCS Data Scientists, each with experience of building internal AI tools.' The main dataset uses the universe of Civil Service job postings rather than recruited participants."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented across Sections 1-2 and Appendices A-D: GRID extraction → department filtering (187 → 37) → LLM task extraction → error removal (212,048 → 193,260) → exposure scoring → IPF reweighting → clustering → redesign. Batch job statistics (Table 8) document token counts and costs per department."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The footnote on page 1 states: 'O.P.H. is grateful for financial support from the University of Exeter and the UKRI Future Leaders Fellowship (project reference: 1067).'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are disclosed: seven authors are from the Government Digital Service, UK Department for Science, Innovation and Technology; one (Hauser) is from the University of Exeter and the UK Cabinet Office Evaluation Task Force. They are studying the employer they work for."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding is from the University of Exeter and UKRI Future Leaders Fellowship, which are independent research funding bodies without a financial interest in whether AI adoption leads to productivity gains or job displacement in the UKCS."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided. Seven of eight authors work for the UK government organization being studied, which is a potential conflict (demonstrating AI value could benefit their department), but this is not explicitly acknowledged as a conflict."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper uses LLMs as tools for scoring and redesigning tasks, not evaluating model capability on a benchmark. There is no benchmark contamination concern in the traditional sense."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — the paper does not evaluate a pre-trained model's capability on a benchmark. The LLM is used as an analytical tool, not as the subject of evaluation."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — no benchmark evaluation is performed. The paper uses LLMs for task extraction and exposure scoring, not benchmark testing."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The main study analyzes job vacancy data, not human participants. The expert validation (4 AI experts scoring 100 tasks) is a minor validation check, not the core study design."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The study analyzes publicly posted job advertisements and government statistics, not human subjects. The small expert validation exercise does not constitute a human-subjects study requiring IRB approval."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the core study. The expert validators are briefly characterized (1 academic, 3 data scientists) but this is a minor validation check."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the core study. The paper analyzes job vacancy data."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No experimental study with human participants requiring randomization."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No experimental study with human participants requiring blinding."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the core study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 8 in Appendix D provides detailed cost breakdowns per department for the batch LLM jobs, including input tokens, output tokens, and estimated cost in dollars. Total costs can be computed from the table."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table 8 provides per-department token counts and dollar costs for the AWS Bedrock batch operations. The paper also mentions using AWS Bedrock batch inference as the compute platform. The total cost across all departments can be summed from the table."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Most UKCS roles (>60%) have medium AI exposure scores with high variance, while approximately 18% could be fully automated and 20% have low exposure.",
    296       "evidence": "Section 3.2.1 reports four exposure clusters: Low (20.81%), Augmentation (30.56%), Adaptation (30.95%), Automation (17.67%). Figure 4 visualizes the distribution.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "At the θ=0.8 threshold, productivity gains (£5.2bn) are approximately four times higher than potential cost reductions (£1.1bn) for the UKCS.",
    301       "evidence": "Section 3.3.1 and Figure 7 report that at θ=0.8, 34,182 roles (6.7% FTE) would be displaced (£1.1bn cost savings) while 392,065 roles (76% FTE) could gain productivity (£5.2bn). Figure 7c shows the 4:1 ratio.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "After job redesign, 26% of new tasks focus on strategic leadership, 18% on complex problem resolution, and 17% on stakeholder management.",
    306       "evidence": "Table 3 in Section 3.4.1 provides the breakdown of reasoning themes for focus tasks with percentages. Figure 9 shows the time reallocation before and after.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Lower-grade roles are more exposed to AI than senior roles, with 47% of SCS roles in the Low exposure cluster vs. only 9% of EO roles.",
    311       "evidence": "Section 3.2.3 reports these percentages directly, noting that AA/AO roles in Automation spend 60.4% of tasks on Records Management and Admin Support.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "The bottom-up GRID approach reveals substantial heterogeneity in AI exposure across seemingly identical jobs, unlike single-point estimates from O*NET or ISCO-08.",
    316       "evidence": "Section 3.2.2 and Figure 3 compare 2,773 economist roles in GRID (with a distribution of scores) against single point estimates from O*NET (0.65) and ISCO-08 (0.74).",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "LLM exposure scores show moderately strong correlation with human expert assessments (Spearman's ρ = 0.673).",
    321       "evidence": "Section B.2 reports mean Spearman's rank correlation of 0.673 and Pearson of 0.649 between LLM and 4 AI experts on 100 tasks. Krippendorff's Alpha among humans was only 0.526.",
    322       "supported": "moderate"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "observational",
    327     "case-study"
    328   ],
    329   "key_findings": "The paper analyzes 193,497 UK Civil Service job vacancies (1,542,411 tasks) using LLM-assigned AI exposure scores and finds that most jobs have heterogeneous, partial AI exposure rather than being fully automatable. At a conservative 80% automation threshold, productivity gains from job redesign (£5.2bn) far exceed cost savings from full automation (£1.1bn). Post-redesign, freed-up time shifts toward human-centric tasks: strategic leadership (26%), complex problem resolution (18%), and stakeholder management (17%). Lower-grade roles are substantially more exposed to AI automation than senior roles, which are concentrated in low-exposure clusters.",
    330   "red_flags": [
    331     {
    332       "flag": "Authors studying their own employer",
    333       "detail": "Seven of eight authors work for the UK Government Digital Service/DSIT, and the study analyzes UK Civil Service job data. This creates a potential conflict where demonstrating productivity gains from AI could benefit their own department's priorities, though the funding is independent."
    334     },
    335     {
    336       "flag": "LLM-generated exposure scores treated as ground truth",
    337       "detail": "The entire analysis rests on Claude Sonnet 3.5 v2's AI exposure scores, which showed only moderately strong correlation with human experts (ρ = 0.673) and systematic upward bias. The human experts themselves had poor agreement (Krippendorff's Alpha = 0.526), making the validity of the scoring uncertain."
    338     },
    339     {
    340       "flag": "No statistical significance testing for key claims",
    341       "detail": "The paper makes numerous comparative claims (lower vs. higher grades, productivity vs. cost savings, pre- vs. post-redesign time allocation) without any statistical significance tests. All comparisons are based on point estimates."
    342     },
    343     {
    344       "flag": "Circular reasoning in LLM-led job redesign",
    345       "detail": "The same LLM that scores tasks for automation potential is also used to redesign jobs post-automation. The finding that redesigned jobs emphasize 'human-centric' tasks may partly reflect the LLM's inherent biases about what work should look like rather than optimal productivity allocation."
    346     },
    347     {
    348       "flag": "Monetary savings estimates based on multiple stacked assumptions",
    349       "detail": "The £5.2bn and £1.1bn figures are computed by stacking assumptions: LLM exposure scores, a 0.75 decay rate for task ordering, median salary proxies (not actual salaries), and a single threshold value (θ=0.8). No uncertainty quantification is provided for these estimates."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "GPTs are GPTs: Labor Market Impact Potential of LLMs",
    355       "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
    356       "year": 2024,
    357       "relevance": "Foundational study on LLM exposure of occupations using O*NET tasks, which this paper extends with bottom-up task extraction."
    358     },
    359     {
    360       "title": "Generative AI and Jobs: A Global Analysis of Potential Effects on Job Quantity and Quality",
    361       "authors": ["Pawel Gmyrek", "Janine Berg", "David Bescond"],
    362       "year": 2023,
    363       "relevance": "Uses ISCO-08 framework to estimate AI exposure at task and occupation level, providing the classification methodology adapted by this paper."
    364     },
    365     {
    366       "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence",
    367       "authors": ["Shakked Noy", "Whitney Zhang"],
    368       "year": 2023,
    369       "relevance": "RCT measuring productivity effects of generative AI on writing tasks, foundational evidence for AI productivity claims."
    370     },
    371     {
    372       "title": "Generative AI at Work",
    373       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    374       "year": 2025,
    375       "doi": "10.1093/qje/qjae044",
    376       "relevance": "Documents 15% productivity boost from AI conversational assistant in customer service, key evidence for AI productivity gains."
    377     },
    378     {
    379       "title": "Navigating the Jagged Technological Frontier: Field Experimental Evidence of the Effects of AI on Knowledge Worker Productivity and Quality",
    380       "authors": ["Fabrizio Dell'Acqua", "Edward McFowland III", "Ethan R Mollick"],
    381       "year": 2023,
    382       "relevance": "Field experiment showing management consultants completed more tasks using generative AI, evidence for AI augmentation of knowledge work."
    383     },
    384     {
    385       "title": "Beyond AI Exposure: Which Tasks Are Cost-Effective to Automate with Computer Vision?",
    386       "authors": ["Maja Svanberg", "Wensu Li", "Martin Fleming", "Brian Goehring", "Neil Thompson"],
    387       "year": 2024,
    388       "relevance": "Argues many AI-exposed jobs are not economically attractive to automate, supporting the job redesign rather than displacement perspective."
    389     },
    390     {
    391       "title": "Canaries in the Coal Mine? Six Facts About the Recent Employment Effects of Artificial Intelligence",
    392       "authors": ["Erik Brynjolfsson", "Bharat Chandar", "Ruyu Chen"],
    393       "year": 2025,
    394       "relevance": "Presents evidence of employment displacement from AI in some contexts, directly relevant to the automation vs. augmentation debate."
    395     },
    396     {
    397       "title": "Artificial Intelligence and the Labor Market",
    398       "authors": ["Menaka Hampole", "Dimitris Papanikolaou", "Lawrence DW Schmidt", "Bryan Seegmiller"],
    399       "year": 2025,
    400       "relevance": "Studies heterogeneity of task-level AI exposure across firms using O*NET, complementary to this paper's within-firm approach."
    401     },
    402     {
    403       "title": "The EPOCH of AI: Human-Machine Complementarities at Work",
    404       "authors": ["Isabella Loaiza", "Roberto Rigobon"],
    405       "year": 2024,
    406       "relevance": "Finds a shift toward more human-intensive tasks in the US since 2016 using O*NET data, supporting this paper's redesign findings."
    407     },
    408     {
    409       "title": "On the Biology of a Large Language Model",
    410       "authors": ["Jack Lindsey", "Wes Gurnee", "Emmanuel Ameisen"],
    411       "year": 2025,
    412       "relevance": "Anthropic research on Claude's internal reasoning mechanisms, cited to qualify LLM 'reasoning' claims in the job redesign methodology."
    413     },
    414     {
    415       "title": "Generative AI Enhances Individual Creativity but Reduces the Collective Diversity of Novel Content",
    416       "authors": ["Anil R Doshi", "Oliver P Hauser"],
    417       "year": 2024,
    418       "relevance": "Study on AI's effect on creativity and innovation by one of this paper's authors, relevant to AI augmentation literature."
    419     },
    420     {
    421       "title": "Expertise",
    422       "authors": ["David Autor", "Neil Thompson"],
    423       "year": 2025,
    424       "relevance": "Proposes expertise as critical moderating factor for whether automation leads to higher or lower wages, aligning with this paper's findings on senior roles."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs