scan.json (28610B)
1 { 2 "paper": { 3 "title": "Generative AI at Work", 4 "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"], 5 "year": 2023, 6 "venue": "NBER Working Paper / arXiv", 7 "arxiv_id": "2304.11771", 8 "doi": "10.3386/w31161" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "methodology_tags": ["observational", "rct"], 13 "key_findings": "Access to a generative AI conversational assistant increases customer support worker productivity by 15% (resolutions per hour), with gains of ~30% for the least skilled/experienced workers and minimal impact on the most skilled. AI assistance facilitates durable worker learning, as shown by productivity gains persisting during system outages. Customer sentiment improves markedly (0.5 SD increase), manager escalation requests decline ~25%, and worker attrition decreases, especially among newer agents.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository or analysis scripts are provided or linked in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The data comes from a Fortune 500 firm's proprietary customer service records. No data is released; this is understandable for confidentiality but still a NO." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, software versions, or computational setup details are provided for the econometric analysis." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are included. The econometric specifications are described (Equations 1-7, Appendix A.3) but no runnable code or scripts are provided." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "95% confidence intervals are shown on all event study figures (Figures 2, 3, 6, 8, 10, etc.) and robust standard errors are reported in parentheses in all regression tables." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Statistical significance is reported throughout with asterisks (*** p<0.01, ** p<0.05, * p<0.10) and clustered robust standard errors in all tables (Tables 2-4, A.1-A.11)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported with baseline context throughout: e.g., '0.30 chats or 15.2%' off pre-treatment mean of 1.97 RPH (Table 2), '3.7 minute decrease... an 8.5% decline from the baseline mean of 43 minutes' (Table 3), customer sentiment improves by 0.18 points 'equivalent to half of a standard deviation' (Section 6.1)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The sample size of 5,172 agents and 3 million chats is large and naturally determined by the firm's workforce, but no power analysis or formal sample size justification is provided." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Standard deviations are reported in summary statistics (Table 1, e.g., 'St. Average Handle Time 23-24 min'). Robust standard errors clustered at agent level are reported for all regressions. Multiple estimators are compared (Appendix Table A.9, Figure A.4)." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The difference-in-differences design uses pre-treatment observations and never-treated agents as baselines. 
The paper also compares against multiple alternative DiD estimators (Sun-Abraham, Callaway-Sant'Anna, Borusyak et al., de Chaisemartin-D'Haultfœuille)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The robust DiD estimators used (Sun and Abraham 2021, Borusyak et al. 2022, Callaway and Sant'Anna 2021) represent the state of the art in causal inference for staggered adoption designs." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically decomposes productivity into components (AHT, CPH, resolution rate, NPS in Table 3), examines heterogeneity by skill quintile (Figure 3A), tenure (Figure 3B), adherence (Figure 5B), topic frequency (Figure 7), and examines outage periods to isolate learning from real-time assistance (Figure 6)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Five main outcome metrics: resolutions per hour, average handle time, chats per hour, resolution rate, and net promoter score (Tables 2-3). Additional metrics include customer sentiment, agent sentiment, manager escalation rate, attrition, language fluency, and textual similarity." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Human evaluation is used to validate LLM-generated topic classifications (3 independent human evaluators on 100 conversations, Appendix A.2.6) and language fluency scores (2 independent human reviewers on 100 conversations, Appendix A.2.5)." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is an observational/quasi-experimental study, not a prediction task. The concept of held-out test sets does not apply to DiD causal inference designs." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Extensive breakdowns by worker skill quintile (Figure 3A, A.6), tenure group (Figure 3B, A.7), adherence quintile (Figure 5B, A.12), topic frequency (Figure 7), agent location (Figure 8C-D), and adoption cohort (Figure A.10)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper identifies where AI assistance fails or has negative effects: highest-skilled workers see 'small declines in quality' (Section 4.2.1, Figure A.6 Panels C-D showing negative effects on resolution rate and NPS for Q5 workers). The paper also discusses potential over-reliance by top performers." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results: no significant impact on customer satisfaction (NPS) overall (Table 3 Col 4); negative quality effects for highest-skilled workers (Panels C-D of Figure A.6); top performers' over-reliance on AI may reduce future model quality (Section 5.1). The paper also notes that AI recommendations may 'distract top performers' (Section 4.2.1)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "All abstract claims are supported: 15% productivity increase (Table 2, Col 3: 0.301/1.97 ≈ 15%), heterogeneity by skill (Figure 3), worker learning from outage analysis (Figure 6), gains largest for rare problems (Figure 7), customer sentiment improvement and reduced escalation (Figure 10, Table 4)." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper uses difference-in-differences with staggered rollout, includes a small RCT pilot (Section 4.1.1, Appendix Table A.1), instruments individual adoption with team-level adoption timing (Appendix Table A.2), tests parallel trends via event studies, and uses multiple robust DiD estimators. The causal identification strategy is well-justified for the main productivity claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper explicitly bounds generalization: 'these findings apply for a particular AI tool, used in a single firm, within a single occupation, and should not be generalized across all occupations and AI systems' (Section 7). It also notes inability to observe wages, overall labor demand, or skill composition changes." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Multiple alternative explanations are discussed: mean reversion (Section 4.2.1, Figure A.9), selection into treatment (addressed via agent FE and IV in Appendix Table A.2), selection on adherence vs. causal effect of following recommendations (Section 5.1), Hawthorne effects (implicitly via outage analysis), and confounds in attrition estimates (Section 6.3)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper carefully distinguishes its proxy measures from broader outcomes. RPH is presented as a specific industry metric, not a general productivity measure. The paper explicitly discusses what it cannot observe: 'we do not have access to pay data,' 'our paper is not designed to shed light on the aggregate employment or wage effects,' and discusses equilibrium effects it cannot measure (Section 7)." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper says the tool 'is built on a recent version of the Generative Pre-trained Transformer (GPT) family of large language models developed by OpenAI' (Section 2.2) but does not specify which GPT version (GPT-3, GPT-3.5, etc.) or any version identifier." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "The AI tool is a deployed commercial product that generates real-time suggestions; the authors are studying its workplace impact, not designing or controlling prompts. The prompts/fine-tuning are internal to the AI firm." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters for the AI model (temperature, sampling settings) or fine-tuning process are reported. The econometric specifications are well-documented but the AI system's technical parameters are not." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "The paper evaluates a third-party commercial AI tool as deployed. The authors describe the tool's two main outputs (suggested responses and documentation links, Section 2.3, Appendix Figure A.1) but cannot be expected to describe internal scaffolding they have no access to." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix A.1 describes sample construction in detail: starting with 3,006,395 chats, dropping single-message chats, merging across databases via chat identifiers, winsorizing call duration at 99th percentile, and aggregating to agent-month level. Variable construction is detailed in Appendix A.2." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "The Conclusion (Section 7) contains extensive limitations discussion spanning multiple paragraphs covering single-firm generalization, equilibrium effects, wage data absence, potential ratchet effects on performance targets, and longer-run incentive challenges." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are discussed throughout: manager selection bias in onboarding (addressed via IV, Section 4.1.2), mean reversion concern (Section 4.2.1, Figure A.9), inability to distinguish voluntary from involuntary attrition (Appendix A.3.5), lack of agent fixed effects for attrition (Section 6.3), and potential over-reliance on AI by top performers degrading future model quality." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Explicitly stated: 'our paper is not designed to shed light on the aggregate employment or wage effects' (Section 1), 'these findings apply for a particular AI tool, used in a single firm, within a single occupation' (Section 7), 'we do not have access to pay data' and cannot observe 'longer run equilibrium responses in worker demand or job design' (Section 7)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Raw data is proprietary (Fortune 500 firm's customer service records) and not available for independent verification. Understandable given confidentiality constraints, but still NO." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data collection is described in Appendix A.1: chat conversations from the firm's software systems (September 2019–June 2021), merged with internal company datasets and AI firm records. 
Agent information includes employer, location, manager/team, tenure, and AI onboarding date." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Agent selection for AI treatment is described: managers scheduled onboarding to minimize customer disruption, training sessions were limited by AI firm capacity, contractual license limits applied, and replacement occurred when AI-enabled agents left (Section 3.1). Agent employment details are in Section 2.2 and Appendix A.2.2." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Appendix A.1 documents the pipeline: starting sample (3M chats, 5,172 agents), merging steps using chat identifiers, dropping criteria (single-message chats, missing times/identifiers), winsorization (99th percentile), and aggregation to agent-month level. Topic classification (Appendix A.2.6) and sentiment scoring (A.2.4) pipelines are also documented." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is disclosed in the footnote: 'We thank... the Stanford Digital Economy Lab for funding. The content is solely the responsibility of the authors and does not necessarily represent the official views of Stanford University, MIT, or the NBER.'" 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: Brynjolfsson (Stanford & NBER), Li (MIT & NBER), Raymond (MIT). The paper refers to the AI firm and data firm anonymously but the authors are academic researchers, not employees of either firm." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The Stanford Digital Economy Lab is an academic research center with no apparent financial stake in whether AI tools increase or decrease productivity. 
The funder appears independent of the outcome." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper. While the authors appear to be independent academics, the absence of an explicit declaration is NO per the schema criteria." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This study examines the impact of a deployed AI tool on worker productivity. It does not evaluate a pre-trained model's capability on any benchmark." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Not a benchmark evaluation study. The paper studies workplace outcomes, not model capability on test sets." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not a benchmark evaluation study." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": true, 246 "answer": false, 247 "justification": "No pre-registration is mentioned for either the main observational study or the small RCT pilot. No link to OSF, AsPredicted, AEA registry, or similar." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": true, 251 "answer": false, 252 "justification": "No IRB or ethics board approval is mentioned anywhere in the paper, despite studying 5,172 workers whose chat data and performance records were analyzed." 253 }, 254 "demographics_reported": { 255 "applies": true, 256 "answer": true, 257 "justification": "Demographics are reported: 89% of agents are outside the US, mainly in the Philippines (Section 2.2, Table 1). Agent tenure distribution, employer type (direct vs. subcontractor), and geographic distribution across 25 locations are provided. Detailed breakdown in Table 1." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": true, 261 "answer": true, 262 "justification": "Inclusion criteria are described: agents providing chat-based technical support for US-based small businesses at the data firm and its subcontractors. Exclusions: chats with only one message, missing start/end times, missing identifiers (Appendix A.1). Agent sample construction is documented." 263 }, 264 "randomization_described": { 265 "applies": true, 266 "answer": true, 267 "justification": "The staggered rollout mechanism is described in detail (Section 3.1): limited training session capacity, manager scheduling to minimize disruption, contractual license limits. The small RCT pilot is mentioned (Section 4.1.1, ~50 workers, half randomized to treatment), though limited detail on the randomization mechanism. The IV approach instruments individual adoption with team-level timing (Section 4.1.2)." 268 }, 269 "blinding_described": { 270 "applies": true, 271 "answer": false, 272 "justification": "No blinding is described. Agents knew whether they had AI access (they received a 3-hour onboarding training). Whether managers or customers were blinded is not discussed." 273 }, 274 "attrition_reported": { 275 "applies": true, 276 "answer": true, 277 "justification": "Worker attrition is explicitly analyzed as an outcome (Section 6.3, Figure 11, Table A.11). The paper reports baseline attrition rate of 28.8% (Table A.11 DV Mean) and notes that attrition analysis drops pre-treatment observations for treated agents because they 'must survive to be treated' (Section 6.3, Appendix A.3.5)." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No costs of the AI system are reported. The paper mentions 'generative AI was costly and relatively untested' (Section 3.1) and that the firm had a 'limited budget for its deployment' but provides no specific cost figures." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No computational budget is stated for either the AI system or the econometric analysis." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "Access to AI assistance increases worker productivity by 15% as measured by resolutions per hour.", 296 "evidence": "Table 2 Column 3: coefficient 0.301 (p<0.01) with agent, year-month, and tenure FE, off pre-treatment mean of 1.97 RPH. Confirmed by Sun-Abraham event study (Figure 2) showing immediate and persistent effect.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Less-skilled workers see productivity gains of ~36% while the most skilled workers see no significant gains.", 301 "evidence": "Figure 3A: Q1 (lowest skill) gains 0.5 RPH (~36%), Q5 (highest skill) gains ~0 RPH. Table A.6 Column 1: Q1 coefficient 0.527 (p<0.01), Q5 coefficient 0.015 (not significant). Controlling for tenure FE.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "AI assistance leads to durable worker learning, not just real-time reliance on AI suggestions.", 306 "evidence": "Figure 6: During AI system outages, workers with more prior AI exposure handle chats faster (Panel B shows increasing effect over months of exposure). Workers with high initial adherence show gains during outages (Panel C) while low-adherence workers do not (Panel D).", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "AI assistance improves customer sentiment by half a standard deviation.", 311 "evidence": "Table 4 Column 1: customer sentiment improves by 0.177 points (p<0.01) on a -1 to 1 scale. Authors state this is 'equivalent to half of a standard deviation' (Section 6.1). 
Event study in Figure 10A shows immediate and persistent effect.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Customer requests to speak to a manager decline by ~25% after AI deployment.", 316 "evidence": "Figure 10C and Table 4 Column 3: coefficient -0.00875 (p<0.01) off baseline of 0.0377, approximately 23% decline. Event study shows gradual decline after AI introduction.", 317 "supported": "strong" 318 }, 319 { 320 "claim": "Gains from AI adoption are largest for moderately rare problems, not the most common ones.", 321 "evidence": "Figure 7A: largest duration reduction (5-6 min, 14%) for 75th-90th percentile topic rarity, vs. 4-5 min (10%) for most common topics. Figure 7B shows monotonic relationship when controlling for overall topic frequency: AI helps most for problems an individual agent encounters least often.", 322 "supported": "moderate" 323 }, 324 { 325 "claim": "AI assistance improves English language fluency, especially for Philippines-based agents.", 326 "evidence": "Figure 8: event studies show significant improvement in both comprehensibility and native fluency scores. Panels C-D show larger gains for Filipino agents than US-based agents. Validated against human evaluators (Appendix A.2.5).", 327 "supported": "moderate" 328 } 329 ], 330 "red_flags": [ 331 { 332 "flag": "Company evaluating its own product's impact", 333 "detail": "While the academic authors are independent, the study relies on data from an anonymous 'AI firm' and 'data firm.' The AI firm tracks adherence and outage data, and the data firm selects which workers receive AI access. The relationship between the researchers and the firms is not fully disclosed — the AI firm provided the data and has a financial interest in positive results." 334 }, 335 { 336 "flag": "Non-random treatment assignment in main analysis", 337 "detail": "The main analysis relies on staggered rollout where managers selected who got AI access and when. 
Though the paper addresses this with IV estimation and argues managers prioritized scheduling convenience, positive selection bias (giving AI to more promising workers first) cannot be fully ruled out. The RCT pilot (N=22 treated) is too small for robust inference." 338 }, 339 { 340 "flag": "No IRB or ethics approval mentioned", 341 "detail": "The study analyzes detailed performance data, chat transcripts, and employment records of 5,172 workers without mentioning ethics review. While this may be covered by the firms' data use agreements, the absence of ethics oversight disclosure is notable for a study of workplace surveillance data." 342 }, 343 { 344 "flag": "Proprietary data prevents independent replication", 345 "detail": "All data comes from anonymous firms and cannot be independently verified. No code, data, or reproduction materials are provided. This is a structural limitation but nonetheless means the results cannot be independently verified." 346 } 347 ], 348 "cited_papers": [ 349 { 350 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 351 "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], 352 "year": 2023, 353 "relevance": "RCT studying Copilot's productivity impact on software developers — directly comparable methodology for AI-assisted coding productivity." 354 }, 355 { 356 "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence", 357 "authors": ["Shakked Noy", "Whitney Zhang"], 358 "year": 2023, 359 "relevance": "Online experiment showing ChatGPT access compresses productivity distribution for professional writing tasks, finding similar skill-heterogeneity patterns." 
360 }, 361 { 362 "title": "Navigating the Jagged Technological Frontier: Field Experimental Evidence of the Effects of AI on Knowledge Worker Productivity and Quality", 363 "authors": ["Fabrizio Dell'Acqua", "Edward McFowland III", "Ethan Mollick"], 364 "year": 2023, 365 "relevance": "Field experiment showing GPT-4 improves management consulting task quality within capabilities but hurts performance outside — directly relevant to AI-human complementarity." 366 }, 367 { 368 "title": "Do Users Write More Insecure Code with AI Assistants?", 369 "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"], 370 "year": 2022, 371 "relevance": "Study finding AI coding assistants may produce insecure code — relevant counterpoint to productivity gains narrative." 372 }, 373 { 374 "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models", 375 "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"], 376 "year": 2023, 377 "arxiv_id": "2303.10130", 378 "relevance": "Taxonomy of LLM exposure across occupations — provides framework for understanding which jobs are most affected by AI tools." 379 }, 380 { 381 "title": "When Are Combinations of Humans and AI Useful?", 382 "authors": ["Michelle Vaccaro", "Abdullah Almaatouq", "Thomas Malone"], 383 "year": 2024, 384 "relevance": "Meta-analysis of 100+ studies finding human-AI collaborations often underperform AI or humans alone — important counterpoint to this paper's positive findings." 385 }, 386 { 387 "title": "The Uneven Impact of Generative AI on Entrepreneurial Performance", 388 "authors": ["Nicholas G. Otis", "Rowan Clarke", "Rembrand Koning"], 389 "year": 2023, 390 "relevance": "Study finding limited or negative effects of AI adoption in entrepreneurship — provides contrasting evidence on AI productivity impacts." 
391 }, 392 { 393 "title": "The Adoption of ChatGPT", 394 "authors": ["Anders Humlum", "Emilie Vestergaard"], 395 "year": 2024, 396 "relevance": "Study showing AI adoption depends on training access and regulations — relevant to understanding organizational factors in AI deployment." 397 }, 398 { 399 "title": "AI, Skill, and Productivity: The Case of Taxi Drivers", 400 "authors": ["Kyogo Kanazawa", "Daiji Kawaguchi", "Hitoshi Shigeoka", "Yasutora Watanabe"], 401 "year": 2022, 402 "relevance": "Non-generative AI tool for taxi drivers showing 5% search time reduction with largest gains for low-skill workers — parallel finding in different domain." 403 }, 404 { 405 "title": "AI Assistance in Legal Analysis: An Empirical Study", 406 "authors": ["Jonathan H. Choi", "Daniel Schwarcz"], 407 "year": 2023, 408 "relevance": "Empirical study of AI assistance for law students showing productivity compression, consistent with skill-heterogeneity findings." 409 }, 410 { 411 "title": "Scaling laws for neural language models", 412 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 413 "year": 2020, 414 "relevance": "Foundational work on LLM scaling laws — relevant to understanding the technical foundations of the AI tool studied." 415 }, 416 { 417 "title": "Training language models to follow instructions with human feedback", 418 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 419 "year": 2022, 420 "arxiv_id": "2203.02155", 421 "relevance": "RLHF methodology used in the AI system's fine-tuning process — relevant to understanding how the tool was built." 422 } 423 ] 424 }