scan-v5.json (32479B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Generative AI at Work", 6 "authors": [ 7 "Erik Brynjolfsson", 8 "Danielle Li", 9 "Lindsey Raymond" 10 ], 11 "year": 2023, 12 "venue": "Social Science Research Network", 13 "arxiv_id": "2304.11771", 14 "doi": "10.3386/w31161" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All major abstract claims — 15% productivity gain, heterogeneity by skill/tenure, worker learning, fluency improvement, gains on rare problems, and improved customer experience — are supported by Tables 2-4 and Figures 2-11.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Staggered DiD with individual-level quasi-random rollout variation is validated by Sun-Abraham IW estimator, IV robustness (using team-level adoption date), multiple alternative estimators (Callaway-Sant'Anna, Borusyak et al., de Chaisemartin-D'Haultfœuille), and pre-trend tests — all support causal interpretation.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "The conclusion explicitly states: 'our findings apply for a particular AI tool, used in a single firm, within a single occupation, and should not be generalized across all occupations and AI systems' and specifies they capture 'medium run impacts.'", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "Authors formally address mean reversion (Appendix Figure A.9 rules it out empirically), manager selection bias in onboarding timing (IV approach, Section 4.1.2), selection into adherence (Section 5.1), and non-randomness of outage periods (Section 5.2).", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "RPH (resolutions per hour) is clearly defined and operationalized; the paper uses multiple complementary metrics (AHT, CPH, resolution rate, NPS) to cross-validate, and the conclusion explicitly notes inability to observe wages or aggregate employment effects.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "There is no dedicated limitations section; limitations are embedded across multiple paragraphs within Section 7 (Conclusion) rather than a standalone section.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Specific threats discussed include: manager selection bias in onboarding timing, mean reversion in skill heterogeneity findings, non-random nature of outage events, inability to observe wages, ratchet effect on performance targets, and single-firm generalizability limits.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "Explicit boundaries stated: 'our paper is not designed to shed light on the aggregate employment or wage effects' and 'our data do not allow us to observe changes in wages, overall labor demand, or the skill composition of workers hired at the firm.'", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": true, 73 "justification": "Funding is disclosed in the acknowledgments: 'We thank...the Stanford Digital Economy Lab for funding.'", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations are stated on the title page: Brynjolfsson at Stanford & NBER, Li and Raymond at MIT & NBER / MIT.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": true, 85 "justification": "The Stanford Digital Economy Lab is an academic research center independent of the AI firm or data firm; the disclaimer states the content does not represent Stanford or MIT views.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No formal competing interests statement is included; the paper does not disclose whether authors have equity, patents, or consulting relationships with OpenAI, the AI firm, or the data firm.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Footnote 1 explicitly defines 'artificial intelligence,' 'machine learning,' 'large language models,' and 'generative AI'; productivity is operationally defined as resolutions per hour (RPH).", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper states: 'This is, to our knowledge, the first study of the impact of generative AI deployed at scale in the workplace,' clearly positioning its contribution as first micro-level real-world evidence.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper engages substantively with prior work on IT skill complementarity (Bartel et al., Acemoglu et al.), lab-based AI productivity studies (Peng et al., Noy and Zhang, Dell'Acqua et al.), and human-AI collaboration failures (Vaccaro et al., Hoffman et al.), explaining how this work differs from each.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "No analysis code is released; the paper uses proprietary firm data and does not link to any public code repository.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": false, 128 "justification": "Data is proprietary to the Fortune 500 firm and its AI vendor; it is not publicly available and the paper does not indicate plans to release it.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No software environment, package versions, or reproducibility specs are provided; this is an economics study using proprietary data.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions are included; the underlying data is proprietary and inaccessible to outside researchers.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": true, 148 "justification": "All event study figures include 95% confidence intervals; all regression tables report clustered standard errors with significance stars (*** p<0.01, ** p<0.05, * p<0.10).", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": true, 154 "justification": "OLS with clustered standard errors is used throughout; significance tests are reported for all main comparisons across Tables 2-4 and appendix tables.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Effect sizes are reported as both absolute (0.30 chats/hr) and percentage changes from baseline means (15.2%), clearly contextualized against pre-treatment means shown in each table.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "Sample size (5,172 agents, 3M chats) is described but not formally justified through power analysis; it is determined by the firm's total workforce during the study period rather than by design.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": true, 172 "justification": "Table 1 reports standard deviations for key variables (e.g., 'St. Average Handle Time: 23'), and event study figures show confidence bands capturing variance across cohorts.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Never-treated agents serve as controls; pre-treatment outcomes for treated agents provide within-agent baselines; agent fixed effects control for time-invariant differences.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Control observations are contemporaneous, drawn from the same firm during the same period (Sep 2019-Jun 2021), with year-month fixed effects controlling for common shocks.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": false, 191 "answer": false, 192 "justification": "Not applicable to this observational economics study of a deployed system; there are no system components to ablate.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Seven distinct outcome measures are used: RPH, AHT, CPH, resolution rate, NPS, customer sentiment, and manager escalation rate, plus secondary measures of attrition and language fluency.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": true, 204 "justification": "Human evaluators validated LLM-generated language fluency scores (100 conversations for comprehensibility and 100 for native fluency, Appendix A.2.5) and topic classifications (3 independent evaluators on 100 conversations, Appendix A.2.6).", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": false, 209 "answer": false, 210 "justification": "Not a prediction task; this is a causal inference study using DiD, not a machine learning evaluation requiring train/test splits.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Extensive breakdowns provided: by skill quintile (Figure 3A), tenure group (Figure 3B), adherence quintile (Figure 5B), country (Figure 8C-D), topic frequency (Figure 7), and adoption cohort (Figure A.10).", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "The paper discusses quality declines for highest-skilled workers (Figure A.6, panels C-D), no improvement during outages for low-adherence agents (Figure 6D), and the risk of AI-induced over-reliance reducing training data quality.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Negative results are reported: small but significant declines in resolution rate and NPS for highest-skill/tenure agents, no significant average NPS improvement (Table 3 Column 4), and no outage-period improvement for low-adherence workers.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "The AI tool 'builds on a recent version of GPT' — no specific model version is stated despite the 2020-2021 deployment window when GPT-3 variants were available; likely proprietary vendor restriction.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Full prompts are provided for both the Gemini-based native fluency scoring (Appendix A.2.5) and comprehensibility scoring, and for the topic classification task (Appendix A.2.6).", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "No hyperparameters (temperature, top-p, etc.) are reported for either the core AI tool or the Gemini Pro-based analysis components.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": false, 253 "answer": false, 254 "justification": "The paper evaluates a deployed commercial black-box tool; internal agentic scaffolding is proprietary and not accessible to the researchers.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Appendix A.1 details sample construction: dropping chats with one message, dropping missing timestamps, winsorizing duration at 99th percentile, and the merge procedure across database systems.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "Data is proprietary to the Fortune 500 firm and its AI vendor; it cannot be shared for independent verification.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Data collection is described: firm's software systems tracked chat start/end times, resolution outcomes, and customer surveys; agent information came from internal company datasets; Appendix A.1 provides full detail.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": true, 279 "answer": true, 280 "justification": "Agent recruitment is through employment at the firm/subcontractors; Section 3.1 describes how agents were onboarded into the AI system (training sessions, manager allocation decisions, budget constraints).", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "Appendix A.1 and A.2 document the full pipeline: chat-level extraction, merging across database systems, aggregation to agent-month level, and construction of all key variables including sentiment, language scores, and topic categories.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "This is not a benchmark evaluation study; the primary analysis concerns worker productivity outcomes, not model capabilities on held-out test sets.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": false, 299 "answer": false, 300 "justification": "Not applicable; the study evaluates real-world productivity outcomes rather than model performance on benchmarks with potential training overlap.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": false, 305 "answer": false, 306 "justification": "Not applicable; no benchmark evaluation is performed in this study.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": true, 313 "answer": false, 314 "justification": "No pre-registration is mentioned; the study is a retrospective analysis of a naturally occurring deployment, and economics quasi-experiments are typically not pre-registered.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": true, 319 "answer": false, 320 "justification": "No IRB or ethics approval is mentioned anywhere in the paper, which is common for economics studies using firm administrative data but is not disclosed.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": true, 325 "answer": true, 326 "justification": "Key demographics are reported: 89% of agents outside the US (mainly Philippines), 11% US-based, 25 distinct locations; Table 1 reports share of US agents and team/location counts across treatment groups.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": true, 331 "answer": true, 332 "justification": "Appendix A.1 explicitly states inclusion/exclusion: dropping chats with only one message, dropping chats missing start/end times or agent identifiers, winsorizing outliers, including only active agent-months.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": true, 337 "answer": false, 338 "justification": "The main analysis is not randomized; for the pilot RCT, only 'approximately 50 workers, about half of whom were randomized into treatment' is stated — no randomization procedure, stratification, or allocation mechanism is described.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": true, 343 "answer": false, 344 "justification": "No blinding is mentioned; agents knew they had AI access, and blinding is not feasible in a setting where workers actively use the AI tool in their workflow.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": true, 349 "answer": true, 350 "justification": "Section 6.3 and Table A.11 explicitly analyze attrition, reporting a ~40% decrease in attrition likelihood for newer agents relative to a 25% baseline monthly attrition rate.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": false, 357 "answer": false, 358 "justification": "This is a labor economics study of a deployed commercial system; inference latency and cost of the AI tool are not the focus and are not reported.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": false, 363 "answer": false, 364 "justification": "No compute budget is stated; this economics paper does not involve training models, and the commercial AI tool's compute costs are proprietary.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "AI assistance increases worker productivity (resolutions per hour) by 15% on average", 373 "evidence": "Table 2, Column 3: coefficient 0.301 on 'Post AI X Ever Treated' with baseline mean of 2.176, representing 15.2% increase; confirmed across multiple DiD estimators in Table A.9", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Less-skilled workers see the largest gains, with the lowest skill quintile improving resolutions per hour by ~36%", 378 "evidence": "Table A.6: Q1 (lowest skill) coefficient 0.527 vs Q5 (highest) coefficient 0.015; Figure 3A shows monotonically declining effects from Q1 to Q5", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Highest-skilled and most-experienced workers experience small but significant declines in resolution rate and customer satisfaction", 383 "evidence": "Figure A.6 Panels C-D and Figure A.7 Panels C-D show statistically significant negative effects on resolution rate and NPS for Q5 skill and >12 months tenure groups", 384 "supported": "strong" 385 }, 386 { 387 "claim": "AI assistance facilitates durable worker learning, with productivity gains persisting during AI system outages", 388 "evidence": "Figure 6 Panel B: workers exposed longer to AI maintain productivity improvements even during outages; Panel C shows this effect is concentrated among high-adherence workers; Panel D shows no improvement for low-adherence workers", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "AI assistance improves English fluency, with larger gains for Filipino agents than US agents", 393 "evidence": "Figure 8 and language fluency table: native fluency coefficient 0.251 for Philippines-based vs 0.159 for US-based agents; LLM scores validated against human evaluators (mean difference not significant, p=0.22)", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "AI gains are largest for moderately rare problems where workers have limited experience but AI has sufficient training data", 398 "evidence": "Figure 7A: largest treatment effect (-5 to -6 minutes duration, ~14%) for topics in 75th-90th percentile of rarity, not for most common (Payroll/Account, ~10%) or rarest topics (~11%)", 399 "supported": "strong" 400 }, 401 { 402 "claim": "AI assistance improves customer sentiment by 0.18 points (half a standard deviation)", 403 "evidence": "Table 4, Column 1: coefficient 0.177 on 'Mean(Customer Sentiment)' with baseline mean 0.141; stated as 'equivalent to half of a standard deviation'; Figure 10A confirms immediate persistent effect", 404 "supported": "strong" 405 }, 406 { 407 "claim": "Customer requests to speak to a manager decline by approximately 25% following AI deployment", 408 "evidence": "Table 4, Column 3: coefficient -0.00875 on 'Share Req. Manager' with baseline mean of 0.0377, representing a 23% decline; Figure 10C shows gradual decline in event study", 409 "supported": "strong" 410 } 411 ], 412 "methodology_tags": [ 413 "observational", 414 "rct" 415 ], 416 "key_findings": "This is the first large-scale real-world study of generative AI in the workplace, using 5,172 customer support agents across a staggered natural experiment. AI assistance increases productivity by 15% on average, but effects are strongly heterogeneous: least-skilled workers improve by ~36% while most-skilled workers see near-zero productivity gains and small but significant quality declines. The paper provides compelling evidence that AI facilitates durable worker learning (via AI outage analysis), improves English fluency particularly for Filipino agents, markedly reduces hostile customer interactions, and cuts manager escalation requests by ~25% — improving both productivity and the experience of work. The reversal of traditional skill-biased technical change patterns is the paper's most striking finding.", 417 "red_flags": [ 418 { 419 "flag": "Proprietary data, no reproducibility", 420 "detail": "All data is proprietary to the Fortune 500 firm and AI vendor; no code, data, or reproduction instructions are available, making independent verification impossible." 421 }, 422 { 423 "flag": "GPT version unspecified", 424 "detail": "The paper only states the tool 'builds on a recent version of GPT'; the specific model version used during the 2020-2021 deployment is not disclosed, likely due to vendor restrictions." 425 }, 426 { 427 "flag": "Attrition analysis lacks agent fixed effects", 428 "detail": "Section 6.3 explicitly notes that agent fixed effects cannot be included in attrition regressions because attrition is a one-time event, creating potential upward bias if firms preferentially grant AI access to retention-prone workers." 429 }, 430 { 431 "flag": "Outage periods may not be random", 432 "detail": "The learning analysis using AI outages acknowledges that outages 'are rare and not necessarily random,' potentially confounding the durable learning effect interpretation since outage-period chats may systematically differ." 433 }, 434 { 435 "flag": "Single firm, single occupation", 436 "detail": "All findings are from one Fortune 500 software firm's customer support function with a stable product and uniform task type; generalizability to other industries, occupations, or AI tools is explicitly limited." 437 }, 438 { 439 "flag": "No competing interests declaration", 440 "detail": "Authors do not formally declare absence of financial interests with OpenAI, the AI firm, or the data firm — particularly relevant given potential consulting relationships of prominent economists with industry." 441 } 442 ], 443 "cited_papers": [ 444 { 445 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot (Peng et al. 2023)", 446 "relevance": "Closest comparable study of generative AI on software developer productivity; also finds large productivity gains (2x speed) in a controlled setting" 447 }, 448 { 449 "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence (Noy and Zhang 2023)", 450 "relevance": "Online RCT showing ChatGPT improves professional writing task completion speed; consistent skill-compression finding with this paper" 451 }, 452 { 453 "title": "Navigating the Jagged Technological Frontier: Field Experimental Evidence of the Effects of AI on Knowledge Worker Productivity and Quality (Dell'Acqua et al. 2023)", 454 "relevance": "Field experiment on GPT-4 in management consulting; finds AI helps within-frontier tasks but hurts outside-frontier — parallel to the top-worker quality decline finding here" 455 }, 456 { 457 "title": "AI, Skill, and Productivity: The Case of Taxi Drivers (Kanazawa et al. 2022)", 458 "relevance": "Closest prior study of non-generative AI tool on worker productivity; finds 5% search time reduction with largest gains for low-skill drivers — parallel heterogeneity pattern" 459 }, 460 { 461 "title": "When Are Combinations of Humans and AI Useful? (Vaccaro et al. 2024)", 462 "relevance": "Meta-analysis of 100+ experiments showing human-AI collaboration often underperforms AI alone — important counterpoint and context for the positive productivity findings" 463 }, 464 { 465 "title": "Estimating dynamic treatment effects in event studies with heterogeneous treatment effects (Sun and Abraham 2021)", 466 "relevance": "Key methodological reference for the interaction-weighted DiD estimator used as the main identification strategy throughout the paper" 467 }, 468 { 469 "title": "How Does Information Technology Affect Productivity? Plant-Level Comparisons (Bartel, Ichniowski, Shaw 2007)", 470 "relevance": "Key reference for prior IT-productivity literature showing skill-complementarity; this paper's findings directly contrast with the traditional skill-biased technical change narrative" 471 }, 472 { 473 "title": "Difference-in-Differences with multiple time periods (Callaway and Sant'Anna 2021)", 474 "relevance": "Alternative robust DiD estimator used for robustness checks; part of the staggered DiD literature that motivates the identification strategy" 475 }, 476 { 477 "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models (Eloundou et al. 2023)", 478 "relevance": "Complementary analysis of LLM labor market exposure; contextualizes which occupations face AI augmentation vs. automation and the non-routine task exposure" 479 }, 480 { 481 "title": "AI Assistance in Legal Analysis: An Empirical Study (Choi and Schwarcz 2023)", 482 "relevance": "Comparable generative AI productivity study in law school exam context; consistent skill-compression effect where lower-performing students benefit most" 483 } 484 ], 485 "engagement_factors": { 486 "practical_relevance": { 487 "score": 3, 488 "justification": "Directly applicable to businesses deploying AI customer service tools; provides actionable insight that AI helps newest workers most and that training/adherence matters for outcomes." 489 }, 490 "surprise_contrarian": { 491 "score": 3, 492 "justification": "Finding that AI benefits least-skilled workers most directly reverses decades of evidence for skill-biased technical change; the quality decline for top workers is also counterintuitive." 493 }, 494 "fear_safety": { 495 "score": 1, 496 "justification": "Raises concerns about potential deskilling of top workers, ratchet effects on performance targets, and reduced training data quality, but does not focus on AI safety risks." 497 }, 498 "drama_conflict": { 499 "score": 1, 500 "justification": "Some tension around whether productivity gains translate to job cuts vs. service improvements, but the paper is measured in tone with no significant controversy." 501 }, 502 "demo_ability": { 503 "score": 2, 504 "justification": "Similar AI customer service tools are commercially available and can be experienced, though the specific proprietary tool studied cannot be accessed." 505 }, 506 "brand_recognition": { 507 "score": 3, 508 "justification": "Brynjolfsson (Stanford/NBER, co-author of 'The Second Machine Age') is among the most prominent economists studying technology; MIT/Stanford/NBER affiliations add significant visibility and credibility." 509 } 510 }, 511 "hn_data": { 512 "threads": [ 513 { 514 "hn_id": "43577957", 515 "title": "A Study of Undefined Behavior Across Foreign Function Boundaries in Rust Libs", 516 "points": 4, 517 "comments": 1, 518 "url": "https://news.ycombinator.com/item?id=43577957", 519 "created_at": "2025-04-04T03:10:05Z" 520 }, 521 { 522 "hn_id": "36513194", 523 "title": "On the Planning Abilities of Large Language Models – A Critical Investigation", 524 "points": 3, 525 "comments": 0, 526 "url": "https://news.ycombinator.com/item?id=36513194", 527 "created_at": "2023-06-28T21:57:00Z" 528 }, 529 { 530 "hn_id": "31457199", 531 "title": "Masked image modeling advances 3D medical image analysis", 532 "points": 2, 533 "comments": 0, 534 "url": "https://news.ycombinator.com/item?id=31457199", 535 "created_at": "2022-05-21T12:15:54Z" 536 }, 537 { 538 "hn_id": "36053359", 539 "title": "Why Is the Winner the Best?", 540 "points": 1, 541 "comments": 1, 542 "url": "https://news.ycombinator.com/item?id=36053359", 543 "created_at": "2023-05-24T02:27:19Z" 544 }, 545 { 546 "hn_id": "44041341", 547 "title": "Grounded in Context: Retrieval-Based Method for Hallucination Detection", 548 "points": 1, 549 "comments": 0, 550 "url": "https://news.ycombinator.com/item?id=44041341", 551 "created_at": "2025-05-20T13:23:42Z" 552 }, 553 { 554 "hn_id": "39736862", 555 "title": "The Planning Abilities of LLMs: A Critical Investigation (2023)", 556 "points": 1, 557 "comments": 0, 558 "url": "https://news.ycombinator.com/item?id=39736862", 559 "created_at": "2024-03-17T18:45:17Z" 560 }, 561 { 562 "hn_id": "38586493", 563 "title": "On the Planning Abilities of Large Language Models: A Critical Investigation", 564 "points": 1, 565 "comments": 0, 566 "url": "https://news.ycombinator.com/item?id=38586493", 567 "created_at": "2023-12-09T21:57:24Z" 568 }, 569 { 570 "hn_id": "37627101", 571 "title": "How Robust Is Google's Bard to Adversarial Image Attacks?", 572 "points": 1, 573 "comments": 0, 574 "url": "https://news.ycombinator.com/item?id=37627101", 575 "created_at": "2023-09-23T20:20:29Z" 576 }, 577 { 578 "hn_id": "37158226", 579 "title": "What Types of Questions Require Conversation to Answer? AskReddit Study", 580 "points": 1, 581 "comments": 0, 582 "url": "https://news.ycombinator.com/item?id=37158226", 583 "created_at": "2023-08-17T07:07:27Z" 584 }, 585 { 586 "hn_id": "36105713", 587 "title": "On the Planning Abilities of Large Language Models – A Critical Investigation", 588 "points": 1, 589 "comments": 0, 590 "url": "https://news.ycombinator.com/item?id=36105713", 591 "created_at": "2023-05-28T16:51:29Z" 592 } 593 ], 594 "top_points": 4, 595 "total_points": 16, 596 "total_comments": 2 597 } 598 }