scan-v5.json (23911B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "LLM Harms: A Taxonomy and Discussion", 6 "authors": [ 7 "Kevin Chen", 8 "Saleh Afroogh", 9 "Abhejay Murali", 10 "David Atkinson", 11 "Amit Dhurandhar" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2512.05929", 16 "doi": "10.48550/arXiv.2512.05929" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract's three claims—a five-category harm taxonomy, mitigation strategies, and a dynamic auditing proposal—are all substantiated across Sections IV–VII and the conclusion.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper regularly presents causal-sounding claims (e.g., 'RLHF filters amplify polarisation,' 'diffusion models displaced 15% of freelance illustrators') without noting that the underlying cited studies are mostly observational; the narrative synthesis strips away study design caveats.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "While scope is nominally restricted to text-based LLMs ≥7B parameters, many conclusions are stated without this qualifier (e.g., 'LLMs can erode trust in electoral information ecosystems faster than current content-moderation tooling adapts'), overgeneralizing beyond the 200-paper corpus.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "Counter-evidence is occasionally noted (Acemoglu & Restrepo on net job creation; declining overt toxicity in newer models) but systematic alternative explanations for major harm claims are absent; the paper is primarily harm-confirmatory rather than analytically balanced across competing interpretations.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 
"applies": true, 46 "answer": false, 47 "justification": "The paper regularly conflates proxy measures with claimed outcomes—e.g., OpenAI 'task exposure' indexes are treated as evidence of actual labor displacement, and lab hallucination error rates are generalized to clinical deployment risk without discussing the measurement gap.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section; methodological constraints are briefly noted in the conclusion ('quantifying harm frequency was beyond the scope of our qualitative review') but are not systematically catalogued in a named section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats to validity are discussed; the paper does not address potential biases from the saturation-based search stopping rule, lack of inter-rater reliability reporting for the coding scheme, or English-language publication dominance in the corpus.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "Scope is explicitly bounded to 'text-based LLMs ≥7B parameters' with exclusion of vision-language hybrids and small domain-specific models; the 2021–June 2025 time window is justified as capturing the period of accelerated foundation-model deployment.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Funding is explicitly disclosed: 'NSF grants 2125858, 2236305 and UT-Good Systems Grand Challenge.'", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All author affiliations are disclosed on the title page, including University of Texas at Austin (Urban 
Information Lab, McCombs), and IBM Research.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "The funders (NSF and UT-Good Systems Grand Challenge) are public/academic entities with no commercial stake in the taxonomy's conclusions; one co-author is from IBM Research (an AI product developer), but this is disclosed via affiliation.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper explicitly states: 'The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.'", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": false, 101 "justification": "The central organizing concept 'harm' is never formally defined; coding categories (harm type, severity, prevalence) are listed but lack operational definitions, and terms like 'downstream' are used structurally without conceptual grounding.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Three explicit objectives are stated: (1) develop a development-timeline taxonomy of harms, (2) analyze causal linkages between categories, and (3) assess adequacy of existing technical, organizational, and regulatory mitigations.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section II substantively engages with Weidinger et al.'s risk taxonomy, Blodgett et al. 
on social bias, InstructGPT/RLHF literature, HELM, OWASP, and EU AI Act governance frameworks, explicitly situating this paper relative to each.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "survey": { 120 "search_and_selection": { 121 "search_strategy_reproducible": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper uses a 'saturation rule' (stop when two successive result pages yield no new relevant titles) that is explicitly dependent on database ranking at search time; a re-run on a different date would yield a different corpus.", 125 "source": "haiku" 126 }, 127 "inclusion_exclusion_explicit": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper describes coding procedures but does not state explicit inclusion/exclusion criteria (language restriction, peer-review requirement, minimum study size, etc.) that would allow consistent replication of selection decisions.", 131 "source": "haiku" 132 }, 133 "prisma_or_structured_protocol": { 134 "applies": true, 135 "answer": true, 136 "justification": "The paper explicitly states adherence to PRISMA-2020, registered a protocol on OSF with a permanent DOI, and includes a PRISMA flow diagram as Figure 1 documenting the screening funnel.", 137 "source": "haiku" 138 }, 139 "search_terms_provided": { 140 "applies": true, 141 "answer": true, 142 "justification": "Table 2 provides harm cluster nodes and corresponding screening phrases; anchor terms ('large language model*', 'LLM', 'foundation model*') combined with topical harm-axis phrases are described explicitly in the methodology.", 143 "source": "haiku" 144 }, 145 "databases_listed": { 146 "applies": true, 147 "answer": true, 148 "justification": "Nine databases are explicitly named: Google Scholar, ResearchGate, ScienceDirect, JSTOR, IEEE Xplore, Semantic Scholar, ACM DL, SpringerLink, and ORCID's pre-print relay.", 149 "source": "haiku" 150 }, 151 "screening_process_documented": { 152 "applies": 
true, 153 "answer": true, 154 "justification": "The paper reports 1,986 records plus 24 manual seed papers narrowed to a final 200-paper corpus, with a PRISMA flow diagram (Figure 1) documenting counts at each stage.", 155 "source": "haiku" 156 }, 157 "review_scope_justified": { 158 "applies": true, 159 "answer": true, 160 "justification": "The ≥7B parameter threshold is linked to LLaMA/Llama-2/3 families; the 2021–June 2025 window is justified as capturing accelerated foundation-model deployment; exclusion of vision-language hybrids and small models is stated with rationale.", 161 "source": "haiku" 162 } 163 }, 164 "synthesis_quality": { 165 "conflicting_findings_acknowledged": { 166 "applies": true, 167 "answer": true, 168 "justification": "Conflicting evidence is noted in multiple places: Acemoglu & Restrepo's net job-creation scenario, declining overt toxicity in GPT-4-class models alongside persistent subtle bias (Section 6.2), and the utility-safety trade-off in content filters (BLEU/ROUGE degradation).", 169 "source": "haiku" 170 }, 171 "quality_assessment_of_sources": { 172 "applies": true, 173 "answer": false, 174 "justification": "Despite claiming PRISMA compliance, the paper applies no formal risk-of-bias instrument or quality rubric to the 200 reviewed papers; arXiv preprints, peer-reviewed articles, and web reports are cited interchangeably without quality weighting.", 175 "source": "haiku" 176 }, 177 "publication_bias_discussed": { 178 "applies": true, 179 "answer": false, 180 "justification": "Publication bias is never discussed; the paper acknowledges focusing on 'harm-related studies' (a selection that itself favors harm-confirming results and can inflate the apparent base rate of documented harms) but does not address the broader tendency for published AI papers to skew toward positive capability claims.", 181 "source": "haiku" 182 }, 183 "quantitative_synthesis_present": { 184 "applies": true, 185 "answer": false, 186 "justification": "The synthesis is primarily 
narrative; Figure 2 shows aggregate publication category distribution by harm cluster but provides no meta-analytic effect sizes, vote counts, or quantitative aggregation of harm prevalence or severity across papers.", 187 "source": "haiku" 188 }, 189 "recommendations_supported_by_evidence": { 190 "applies": true, 191 "answer": true, 192 "justification": "Core recommendations (layered technical defenses, compute governance, cross-disciplinary collaboration) are each tied to specific cited evidence—e.g., 40% jailbreak reduction from red-teaming, EU AI Act binding transparency obligations, and Seoul AI Safety Commitments securing voluntary pledges from 16 labs.", 193 "source": "haiku" 194 } 195 } 196 } 197 }, 198 "claims": [ 199 { 200 "claim": "GPT-style models leak ~3.6 bits per parameter despite privacy budget tuning, creating persistent privacy exposure from training corpora.", 201 "evidence": "Attributed to Wang & Li 2025 [31] on LoRA fine-tuning memorization—a very recent preprint whose finding is presented as established fact.", 202 "supported": "weak" 203 }, 204 { 205 "claim": "Fine-tuning a 70B model emits ~500 metric tons of CO₂-equivalent, though per-document carbon cost can beat human writing on low-carbon grids.", 206 "evidence": "Cited from Ren et al. 2024 Nature Scientific Reports [48/49]; same study is cited twice with slightly different framing.", 207 "supported": "moderate" 208 }, 209 { 210 "claim": "Annotation task prices fell 17% between 2022–2024, signalling race-to-the-bottom dynamics in global annotation markets.", 211 "evidence": "Attributed to [62] (Holden & Harsh 2024), a political geography paper on AI infrastructure in Africa—the specific wage statistic appears mismatched to this source.", 212 "supported": "weak" 213 }, 214 { 215 "claim": "GPT-4 misclassifies drug-disease interactions in 14% of prompts, well above the 1% JAMA threshold for decision-support software.", 216 "evidence": "Attributed to Wu et al. 
[170] using 500 synthetic cases; the 1% threshold sourced to JAMA is not directly cited.", 217 "supported": "moderate" 218 }, 219 { 220 "claim": "Constitutional fine-tuning on Llama-3-8B cuts jailbreak success by 41% while preserving utility.", 221 "evidence": "Attributed to an Anthropic method replication [204]; the cited paper (Cai et al. 2024 on suffix gradient compression) is not the Anthropic Constitutional AI paper.", 222 "supported": "weak" 223 }, 224 { 225 "claim": "LLM-driven bot-nets generated up to 30% of political tweets during peak periods in 2024 national campaigns.", 226 "evidence": "Attributed to an IEEE case study [149]; the cited paper (Kuo et al. 2025 on AI-powered debiasing of news) does not appear to contain this election bot-net finding.", 227 "supported": "unsupported" 228 }, 229 { 230 "claim": "Text-to-image diffusion models displaced 15% of freelance illustrators on one global platform within a year of launch.", 231 "evidence": "Attributed to a Nature Communications paper [144]; cited reference (Hou et al. on limits to growth in AI-driven economy) does not appear to match this specific finding.", 232 "supported": "weak" 233 }, 234 { 235 "claim": "ChatGPT-ghost-written assignments were detected by 62% of faculty across nine universities within one semester of launch.", 236 "evidence": "Attributed to Lim et al. [115]; cited reference (Yusuf et al. 
2024 on generative AI and academic integrity) appears to be the actual source but via a different author chain.", 237 "supported": "moderate" 238 } 239 ], 240 "methodology_tags": [ 241 "qualitative", 242 "theoretical" 243 ], 244 "key_findings": "This paper presents a PRISMA-guided systematic review of 200 papers (screened from 1,986 records, 2021–June 2025) developing a lifecycle-aware taxonomy of LLM harms across five categories: pre-deployment (training data, environmental, labor), direct output (representational, content, quality), misuse/malicious (harmful content, deception, security attacks), societal/systemic (economic disruption, democratic harms, power inequities), and downstream application (high-stakes decisions, education, creative work). Key synthesis findings are that harm clusters layer rather than replace one another (privacy leaks in training enable downstream misuse), mitigation efficacy is uneven across harm types (red-teaming plus constitutional tuning reduces jailbreaks ~40% but multilingual filters miss ~7% of non-English slurs), and governance remains fragmented between binding EU obligations and voluntary US frameworks. The paper identifies multi-agent LLM ecosystems as an emerging harm category introducing recursive failures not covered by single-model taxonomies, and advocates for compute governance regimes (FLOP-cap-and-trade) as a preventive governance tool.", 245 "red_flags": [ 246 { 247 "flag": "Saturation rule makes search non-reproducible", 248 "detail": "The paper stops searching 'when two successive pages yield no new relevant titles'—an approach that is explicitly non-reproducible because results depend on database ranking algorithms at search time, violating PRISMA's reproducibility requirements." 
249 }, 250 { 251 "flag": "No quality assessment of sources", 252 "detail": "Despite claiming PRISMA compliance, no risk-of-bias or quality rubric is applied to the 200-paper corpus; arXiv preprints, peer-reviewed journals, and web reports are cited interchangeably and treated as equivalent evidence." 253 }, 254 { 255 "flag": "Citation-reference mismatches", 256 "detail": "Multiple citation numbers appear mismatched to their stated content: [149] (cited for election bot-nets generating 30% of tweets) resolves to a news-debiasing paper; [92] is used for both OWASP LLM risks and climate misinformation unlearning; [57] is used for both carbon scheduling and longitudinal toxicity evidence." 257 }, 258 { 259 "flag": "Expert interviews undocumented", 260 "detail": "The methodology claims 'ten expert interviews with safety engineers and policymakers' but no interview data, quotes, themes, or analysis appear anywhere in the paper—their contribution to the taxonomy is invisible." 261 }, 262 { 263 "flag": "Heavy self-citation of concurrent preprints", 264 "detail": "At least six cited references ([86], [87], [138], [200], [257], [258]) are concurrent papers by the same author group (Jiao, Afroogh, Chen et al.), several of which are arXiv preprints not yet peer-reviewed, forming a self-reinforcing citation cluster." 265 }, 266 { 267 "flag": "AI tools used for content generation", 268 "detail": "Acknowledgements disclose use of 'AI-powered tools, such as OpenAI's applications, for assistance in editing and brainstorming'—for a survey paper about LLM harms, this raises questions about content integrity and the provenance of synthesized claims." 269 }, 270 { 271 "flag": "'Harm' undefined as core construct", 272 "detail": "The paper's central organizing concept is never formally defined; the operational threshold between a documented 'harm' and a general limitation or side-effect is left implicit, undermining replicability of the 200-paper classification." 
273 } 274 ], 275 "cited_papers": [ 276 { 277 "title": "Ethical and social risks of harm from Language Models (Weidinger et al. 2021)", 278 "relevance": "Foundational taxonomy of LLM risks that this paper explicitly builds on and extends across five harm categories." 279 }, 280 { 281 "title": "Taxonomy of Risks posed by Language Models (Weidinger et al. 2022, ACM FAccT)", 282 "relevance": "Directly cited as providing the conceptual framework for harm categories including discrimination, information hazards, and socioeconomic impacts." 283 }, 284 { 285 "title": "Bias and Fairness in Large Language Models: A Survey (Gallegos et al.)", 286 "relevance": "Core survey on demographic bias used to support representational harm findings showing stereotype benchmarks still surface bias in GPT-4-class models." 287 }, 288 { 289 "title": "A Survey on Hallucination in Large Language Models (Huang et al. 2024)", 290 "relevance": "Hallucination taxonomy reporting error rates up to 23% in open-ended QA, central evidence for quality/reliability harms." 291 }, 292 { 293 "title": "PRISMA 2020 statement (Page et al.)", 294 "relevance": "Methodological basis for the systematic review protocol; paper claims PRISMA-2020 compliance and registered an OSF protocol." 295 }, 296 { 297 "title": "NIST AI Risk Management Framework (AI RMF 1.0)", 298 "relevance": "Key US governance framework discussed as a complement to EU AI Act in Section 5.2." 299 }, 300 { 301 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models (Zou et al. 2023)", 302 "relevance": "Adversarial suffix jailbreaks bypassing safety filters with >80% success, cited as core evidence for misuse/security attack harms." 303 }, 304 { 305 "title": "Making AI Less Thirsty: Water Footprint of AI Models (Li et al.)", 306 "relevance": "Environmental harm evidence—inference-time water consumption exceeding 500ml per 1k tokens on evaporative-cooled data centers." 
307 }, 308 { 309 "title": "Reconciling contrasting narratives on environmental impact of LLMs (Ren et al. 2024, Sci Rep)", 310 "relevance": "Provides the 70B model fine-tuning CO₂ estimate (~500 tonnes) and per-document carbon comparison with human writing." 311 }, 312 { 313 "title": "A Collaborative, Human-Centred Taxonomy of AI, Algorithmic, and Automation Harms (Abercrombie et al. 2024)", 314 "relevance": "Parallel taxonomy work providing complementary harm classification for situating this paper's lifecycle-based approach." 315 } 316 ], 317 "engagement_factors": { 318 "practical_relevance": { 319 "score": 2, 320 "justification": "Provides a structured five-category taxonomy with specific mitigation strategies and governance recommendations that safety engineers, policymakers, and product teams can directly reference." 321 }, 322 "surprise_contrarian": { 323 "score": 1, 324 "justification": "The taxonomy covers largely familiar ground; the compute governance angle and multi-agent harm category are forward-looking but not strongly contrarian relative to existing literature." 325 }, 326 "fear_safety": { 327 "score": 3, 328 "justification": "Directly catalogues existential and societal AI risks including election manipulation with 30% bot-net tweet share, medical hallucination at 14% error rates, and recursive multi-agent failure modes—high fear/safety salience throughout." 329 }, 330 "drama_conflict": { 331 "score": 2, 332 "justification": "Covers contentious topics (displacement of 15% of freelance illustrators on one platform, data colonialism, annotation worker exploitation at $2.10/hr median) with specific statistics that generate conflict-driven interest." 333 }, 334 "demo_ability": { 335 "score": 0, 336 "justification": "Pure literature review with no tool, dataset, benchmark, or interactive artifact; readers cannot try anything from this paper." 
337 }, 338 "brand_recognition": { 339 "score": 1, 340 "justification": "Authors are from UT Austin and IBM Research—recognizable but not the top AI lab tier (OpenAI, DeepMind, Anthropic) that drives HN traction." 341 } 342 }, 343 "hn_data": { 344 "threads": [ 345 { 346 "hn_id": "38737262", 347 "title": "Direct initialization of transformers using larger pretrained ones", 348 "points": 48, 349 "comments": 14, 350 "url": "https://news.ycombinator.com/item?id=38737262", 351 "created_at": "2023-12-22T18:54:56Z" 352 }, 353 { 354 "hn_id": "31318574", 355 "title": "Flares from black hole binaries: black hole shadows via light-curve tomography", 356 "points": 43, 357 "comments": 1, 358 "url": "https://news.ycombinator.com/item?id=31318574", 359 "created_at": "2022-05-09T19:24:38Z" 360 }, 361 { 362 "hn_id": "43164753", 363 "title": "Switch-Based Antagonist Actuation with a Single Motor for a Soft Exosuit", 364 "points": 2, 365 "comments": 0, 366 "url": "https://news.ycombinator.com/item?id=43164753", 367 "created_at": "2025-02-24T20:46:00Z" 368 }, 369 { 370 "hn_id": "42418821", 371 "title": "Specifications: The missing link to make development of LLM an eng discipline", 372 "points": 2, 373 "comments": 0, 374 "url": "https://news.ycombinator.com/item?id=42418821", 375 "created_at": "2024-12-14T19:07:39Z" 376 }, 377 { 378 "hn_id": "33976318", 379 "title": "Measuring Data", 380 "points": 2, 381 "comments": 0, 382 "url": "https://news.ycombinator.com/item?id=33976318", 383 "created_at": "2022-12-13T21:43:20Z" 384 }, 385 { 386 "hn_id": "42443177", 387 "title": "Memristor-Based Selective Convolutional Circuit for Salt-N-Pepper Noise Removal", 388 "points": 1, 389 "comments": 0, 390 "url": "https://news.ycombinator.com/item?id=42443177", 391 "created_at": "2024-12-17T17:20:24Z" 392 }, 393 { 394 "hn_id": "33980774", 395 "title": "Graph algorithms for predicting subcellular localization at the pathway level", 396 "points": 1, 397 "comments": 0, 398 "url": 
"https://news.ycombinator.com/item?id=33980774", 399 "created_at": "2022-12-14T06:51:58Z" 400 } 401 ], 402 "top_points": 48, 403 "total_points": 99, 404 "total_comments": 15 405 } 406 }