scan-v5.json (25686B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Does Reasoning Introduce Bias? A Study of Social Bias Evaluation and Mitigation in LLM Reasoning", 6 "authors": [ 7 "Xuyang Wu", 8 "Jinming Nian", 9 "Ting-Ruen Wei", 10 "Zhiqiang Tao", 11 "Hsin-Tai Wu" 12 ], 13 "year": 2025, 14 "venue": "Conference on Empirical Methods in Natural Language Processing", 15 "arxiv_id": "2502.15361", 16 "doi": "10.18653/v1/2025.findings-emnlp.1006" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All four major abstract claims — first systematic evaluation of reasoning bias, accuracy-without-bias-mitigation finding, bias amplification in ambiguous contexts, and ADBP outperforming SfRP — are supported by Tables 1–2 and Figures 3–6.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The SfRP ablation (remove biased steps → re-run instruction model) constitutes an intervention test for the causal claim that biased reasoning causes errors; robustness of the LLM judge across prompt variants (Section 5.1, Table 5) partially validates the oracle, though human annotation is absent.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Conclusions like 'reasoning-based models do not mitigate biases' are stated broadly across the paper while evidence covers only five open-source models plus cited OpenAI system card numbers on a single English-language benchmark (BBQ); computational constraints also excluded the full DeepSeek-R1 model.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section 4.3 explicitly notes that incorrect answers can arise without biased reasoning (white lines in Figure 3b) and identifies non-negative polarity misinterpretation as an 
independent source of errors (Figure 4b, Appendix A.7).", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper uses distinct metrics for different constructs: exact-match accuracy for prediction correctness and formally-defined bias scores (Equations 2–3) for stereotype expression, and keeps these separate throughout analysis.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "A dedicated 'Limitations' section is present, separate from the conclusion, addressing the LLM judge, mitigation design, computational constraints, and refusal behavior.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats include: LLM-as-a-judge uncertainty without human annotation verification, inability to test full DeepSeek-R1 due to compute costs, distilled models potentially carrying inherent biases, and rare refusal behavior due to BBQ's controlled framing.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly bounds scope to distilled DeepSeek-R1 variants rather than the full model, and to the BBQ benchmark in zero-shot English-language settings; computational and cost limitations are explicitly named.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No acknowledgment or funding section appears anywhere in the paper; funding sources are entirely absent.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All author affiliations are clearly listed on the title page (Santa Clara University, Rochester Institute of Technology, Docomo Innovations).", 82 "source": 
"haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding source is disclosed, making this criterion not applicable.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or declaration of financial interests appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper defines social bias (stereotype-based associations), SfRP, ADBP, ambiguous vs. disambiguated contexts, and provides formal equations for both accuracy and bias score metrics.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Three explicit contributions are enumerated: first evaluation of bias in reasoning steps (not just outputs), empirical finding that reasoning amplifies stereotypes, and the ADBP mitigation strategy.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The related work section distinguishes this study from prior bias work (which focused on outputs, not reasoning steps) and from prior CoT studies (which focused on math/code), clearly positioning the contribution as a gap-filling study.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract links directly to https://github.com/elviswxy/LLM_reasoning_bias for evaluation and mitigation code.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "BBQ is a publicly available standard benchmark (Parrish et al., 2022); no novel dataset was created.", 131 "source": "haiku" 132 }, 133 
"environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Only hardware (NVIDIA A100 GPUs) is mentioned; no requirements.txt, Dockerfile, or Python/library versions are provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": true, 142 "justification": "Algorithm 1 specifies the ADBP procedure step-by-step, all prompts are provided in Appendix A.2, model HuggingFace links are given, and code is released.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars are reported for the main accuracy or bias score results in Tables 1–2.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "Comparative claims (e.g., 'ADBP outperforms SfRP in most cases') are made without any statistical significance tests.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Numerical accuracy improvements from mitigation are reported in absolute terms (e.g., +0.517 in Case 1, +0.717 in Case 3), providing quantified effect sizes.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The full BBQ dataset (58,492 examples) is used without any power analysis or justification for whether this is sufficient to detect the observed effect sizes reliably.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "The LLM-as-a-judge scores each step 5 times and takes majority vote, but variance across those runs is not reported; main accuracy/bias results show no spread.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 
"answer": true, 182 "justification": "ADBP is compared against SfRP-based mitigation, Self-debiasing via Explanation (Gallegos et al., 2025), and Combined Debiasing Prompt (Liu et al., 2025a) in Figure 6 and Table 2.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "All mitigation baselines (Gallegos et al., 2025; Liu et al., 2025a) are from 2025, published at NAACL 2025 and contemporaneous with this work.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "The SfRP component is an ablation: removing biased steps from DeepSeek reasoning and re-querying the base model isolates the contribution of biased reasoning to incorrect predictions (Figure 5).", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Two distinct metrics are used throughout: accuracy (Acc) and bias score (Bias), each split by context type (ambiguous/disambiguated).", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "The paper explicitly states 'We did not conduct human labeling to verify [LLM-as-a-judge] reliability due to the extremely high cost of manual annotation.'", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": false, 211 "answer": false, 212 "justification": "This is an inference-only evaluation study; no model training or train/test splits are involved.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Tables 1a and 1b provide per-category accuracy and bias scores across all 11 BBQ categories for each evaluated model.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Appendix A.6 provides full qualitative failure examples (biased reasoning 
leading to wrong answers across Religion, Age, Disability categories), and Section 4.3 analyzes failure patterns systematically.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper reports that ADBP underperforms SfRP for the Qwen-32B base model (Case 3, Table 2) and that reasoning models fail to reduce bias despite improving accuracy.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact HuggingFace model IDs are provided via footnotes for all open-source models (e.g., deepseek-ai/DeepSeek-R1-Distill-Llama-8B); OpenAI results are sourced from the o3-mini system card.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "All prompts are provided verbatim in Appendix A.2 (Figures 7–12), including evaluation prompts for instruction-tuned models, DeepSeek models, LLM-as-a-judge, and ADBP.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "The paper states generation parameters follow each model's system card without specifying temperature, top-p, or other decoding hyperparameters.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used; this is a direct inference evaluation study.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Reasoning step extraction (newline splitting of <think> tokens), k=100 bin normalization for visualization, exact-match response normalization with regex, and LLM-as-a-judge 5-run majority voting are all documented.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": 
true, 269 "answer": false, 270 "justification": "While code is released, the intermediate scored outputs (per-step LLM judge bias scores for all reasoning traces) are not explicitly released as a dataset artifact.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "The BBQ dataset is described (Table 3, dataset statistics), model inference procedure is described, and the LLM-as-a-judge scoring procedure (GPT-4o, 5 runs, majority vote) is detailed in Section 3.4.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; the study uses a pre-existing benchmark dataset.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full pipeline from BBQ input → model inference → reasoning step extraction → LLM-as-a-judge scoring → accuracy/bias computation is described across Sections 3–4.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training data cutoff dates are stated for any of the evaluated models (DeepSeek, Llama, Qwen, OpenAI).", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "BBQ (published 2022) could be in any of the 2024–2025 model training corpora; the paper does not discuss or test for this possibility.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "BBQ predates all evaluated models' training, making contamination plausible, but the paper does not address whether BBQ examples were in the models' pretraining data.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": 
false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "The paper mentions 'monetary cost' as a drawback of the LLM-as-a-judge approach (Section 6) but provides no actual cost figures for GPT-4o calls or model inference.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "NVIDIA A100 GPUs are mentioned but no total compute budget (GPU-hours, cost) is stated.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Reasoning-based models (DeepSeek-R1 variants) improve prediction accuracy compared to base instruction-tuned models but do not reduce social bias — in many categories they exhibit equal or worse bias scores.", 375 "evidence": "Table 1: DeepSeek-8B achieves highest accuracy in all 11 categories but shows similar or worse bias in 9/11 ambiguous categories vs. 
Llama-8B.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Biased reasoning steps are significantly more prevalent in incorrect predictions than in correct predictions across multiple BBQ categories.", 380 "evidence": "Figure 3 and Table 6 show higher average bias scores in subsets where the reasoning model is wrong (e.g., Age: 1.06 vs. 0.23; SES: 1.57 vs. 0.46; Religion: 1.55 vs. 0.64).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Removing biased reasoning steps (SfRP) and re-querying instruction-tuned models consistently improves accuracy on previously failed cases.", 385 "evidence": "Figure 5: SfRP improves accuracy by +0.517 (Case 1) and +0.717 (Case 3); even in both-fail cases accuracy increases by +0.100 and +0.526.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "ADBP outperforms SfRP mitigation in most cases by using answer distribution shifts across incremental reasoning steps as a proxy for bias.", 390 "evidence": "Table 2: ADBP exceeds SfRP accuracy in Cases 1, 2, and 4 for both model families; ADBP corrects 38–60% of initially incorrect cases vs. SfRP's 24–44%. 
Exception is Qwen-32B Case 3.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Ambiguity amplifies bias in reasoning models: reasoning-based models that outperform base models in disambiguated contexts often fail to do so in ambiguous contexts.", 395 "evidence": "Section 4.2: OpenAI o1, o1-mini, o3-mini underperform GPT-4o in ambiguous contexts; DeepSeek-32B fails to consistently outperform Qwen-32B under ambiguity in categories like Age and Physical Appearance.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Bias tends to intensify in later reasoning steps: once a biased step appears, the model tends to persist along a faulty trajectory.", 400 "evidence": "Figure 3b and 3d show bias accumulating toward the end of reasoning chains for incorrect predictions, while correct predictions show isolated, non-propagating bias.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "observational", 407 "case-study" 408 ], 409 "key_findings": "Reasoning-based LLMs (DeepSeek-R1 variants, Marco-o1) improve accuracy on the BBQ social bias benchmark over base instruction-tuned models but do not mitigate social bias — in ambiguous contexts they often amplify stereotypes, particularly for categories like Age, Physical Appearance, and SES. Biased reasoning steps are strongly correlated with incorrect predictions, and removing them via SfRP consistently improves accuracy, supporting a causal role for biased reasoning in prediction errors. 
The proposed ADBP method, which uses answer distribution shifts across incremental reasoning steps as a bias proxy, outperforms SfRP and prompt-only debiasing baselines in most tested scenarios without requiring an external judge at inference time.", 410 "red_flags": [ 411 { 412 "flag": "LLM judge unvalidated", 413 "detail": "The paper explicitly acknowledges it did not conduct human annotation to validate the LLM-as-a-judge reliability; this oracle underpins the primary causal claim (bias → incorrect predictions) and the SfRP intervention." 414 }, 415 { 416 "flag": "No statistical significance testing", 417 "detail": "Comparative claims (ADBP outperforms SfRP, reasoning models amplify bias vs. base models) are made without any significance tests despite per-category sample sizes that would permit them." 418 }, 419 { 420 "flag": "BBQ contamination unaddressed", 421 "detail": "BBQ was published in 2022; all evaluated models (DeepSeek-R1, Llama-3.1, Qwen2.5) were trained after 2022. Potential benchmark contamination in pretraining data is not discussed." 422 }, 423 { 424 "flag": "Broad generalization from few models", 425 "detail": "Conclusions about 'reasoning-based LLMs' in general are drawn from 2–3 open-source model families on a single English-language benchmark, without the full DeepSeek-R1 or diverse proprietary models." 426 }, 427 { 428 "flag": "Depth analysis limited to 3/11 categories", 429 "detail": "The reasoning step bias analysis (Figure 3, Section 5.2) is demonstrated for only Age, Religion, and SES categories out of 11 BBQ categories, limiting generalizability of the mechanism claim." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "BBQ: A Hand-Built Bias Benchmark for Question Answering", 435 "relevance": "Primary evaluation benchmark; defines ambiguous/disambiguated context structure and bias scoring methodology used throughout." 
436 }, 437 { 438 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 439 "relevance": "Foundational CoT paper that motivates evaluating whether reasoning chains introduce bias." 440 }, 441 { 442 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 443 "relevance": "Primary evaluated models (distilled variants DeepSeek-8B and DeepSeek-32B) are sourced from this work." 444 }, 445 { 446 "title": "Decoding Biases: Automated Methods and LLM Judges for Gender Bias Detection in Language Models", 447 "relevance": "Provides the LLM-as-a-judge methodology adopted for per-step bias scoring." 448 }, 449 { 450 "title": "On Second Thought, Let's Not Think Step by Step! Bias and Toxicity in Zero-Shot Reasoning", 451 "relevance": "Most closely related prior work showing CoT prompting can increase harmful outputs in sensitive domains." 452 }, 453 { 454 "title": "Evaluating Gender Bias in Large Language Models via Chain-of-Thought Prompting", 455 "relevance": "Related prior work on CoT and gender bias; key differentiator is that this paper analyzes native reasoning chains rather than prompted CoT." 456 }, 457 { 458 "title": "Self-Debiasing Large Language Models: Zero-Shot Recognition and Reduction of Stereotypes", 459 "relevance": "One of the two mitigation baselines compared against ADBP in Figure 6." 460 }, 461 { 462 "title": "Evaluating and Mitigating Social Bias for Large Language Models in Open-Ended Settings", 463 "relevance": "Combined Debiasing Prompt baseline compared against ADBP in Figure 6." 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 2, 469 "justification": "ADBP is a lightweight, annotation-free method applicable at inference time to any model that exposes reasoning traces, making it directly usable by practitioners deploying reasoning LLMs." 
470 }, 471 "surprise_contrarian": { 472 "score": 3, 473 "justification": "The central finding — that reasoning capability improves accuracy but simultaneously amplifies social bias — directly contradicts the intuition that 'better reasoning = better alignment.'" 474 }, 475 "fear_safety": { 476 "score": 2, 477 "justification": "Demonstrates that reasoning models from widely deployed families (distilled DeepSeek-R1 variants, plus OpenAI o1-series results cited from the system card) systematically express stereotypes against protected groups in QA tasks, raising concrete deployment safety concerns." 478 }, 479 "drama_conflict": { 480 "score": 1, 481 "justification": "Challenges DeepSeek-R1 and OpenAI o1's implicit safety claims but without naming adversarial parties; low drama framing." 482 }, 483 "demo_ability": { 484 "score": 2, 485 "justification": "Code is released and models (DeepSeek-R1 distilled variants) are freely available on HuggingFace, enabling reproduction; BBQ dataset is public." 486 }, 487 "brand_recognition": { 488 "score": 2, 489 "justification": "DeepSeek-R1 and OpenAI o1/o3-mini are high-profile models; EMNLP venue adds credibility, though the institution (Santa Clara University) is not a top-tier AI lab." 490 } 491 }, 492 "hn_data": { 493 "threads": [ 494 { 495 "hn_id": "43405094", 496 "title": "Politicians' misinformation behavior and public engagement, in 4 countries", 497 "points": 3, 498 "comments": 0, 499 "url": "https://news.ycombinator.com/item?id=43405094", 500 "created_at": "2025-03-18T21:03:45Z" 501 } 502 ], 503 "top_points": 3, 504 "total_points": 3, 505 "total_comments": 0 506 } 507 }