scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32921B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Does Reasoning Introduce Bias? A Study of Social Bias Evaluation and Mitigation in LLM Reasoning",
      6     "authors": [
      7       "Xuyang Wu",
      8       "Jinming Nian",
      9       "Ting-Ruen Wei",
     10       "Zhiqiang Tao",
     11       "Hsin-Tai Wu"
     12     ],
     13     "year": 2025,
     14     "venue": "Conference on Empirical Methods in Natural Language Processing",
     15     "arxiv_id": "2502.15361",
     16     "doi": "10.18653/v1/2025.findings-emnlp.1006"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims are supported: 'reasoning-based models improve accuracy' (Table 1), bias amplification in reasoning (Section 4.3, Figures 3-4), ADBP 'outperforms SfRP baseline in most cases' (Table 2, with exception noted). The 'most cases' hedge is appropriate.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal-adjacent claims ('biased reasoning steps contribute to incorrect predictions') but is careful: 'this strong correlation between bias and wrong answer does not imply perfect causality' (Section 4.3). The SfRP ablation (removing biased steps → improved accuracy) is a controlled single-variable manipulation adequate for the hedged claims.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Does Reasoning Introduce Bias?' and the abstract's 'first systematic evaluation of social bias within LLM-generated reasoning' are broader than the evidence supports. The study uses only the BBQ dataset (US-centric biases), only a few models (mainly DeepSeek-R1 distilled variants), and zero-shot setting only. Results may not generalize to other bias benchmarks, languages, or full-size models.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 4.3 discusses non-negative polarity question misinterpretation as an alternative explanation for incorrect answers (Figure 4b), and notes 'many cases in Figure 3b have unbiased reasoning steps (white lines), indicating that DeepSeek-8B can still produce incorrect answers without biased reasoning.'",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper's claims match its measurements: it measures BBQ accuracy and bias scores and frames results in those terms. The BBQ benchmark is specifically designed to measure social bias in QA, and the paper does not overframe its measurements as capturing all forms of social bias.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section is present after Section 7, discussing LLM-as-a-judge reliability, lack of human verification, cost-efficiency, computational constraints, distilled model limitations, and refusal behavior.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats discussed: 'The LLM-as-a-judge method... may still introduce uncertainty', 'we were unable to conduct experiments on the full-size DeepSeek-R1 model, and the distilled versions may carry inherent biases', 'refusal behavior was rare... likely due to the controlled nature of BBQ dataset questions.' These are specific to this study.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The Limitations section discusses what wasn't done (no human labeling, no full-size models, no cost analysis) but does not explicitly state what the results do NOT show or which populations/settings are excluded. No statement bounding conclusions to US-centric biases, English-only, or zero-shot settings.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgments section found in the paper. No mention of grants, sponsors, or funding agencies.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Santa Clara University, Rochester Institute of Technology, and Docomo Innovations (corporate affiliation).",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "One author is affiliated with Docomo Innovations (NTT subsidiary). No funding disclosure means funder independence cannot be verified. The paper does not evaluate Docomo products, so the conflict risk is low, but independence is not established.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement found in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Social bias is defined through examples and formal metrics (Equations 2-3); stereotyping and non-stereotyping responses are defined precisely in Section 3.1; reasoning models are distinguished from instruction-tuned models.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit numbered contributions are stated at the end of Section 1: first systematic evaluation of bias in reasoning steps, empirical finding that reasoning amplifies bias, and the ADBP mitigation strategy.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 situates the work relative to prior bias evaluation (BBQ, Shaikh et al. 2023, Kaneko et al. 2024) and explains the novel angle of evaluating intermediate reasoning steps rather than final outputs.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code repository provided in abstract: 'Evaluation and mitigation code is available at https://github.com/elviswxy/LLM_reasoning_bias.'",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "They use the BBQ dataset (Parrish et al., 2022), a publicly available benchmark. The dataset is referenced by name and citation.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions 'NVIDIA A100 GPUs' (Section 4.1) but provides no requirements.txt, Dockerfile, conda environment, or library versions.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions found in the paper. The code repository is linked but no README or reproduction guide is described.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 1, 2, and 4 report only point estimates for accuracy and bias scores. No confidence intervals, error bars, or uncertainty measures on any main results.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Claims like 'DeepSeek-8B consistently outperforms similar-sized models' (Section 4.2) are based on comparing raw numbers. No statistical tests (p-values, t-tests, etc.) are reported for any comparison.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Accuracy differences with baseline context are reported: 'accuracy increases compared to using the original (biased) reasoning: by 0.517 and 0.717 in Cases 1 and 3' (Section 4.4). Table 2 provides absolute accuracy values enabling effect size comparison.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification given for why this specific set of models was chosen, or why the BBQ dataset size (58,492 examples) is adequate for the claims. No power analysis discussed.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance or standard deviation reported across experimental runs. The LLM-as-a-judge uses majority voting over 5 runs, but main model accuracy and bias results appear to be single-run without any spread measure.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines included: instruction-tuned counterparts (Llama-8B, Qwen-32B) as base models, SfRP as mitigation baseline, plus Self-debiasing via Explanation and Combined Debiasing Prompt (Figure 6, Section 6.1).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All models are recent: DeepSeek-R1 (2025), OpenAI o1/o3-mini (2025), Qwen2.5 (2024), Llama 3.1 (2024), Marco-o1 (2024). Mitigation baselines from 2025.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 4.4 performs a bias ablation evaluation: comparing full reasoning (with bias) vs. SfRP (without biased steps) across four cases, directly measuring the contribution of biased reasoning steps to incorrect predictions (Figure 5).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Two primary metrics are used throughout: Accuracy (Acc, Eq. 1) and Bias Score (Bias, Eqs. 2-3), reported separately for ambiguous and disambiguated contexts.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation conducted. Reasoning step bias is assessed by LLM-as-a-judge (GPT-4o). The Limitations section states: 'We did not conduct human labeling to verify its reliability due to the extremely high cost of manual annotation.'",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Models are evaluated zero-shot without fine-tuning on BBQ (Section 4.1: 'under a zero-shot setting without fine-tuning'), so the entire BBQ dataset serves as a held-out test set with respect to fine-tuning; exposure to BBQ during pretraining is not ruled out.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 1 provides per-category breakdowns across all 11 BBQ bias categories (Age, Disability, Gender, etc.) for each model, for both ambiguous and disambiguated contexts.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 4.3 provides detailed failure analysis with Figures 3-4. They identify non-negative polarity question misinterpretation as a specific failure mode (Section 4.3, Figure 4b, Appendix A.7), and show qualitative failure examples (Appendix A.6).",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "ADBP underperforms SfRP when base model is Qwen-32B (Table 2, Case 3). They also report reasoning models don't reduce bias despite improving accuracy, and that prompt-only debiasing strategies 'do not reliably correct reasoning-induced bias' (Section 6.1).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Open-source models are specified with HuggingFace URLs (e.g., 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B'). However, GPT-4o used as the LLM-as-a-judge (a critical methodology component) has no version or snapshot date specified. OpenAI model results are 'taken from o3-mini system card' without specifying exact versions.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt text provided in Appendix A.2: outcome evaluation prompts (Figures 7, 8), LLM-as-a-judge prompts (Figures 9, 10, 11), and ADBP prompt (Figure 12).",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Section 4.1 states 'we adhere to the same generation parameters as specified in each model's system card' but does not list the actual hyperparameters (temperature, top-p, etc.) used. Deferring to external system cards is insufficient for reproducibility.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The paper performs direct model inference with prompts.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.4 describes reasoning step extraction (splitting by newline character), Section 3.2 describes output normalization (exact string matching with regular expressions), and Section 4.3 describes reasoning chain normalization (k=100 uniform bins).",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The BBQ dataset is public, but the raw model outputs, reasoning traces, and LLM-as-a-judge scores are not explicitly stated to be released. Only 'evaluation and mitigation code' is available.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 describes the BBQ dataset (9 categories + 2 intersectional, sourced from US EEOC), Section 3.2 describes model inference procedure, Section 3.4 describes the LLM-as-a-judge evaluation setup with 5 independent scores and majority voting.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data comes from the BBQ benchmark dataset (standard public benchmark).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is documented: BBQ dataset → model inference with customized prompts → exact string matching for accuracy (Eq. 1) → bias score computation (Eqs. 2-3) → reasoning step extraction by newline → LLM-as-a-judge scoring with majority voting. Dataset statistics in Table 3 (58,492 total examples).",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No model's training data cutoff date is stated. The BBQ dataset was published in 2022, and models evaluated (DeepSeek-R1, GPT-4o, o1) were trained after 2022, but no cutoff dates are discussed.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether BBQ examples appeared in any model's training data. Given BBQ has been publicly available since 2022 and all evaluated models were trained after 2022, this is a significant omission.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "BBQ was published in 2022 (ACL Findings) and has been widely cited. All evaluated models could have seen BBQ examples during training. No contamination analysis is performed or discussed.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or token consumption reported. ADBP requires querying the model once per reasoning step (potentially dozens of forward passes per example), but computational cost is not quantified. Limitations section acknowledges 'without consideration for cost-efficiency.'",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Only 'NVIDIA A100 GPUs' mentioned (Section 4.1). No GPU hours, total API spend, or training/inference time reported.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of random seeds or seed sensitivity analysis for any experiment.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The LLM-as-a-judge evaluates each step 5 times with majority voting (Section 3.4), but the main model inference (accuracy and bias results) does not state how many runs produced the results. Appears to be single-run.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search described. Prompts were 'customized' for each model but no search budget or alternatives tried are reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "The paper uses zero-shot evaluation with model-specific prompts aligned to system documentation (Section 4.1). Prompts are transparently provided in Appendix A.2, and the approach follows standard evaluation conventions without selecting from multiple configurations.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Many comparisons made across 11 categories, multiple models, and two context types without any statistical tests or multiple comparison corrections.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors propose ADBP and compare it against baselines without acknowledging the bias of evaluating their own method. No independent evaluation mentioned.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "ADBP requires multiple forward passes per example (one per reasoning step), while SfRP and other baselines require fewer passes. The compute cost differences are not discussed or compared.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "BBQ is used as the sole bias benchmark without discussing whether it adequately captures social bias in reasoning. No comparison with alternative bias benchmarks or discussion of BBQ's limitations as a construct validity measure for reasoning bias specifically.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding involved. The paper performs direct model inference with prompts.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "BBQ was published in 2022 (ACL Findings). All evaluated models were trained after 2022. No discussion of whether models may have memorized BBQ examples.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation format or context structure leaks information to the model beyond what would be available in a real-world bias scenario.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether BBQ items share structural templates or whether performance on one item predicts performance on related items within the same category.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention methods used (no canary strings, membership inference, temporal splits, or decontamination).",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Reasoning-based LLMs improve accuracy over instruction-tuned counterparts but do not reduce social bias and often amplify stereotypes, especially in ambiguous contexts.",
    457       "evidence": "Table 1 shows DeepSeek-R1-Distill models achieve higher accuracy than base models (e.g., DeepSeek-8B 0.90 vs Llama-8B 0.82 on ambiguous accuracy) but maintain similar or worse bias scores (0.51 vs 0.56 Biasamb); Appendix Table 4 shows o1 has higher bias susceptibility than GPT-4o despite comparable reasoning capabilities.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Biased reasoning steps are strongly correlated with incorrect predictions, with bias intensifying in later reasoning steps for wrong answers.",
    462       "evidence": "Table 6 shows systematically higher bias scores for incorrect vs correct predictions across Age (1.06 vs 0.23), SES (1.57 vs 0.46), and Religion (1.55 vs 0.64) categories; Figure 3 visualizes bias intensifying after initial steps for incorrect answers.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "ADBP outperforms SfRP baseline in most experimental cases by using answer distribution shifts as a supervision-free proxy for bias.",
    467       "evidence": "Table 2 shows ADBP exceeds SfRP in 3 of 4 cases (e.g., Case 2 Llama-8B: 0.5017 vs 0.1200; Case 2 DeepSeek-8B: 0.4816 vs 0.2400), with the exception being the strong Qwen-32B base model.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "The LLM-as-judge bias evaluation methodology is robust across prompt variations and scale changes, with relative trends holding across three prompt formulations.",
    472       "evidence": "Table 5 shows that while absolute bias scores shift across prompt rewrites and 5-level vs 3-level scales, the relative ordering between correct and incorrect predictions remains consistent across all three prompt variants.",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Ambiguity specifically disrupts reasoning-based models by triggering over-reliance on stereotypical associations, whereas base models maintain stable accuracy across ambiguous and disambiguated contexts.",
    477       "evidence": "Section 4.2 observes that reasoning models (including o1, o3-mini) underperform GPT-4o under ambiguity while base models show stable cross-condition accuracy; the paired ambiguous/disambiguated items differ only by a clarifying sentence, allowing attribution to ambiguity itself.",
    478       "supported": "moderate"
    479     }
    480   ],
    481   "methodology_tags": [
    482     "benchmark-eval"
    483   ],
    484   "key_findings": "Reasoning-based LLMs (DeepSeek-R1 variants, OpenAI o1/o3-mini) achieve higher prediction accuracy than instruction-tuned counterparts on the BBQ social bias benchmark, but this accuracy gain does not translate to reduced stereotyping—in ambiguous contexts, reasoning models systematically amplify social biases more than base models. Biased reasoning steps correlate strongly with incorrect predictions (mean bias scores roughly 2-5x higher in wrong vs correct answers across multiple categories), and bias typically intensifies in later reasoning steps, suggesting a path-dependence where early stereotypical associations compound. The proposed ADBP mitigation strategy, which detects bias by tracking answer distribution shifts across incremental reasoning steps, corrects 38-60% of originally incorrect cases without external supervision, outperforming the supervised SfRP baseline in 3 of 4 test configurations, though it underperforms for the strongest base model (Qwen-32B).",
    485   "red_flags": [
    486     {
    487       "flag": "No statistical significance tests",
    488       "detail": "All comparisons between models and mitigation methods are presented as raw point estimates without any statistical tests, confidence intervals, or error bars, making it impossible to assess whether observed differences are reliable."
    489     },
    490     {
    491       "flag": "LLM-as-judge for bias labels without human validation",
    492       "detail": "GPT-4o is used to assign bias scores to every reasoning step; the authors explicitly acknowledge this may introduce uncertainty and note human validation was skipped due to cost, meaning the step-level bias analysis rests on an unvalidated automated labeler (final-answer accuracy is scored separately by exact string matching)."
    493     },
    494     {
    495       "flag": "Benchmark contamination unaddressed",
    496       "detail": "BBQ was published in 2022 and is publicly available; all evaluated models (2024-2025) almost certainly saw it during pretraining, potentially inflating accuracy scores and making the bias measurements unreliable as a clean test of reasoning behavior."
    497     },
    498     {
    499       "flag": "No variance across runs",
    500       "detail": "All accuracy and bias metrics are single-point estimates despite LLM inference being stochastic; even reporting two runs would establish whether results are stable."
    501     },
    502     {
    503       "flag": "Narrow evidence for broad title",
    504       "detail": "The paper asks 'Does Reasoning Introduce Bias?' but all evidence comes from one English benchmark (BBQ) and five model families; the paper does not limit its conclusions to these settings, so generalization to other tasks, languages, or reasoning paradigms remains unsupported."
    505     }
    506   ],
    507   "cited_papers": [
    508     {
    509       "title": "BBQ: A Hand-Built Bias Benchmark for Question Answering",
    510       "relevance": "Primary evaluation benchmark used throughout the paper; defines bias categories, ambiguous/disambiguated structure, and the bias score formulas adopted"
    511     },
    512     {
    513       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    514       "relevance": "Foundational CoT work that motivates studying bias propagation through reasoning chains"
    515     },
    516     {
    517       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    518       "relevance": "Primary reasoning model family evaluated; introduces the architecture whose bias behavior is the main study subject"
    519     },
    520     {
    521       "title": "On Second Thought, Let's Not Think Step by Step! Bias and Toxicity in Zero-Shot Reasoning",
    522       "relevance": "Direct predecessor: showed zero-shot CoT prompting increases harmful outputs; this paper extends to native reasoning models and intermediate steps"
    523     },
    524     {
    525       "title": "Evaluating Gender Bias in Large Language Models via Chain-of-Thought Prompting",
    526       "relevance": "Related work on CoT and gender bias evaluation; cited to differentiate this paper's focus on natively reasoning models vs prompted CoT"
    527     },
    528     {
    529       "title": "Decoding Biases: Automated Methods and LLM Judges for Gender Bias Detection in Language Models",
    530       "relevance": "Source of the LLM-as-judge methodology adopted for per-step bias scoring"
    531     },
    532     {
    533       "title": "Thoughts Are All Over the Place: On the Underthinking of o1-like LLMs",
    534       "relevance": "Parallel work on reasoning instability in o1-like models; motivates the analysis of reasoning step dynamics"
    535     },
    536     {
    537       "title": "Self-Debiasing Large Language Models: Zero-Shot Recognition and Reduction of Stereotypes",
    538       "relevance": "One of two baseline mitigation methods compared against ADBP in Figure 6"
    539     }
    540   ],
    541   "engagement_factors": {
    542     "practical_relevance": {
    543       "score": 2,
    544       "justification": "ADBP is a lightweight, supervision-free mitigation strategy applicable to deployed reasoning models without retraining or external annotation."
    545     },
    546     "surprise_contrarian": {
    547       "score": 3,
    548       "justification": "The core finding directly contradicts common intuition: better reasoning capability amplifies rather than reduces social bias, especially when reasoning models encounter ambiguity."
    549     },
    550     "fear_safety": {
    551       "score": 2,
    552       "justification": "Demonstrates that widely deployed reasoning models (o1, DeepSeek-R1) systematically reinforce social stereotypes in their chain-of-thought, with concrete implications for high-stakes QA applications."
    553     },
    554     "drama_conflict": {
    555       "score": 2,
    556       "justification": "Frames reasoning-based models as creating a new alignment problem rather than solving existing bias issues, positioning reasoning as a double-edged capability."
    557     },
    558     "demo_ability": {
    559       "score": 2,
    560       "justification": "Code released on GitHub and experiments use a publicly available benchmark; practitioners can run the evaluation and ADBP mitigation on their own models."
    561     },
    562     "brand_recognition": {
    563       "score": 2,
    564       "justification": "Evaluates DeepSeek-R1 and OpenAI o1/o3-mini, two high-profile reasoning model families with strong community recognition at time of publication."
    565     }
    566   },
    567   "hn_data": {
    568     "threads": [
    569       {
    570         "hn_id": "43405094",
    571         "title": "Politicians' misinformation behavior and public engagement, in 4 countries",
    572         "points": 3,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=43405094",
    575         "created_at": "2025-03-18T21:03:45Z"
    576       }
    577     ],
    578     "top_points": 3,
    579     "total_points": 3,
    580     "total_comments": 0
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs