scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29287B)
      1 {
      2   "paper": {
      3     "title": "Does Reasoning Introduce Bias? A Study of Social Bias Evaluation and Mitigation in LLM Reasoning",
      4     "authors": [
      5       "Xuyang Wu",
      6       "Jinming Nian",
      7       "Ting-Ruen Wei",
      8       "Zhiqiang Tao",
      9       "Hsin-Tai Wu",
     10       "Yi Fang"
     11     ],
     12     "year": 2025,
     13     "venue": "Findings of the Association for Computational Linguistics: EMNLP 2025",
     14     "arxiv_id": "2502.15361",
     15     "doi": "10.18653/v1/2025.findings-emnlp.1006"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "Reasoning-based LLMs (DeepSeek-R1 variants, OpenAI o1) improve accuracy over instruction-tuned counterparts on the BBQ bias benchmark but do not reduce social bias, and in some categories amplify it, especially under ambiguity. Biased reasoning steps strongly correlate with incorrect predictions, with bias intensifying after initial reasoning steps. Removing biased steps (SfRP) consistently improves accuracy, and the proposed ADBP method outperforms SfRP in most cases by tracking answer distribution shifts across incremental reasoning steps as a proxy for bias detection.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Code repository provided in abstract: 'Evaluation and mitigation code is available at https://github.com/elviswxy/LLM_reasoning_bias.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "They use the BBQ dataset (Parrish et al., 2022), a publicly available benchmark. The dataset is referenced by name and citation."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions 'NVIDIA A100 GPUs' (Section 4.1) but provides no requirements.txt, Dockerfile, conda environment, or library versions."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions found in the paper. The code repository is linked but no README or reproduction guide is described."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 1, 2, and 4 report only point estimates for accuracy and bias scores. No confidence intervals, error bars, or uncertainty measures on any main results."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Claims like 'DeepSeek-8B consistently outperforms similar-sized models' (Section 4.2) are based on comparing raw numbers. No statistical tests (p-values, t-tests, etc.) are reported for any comparison."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Accuracy differences with baseline context are reported: 'accuracy increases compared to using the original (biased) reasoning: by 0.517 and 0.717 in Cases 1 and 3' (Section 4.4). Table 2 provides absolute accuracy values enabling effect size comparison."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification given for why this specific set of models was chosen, or why the BBQ dataset size (58,492 examples) is adequate for the claims. No power analysis discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance or standard deviation reported across experimental runs. The LLM-as-a-judge uses majority voting over 5 runs, but main model accuracy and bias results appear to be single-run without any spread measure."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Multiple baselines included: instruction-tuned counterparts (Llama-8B, Qwen-32B) as base models, SfRP as mitigation baseline, plus Self-debiasing via Explanation and Combined Debiasing Prompt (Figure 6, Section 6.1)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All models are recent: DeepSeek-R1 (2025), OpenAI o1 (2024) and o3-mini (2025), Qwen2.5 (2024), Llama 3.1 (2024), Marco-o1 (2024). Mitigation baselines from 2025."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section 4.4 performs a bias ablation evaluation: comparing full reasoning (with bias) vs. SfRP (without biased steps) across four cases, directly measuring the contribution of biased reasoning steps to incorrect predictions (Figure 5)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Two primary metrics are used throughout: Accuracy (Acc, Eq. 1) and Bias Score (Bias, Eqs. 2-3), reported separately for ambiguous and disambiguated contexts."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation conducted. Reasoning step bias is assessed by LLM-as-a-judge (GPT-4o). The Limitations section states: 'We did not conduct human labeling to verify its reliability due to the extremely high cost of manual annotation.'"
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
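    100         "justification": "Models are evaluated zero-shot without fine-tuning on BBQ (Section 4.1: 'under a zero-shot setting without fine-tuning'), so the entire BBQ dataset serves as a test set that the authors never train on; prior exposure to BBQ during model pre-training is not ruled out (see the contamination checks)."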
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 1 provides per-category breakdowns across all 11 BBQ bias categories (Age, Disability, Gender, etc.) for each model, for both ambiguous and disambiguated contexts."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4.3 provides detailed failure analysis with Figures 3-4. They identify non-negative polarity question misinterpretation as a specific failure mode (Section 4.3, Figure 4b, Appendix A.7), and show qualitative failure examples (Appendix A.6)."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "ADBP underperforms SfRP when base model is Qwen-32B (Table 2, Case 3). They also report reasoning models don't reduce bias despite improving accuracy, and that prompt-only debiasing strategies 'do not reliably correct reasoning-induced bias' (Section 6.1)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims are supported: 'reasoning-based models improve accuracy' (Table 1), bias amplification in reasoning (Section 4.3, Figures 3-4), ADBP 'outperforms SfRP baseline in most cases' (Table 2, with exception noted). The 'most cases' hedge is appropriate."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper makes causal-adjacent claims ('biased reasoning steps contribute to incorrect predictions') but is careful: 'this strong correlation between bias and wrong answer does not imply perfect causality' (Section 4.3). The SfRP ablation (removing biased steps → improved accuracy) is a controlled single-variable manipulation adequate for the hedged claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Does Reasoning Introduce Bias?' and the abstract's 'first systematic evaluation of social bias within LLM-generated reasoning' are broader than the evidence supports. The study uses only the BBQ dataset (US-centric biases), only a few models (mainly DeepSeek-R1 distilled variants), and zero-shot setting only. Results may not generalize to other bias benchmarks, languages, or full-size models."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 4.3 discusses non-negative polarity question misinterpretation as an alternative explanation for incorrect answers (Figure 4b), and notes 'many cases in Figure 3b have unbiased reasoning steps (white lines), indicating that DeepSeek-8B can still produce incorrect answers without biased reasoning.'"
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper's claims match its measurements: it measures BBQ accuracy and bias scores and frames results in those terms. The BBQ benchmark is specifically designed to measure social bias in QA, and the paper does not overframe its measurements as capturing all forms of social bias."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Open-source models are specified with HuggingFace URLs (e.g., 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B'). However, GPT-4o used as the LLM-as-a-judge (a critical methodology component) has no version or snapshot date specified. OpenAI model results are 'taken from o3-mini system card' without specifying exact versions."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt text provided in Appendix A.2: outcome evaluation prompts (Figures 7, 8), LLM-as-a-judge prompts (Figures 9, 10, 11), and ADBP prompt (Figure 12)."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Section 4.1 states 'we adhere to the same generation parameters as specified in each model's system card' but does not list the actual hyperparameters (temperature, top-p, etc.) used. Deferring to external system cards is insufficient for reproducibility."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The paper performs direct model inference with prompts."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3.4 describes reasoning step extraction (splitting by newline character), Section 3.2 describes output normalization (exact string matching with regular expressions), and Section 4.3 describes reasoning chain normalization (k=100 uniform bins)."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A dedicated 'Limitations' section is present after Section 7, discussing LLM-as-a-judge reliability, lack of human verification, cost-efficiency, computational constraints, distilled model limitations, and refusal behavior."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Specific threats discussed: 'The LLM-as-a-judge method... may still introduce uncertainty', 'we were unable to conduct experiments on the full-size DeepSeek-R1 model, and the distilled versions may carry inherent biases', 'refusal behavior was rare... likely due to the controlled nature of BBQ dataset questions.' These are specific to this study."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The Limitations section discusses what wasn't done (no human labeling, no full-size models, no cost analysis) but does not explicitly state what the results do NOT show or which populations/settings are excluded. No statement bounding conclusions to US-centric biases, English-only, or zero-shot settings."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The BBQ dataset is public, but the raw model outputs, reasoning traces, and LLM-as-a-judge scores are not explicitly stated to be released. Only 'evaluation and mitigation code' is available."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.1 describes the BBQ dataset (9 categories + 2 intersectional, sourced from the US EEOC), Section 3.2 describes the model inference procedure, and Section 3.4 describes the LLM-as-a-judge evaluation setup with 5 independent scores and majority voting."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data comes from the BBQ benchmark dataset (standard public benchmark)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The full pipeline is documented: BBQ dataset → model inference with customized prompts → exact string matching for accuracy → bias score computation (Eqs. 1-3) → reasoning step extraction by newline → LLM-as-a-judge scoring with majority voting. Dataset statistics in Table 3 (58,492 total examples)."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgments section found in the paper. No mention of grants, sponsors, or funding agencies."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Santa Clara University, Rochester Institute of Technology, and Docomo Innovations (corporate affiliation)."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "One author is affiliated with Docomo Innovations (NTT subsidiary). No funding disclosure means funder independence cannot be verified. The paper does not evaluate Docomo products, so the conflict risk is low, but independence is not established."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement found in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No model's training data cutoff date is stated. The BBQ dataset was published in 2022, and models evaluated (DeepSeek-R1, GPT-4o, o1) were trained after 2022, but no cutoff dates are discussed."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether BBQ examples appeared in any model's training data. Given BBQ has been publicly available since 2022 and all evaluated models were trained after 2022, this is a significant omission."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "BBQ was published in 2022 (ACL Findings) and has been widely cited. All evaluated models could have seen BBQ examples during training. No contamination analysis is performed or discussed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, latency, or token consumption reported. ADBP requires querying the model once per reasoning step (potentially dozens of forward passes per example), but computational cost is not quantified. Limitations section acknowledges 'without consideration for cost-efficiency.'"
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Only 'NVIDIA A100 GPUs' mentioned (Section 4.1). No GPU hours, total API spend, or training/inference time reported."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of random seeds or seed sensitivity analysis for any experiment."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The LLM-as-a-judge evaluates each step 5 times with majority voting (Section 3.4), but the main model inference (accuracy and bias results) does not state how many runs produced the results. Appears to be single-run."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search described. Prompts were 'customized' for each model but no search budget or alternatives tried are reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "The paper uses zero-shot evaluation with model-specific prompts aligned to system documentation (Section 4.1). Prompts are transparently provided in Appendix A.2, and the approach follows standard evaluation conventions without selecting from multiple configurations."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Many comparisons made across 11 categories, multiple models, and two context types without any statistical tests or multiple comparison corrections."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors propose ADBP and compare it against baselines without acknowledging the bias of evaluating their own method. No independent evaluation mentioned."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "ADBP requires multiple forward passes per example (one per reasoning step), while SfRP and other baselines require fewer passes. The compute cost differences are not discussed or compared."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "BBQ is used as the sole bias benchmark without discussing whether it adequately captures social bias in reasoning. No comparison with alternative bias benchmarks, and no discussion of BBQ's construct validity as a measure of reasoning bias specifically."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding involved. The paper performs direct model inference with prompts."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "BBQ was published in 2022 (ACL Findings). All evaluated models were trained after 2022. No discussion of whether models may have memorized BBQ examples."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation format or context structure leaks information to the model beyond what would be available in a real-world bias scenario."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether BBQ items share structural templates or whether performance on one item predicts performance on related items within the same category."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention methods used (no canary strings, membership inference, temporal splits, or decontamination)."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Reasoning-based models improve accuracy over instruction-tuned counterparts but do not reduce social bias, and in many cases amplify it.",
    372       "evidence": "Table 1 shows DeepSeek-8B outperforms Llama-8B in accuracy across all 11 BBQ categories in both ambiguous and disambiguated contexts, but exhibits 'similar or even worse bias scores in certain areas (9 out of 11 categories for ambiguous questions)' (Section 4.2).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Biased reasoning steps correlate with incorrect predictions, with bias intensifying after initial reasoning steps.",
    377       "evidence": "Figure 3 shows darker red (stronger bias) in cases where reasoning models produce incorrect answers compared to correct ones. Section 4.3 notes 'bias often intensifies after the initial steps in the reasoning chain.' Authors acknowledge 'this strong correlation between bias and wrong answer does not imply perfect causality.'",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Removing biased reasoning steps (SfRP) consistently improves prediction accuracy.",
    382       "evidence": "Figure 5 and Section 4.4: accuracy increases 'by 0.517 and 0.717 in Cases 1 and 3' when using SfRP vs. biased reasoning. Improvements seen in all four test cases.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "ADBP outperforms SfRP-based mitigation in most scenarios.",
    387       "evidence": "Table 2 shows ADBP achieves higher accuracy than SfRP in 6 of 8 conditions (4 cases × 2 model pairs). Exception: Qwen-32B base model, where SfRP outperforms. Section 6.1 attributes this to Qwen-32B's strength in utilizing filtered reasoning steps.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Ambiguity disrupts reasoning and amplifies stereotype propagation in reasoning-based models.",
    392       "evidence": "Section 4.2: 'In ambiguous contexts, the advantage of reasoning-based models diminishes.' Table 1 shows performance gaps widen under ambiguity for categories like Age, Physical Appearance, SES, and Nationality. Hypothesis supported by two observations about paired items and base model stability.",
    393       "supported": "moderate"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "No statistical significance tests",
    399       "detail": "All comparisons between models and methods rely on point estimate comparisons (raw accuracy/bias numbers). No p-values, confidence intervals, or hypothesis tests are reported for any claim of difference, making it impossible to distinguish real effects from noise."
    400     },
    401     {
    402       "flag": "LLM-as-a-judge without human validation",
    403       "detail": "The core contribution (bias in reasoning steps) relies entirely on GPT-4o as judge. While the authors tested prompt robustness (Section 5.1), they explicitly state they 'did not conduct human labeling to verify its reliability.' The judge model itself could harbor biases that systematically affect results."
    404     },
    405     {
    406       "flag": "Complete absence of contamination analysis",
    407       "detail": "BBQ was published in 2022 and is widely cited. All evaluated models (DeepSeek-R1, GPT-4o, o1/o3-mini) were trained after 2022 and could have memorized BBQ examples. If models have seen the answers, both accuracy and bias measurements are confounded. No training cutoffs stated, no overlap analysis performed."
    408     },
    409     {
    410       "flag": "ADBP computational cost unknown",
    411       "detail": "ADBP requires querying the model once per reasoning step (Algorithm 1, lines 2-6), potentially dozens of forward passes per example on the 58,492-example BBQ dataset. The Limitations section acknowledges 'without consideration for cost-efficiency' but the actual cost is never quantified, making practical applicability unclear."
    412     },
    413     {
    414       "flag": "Single benchmark evaluation",
    415       "detail": "All findings rest on a single bias benchmark (BBQ). BBQ is US-centric and tests 11 specific bias categories in a multiple-choice QA format. The broad title ('Does Reasoning Introduce Bias?') and 'first systematic evaluation' claim are not fully supported by evaluation on one dataset."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    421       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed H. Chi", "Quoc V. Le", "Denny Zhou"],
    422       "year": 2022,
    423       "relevance": "Foundational work on chain-of-thought reasoning in LLMs, directly relevant to understanding reasoning capabilities evaluated in this paper."
    424     },
    425     {
    426       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    427       "authors": ["DeepSeek-AI"],
    428       "year": 2025,
    429       "arxiv_id": "2501.12948",
    430       "relevance": "Primary reasoning model evaluated; demonstrated reasoning distillation from larger to smaller models."
    431     },
    432     {
    433       "title": "BBQ: A hand-built bias benchmark for question answering",
    434       "authors": ["Alicia Parrish", "Angelica Chen", "Nikita Nangia", "Vishakh Padmakumar", "Jason Phang", "Jana Thompson", "Phu Mon Htut", "Samuel R. Bowman"],
    435       "year": 2022,
    436       "relevance": "The bias evaluation benchmark used throughout this study; establishes the methodology for measuring social bias in QA systems."
    437     },
    438     {
    439       "title": "On second thought, let's not think step by step! Bias and toxicity in zero-shot reasoning",
    440       "authors": ["Omar Shaikh", "Hongxin Zhang", "William Held", "Michael S. Bernstein", "Diyi Yang"],
    441       "year": 2023,
    442       "relevance": "Prior work showing CoT prompting can increase harmful outputs in sensitive domains; directly relevant to bias-in-reasoning theme."
    443     },
    444     {
    445       "title": "The Llama 3 herd of models",
    446       "authors": ["Abhimanyu Dubey", "Abhinav Jauhri", "Abhinav Pandey", "Abhishek Kadian"],
    447       "year": 2024,
    448       "arxiv_id": "2407.21783",
    449       "relevance": "Base model (Llama-3.1-8B-Instruct) used as instruction-tuned counterpart in evaluation."
    450     },
    451     {
    452       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    453       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    454       "year": 2023,
    455       "relevance": "Establishes the LLM-as-a-judge methodology used for reasoning step bias evaluation in this paper."
    456     },
    457     {
    458       "title": "Decoding biases: Automated methods and LLM judges for gender bias detection in language models",
    459       "authors": ["Shachi H. Kumar", "Saurav Sahay", "Sahisnu Mazumder"],
    460       "year": 2024,
    461       "arxiv_id": "2408.03907",
    462       "relevance": "Provides the LLM-as-a-judge bias rating methodology adapted in this paper."
    463     },
    464     {
    465       "title": "Self-debiasing large language models: Zero-shot recognition and reduction of stereotypes",
    466       "authors": ["Isabel O. Gallegos", "Ryan Aponte", "Ryan A. Rossi"],
    467       "year": 2025,
    468       "relevance": "One of the mitigation baselines (Self-debiasing via Explanation) compared against ADBP."
    469     },
    470     {
    471       "title": "Thoughts are all over the place: On the underthinking of o1-like LLMs",
    472       "authors": ["Yue Wang", "Qiuzhi Liu", "Jiahao Xu"],
    473       "year": 2025,
    474       "arxiv_id": "2501.18585",
    475       "relevance": "Documents reasoning failure patterns (underthinking, frequent reasoning switches) in o1-like models, motivating the bias analysis in this paper."
    476     },
    477     {
    478       "title": "Evaluating gender bias in large language models via chain-of-thought prompting",
    479       "authors": ["Masahiro Kaneko", "Danushka Bollegala", "Naoaki Okazaki", "Timothy Baldwin"],
    480       "year": 2024,
    481       "arxiv_id": "2401.15585",
    482       "relevance": "Investigates how CoT prompting affects gender bias evaluation and mitigation, directly related to the CoT-bias interaction studied here."
    483     },
    484     {
    485       "title": "Evaluating and mitigating social bias for large language models in open-ended settings",
    486       "authors": ["Zhao Liu", "Tian Xie", "Xueru Zhang"],
    487       "year": 2025,
    488       "arxiv_id": "2412.06134",
    489       "relevance": "Provides the Combined Debiasing Prompt baseline compared against ADBP in Section 6.1."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs