scan.json (25999B)
1 { 2 "paper": { 3 "title": "Investigating Intersectional Bias in Large Language Models using Confidence Disparities in Coreference Resolution", 4 "authors": [ 5 "Falaah Arif Khan", 6 "Nivedha Sivakumar", 7 "Yinong Oliver Wang", 8 "Katherine Metcalf", 9 "Cezanne Camacho", 10 "Barry-John Theobald", 11 "Luca Zappella", 12 "Nicholas Apostoloff" 13 ], 14 "year": 2025, 15 "venue": "COLM 2025", 16 "arxiv_id": "2508.07111", 17 "doi": "10.48550/arXiv.2508.07111" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "LLMs show coreference confidence disparities up to 40% across intersectional demographic attributes including body type, sexual orientation, and socio-economic status, with doubly-disadvantaged identities most affected in anti-stereotypical settings. Even hegemonic markers (e.g., 'White', 'cisgender') decrease coreference confidence compared to unaugmented baselines, suggesting models rely on memorization rather than reasoning. The WinoIdentity benchmark (245,700 prompts across 50 intersectional identities) reveals two independent failure modes—invalidity (memorization) and value misalignment (biased reasoning)—that can compound to cause social harm.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "GitHub repository explicitly linked: https://github.com/apple/ml-winoidentity (footnote 1, §3)." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "WinoIdentity dataset released at the same GitHub link. The benchmark is constructed from the publicly available WinoBias dataset with documented augmentations." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No mention of environment specifications, requirements files, or library versions in the paper." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided in the paper. The construction procedure is described conceptually but no runnable scripts or commands are detailed." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Main results in Table 2 and Figures 2-3 report point estimates of coreference confidence disparity with no confidence intervals or error bars. Table 3 reports mean ± std dev for coreference confidence but the main disparity metric lacks uncertainty quantification." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are used despite claims of disparities between subgroups. Differences are reported as raw numbers without any hypothesis testing." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Effect sizes are reported as absolute confidence disparities with baseline context (e.g., '20 to 40%' disparities in Table 2), and specific coreference confidence values for each subgroup, allowing readers to assess magnitude." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification for why 1575 base sentences from WinoBias or 245,700 total prompts is sufficient. No power analysis discussed." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": true, 70 "justification": "Table 3 reports mean ± standard deviation for coreference confidence across prompts under different augmentation conditions." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Multiple baselines included: no-augmentation baseline (gender-only disparity), non-demographic augmentation baseline (Table 2, §A.6), and comparison across 5 models." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Models evaluated include Llama3-70b-instruct and Mixtral-8x7B, which are contemporary. The benchmark builds on the established WinoBias framework." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Three augmentation types (R-Aug, NR-Aug, C-Aug) are tested separately, along with non-demographic augmentation baselines (§A.6) and Chain-of-Thought prompting ablation (§A.5), effectively ablating the augmentation design." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Both coreference confidence (Eq. 1) and coreference accuracy are reported (Tables 3, 5), plus the disparity metric (Eq. 2) applied to both." 93 }, 94 "human_evaluation": { 95 "applies": false, 96 "answer": false, 97 "justification": "Human evaluation is not relevant here; the benchmark has deterministic ground truth answers from WinoBias construction, and the study measures model confidence/accuracy against known correct coreferences." 98 }, 99 "held_out_test_set": { 100 "applies": false, 101 "answer": false, 102 "justification": "The paper evaluates pre-trained models without training; there is no train/test split to manage. The entire WinoIdentity dataset is an evaluation corpus." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by demographic attribute (10 attributes), sentence type (Type-1 vs Type-2), augmentation type, individual occupations (Fig. 4), and per-model in Tables 2, 4, 5." 
108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Failure cases are discussed extensively: doubly-disadvantaged identities on anti-stereotypical occupations (§4.2, Fig. 4), models being confident and wrong on male-dominated occupations with feminine pronouns." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The CoT prompting experiment (§A.5) shows that while CoT reduces disparities, it also decreases overall confidence, presenting a trade-off rather than a clean solution." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims of 40% confidence disparities are supported by Table 2 (e.g., socio-economic status at 0.4 for Mistral). Claims about doubly-disadvantaged identities are supported by §4.2 and Fig. 4. Memorization claims supported by §4.1." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper claims models 'rely on memorization rather than reasoning' (§4.1, abstract) based on observational evidence (confidence drops with augmentation). This is a causal claim made from correlational evidence — the drop in confidence when adding identity markers could have explanations other than memorization, and no causal identification strategy is used." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper explicitly limits scope to the US context (§6 Ethical Considerations), acknowledges limitation to 50 identities and 1575 WinoBias sentences (§5 Limitations), and notes findings may not generalize to other cultural settings." 
135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "Non-demographic augmentation baseline (§A.6) tests whether any additional word (not just demographic markers) causes the effect. CoT experiment (§A.5) tests whether prompting strategy matters. The paper distinguishes memorization from biased reasoning effects." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper carefully distinguishes between what is measured (coreference confidence on synthetic prompts) and what it claims to indicate (representational harm in real-world decision contexts). §6 explicitly acknowledges that 'it's unclear how to translate mathematical unfairness into practical social harm.'" 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions stated: mistral-7B-instruct-v0.2, mixtral-8x7B-instruct, llama3-70b-instruct, pythia-12B, falcon-40B-instruct (§4.1)." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt construction is described with examples (§3, Fig. 1), and an actual CoT prompt example is shown in §A.5. The prompts follow a deterministic template from WinoBias with clearly specified augmentations." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Greedy decoding is explicitly stated (§4, 'We use greedy decoding as this ensures deterministic predictions for reproducibility'), which specifies the key sampling parameter." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The evaluation is direct model querying for next-token probabilities." 
167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "The dataset construction pipeline is thoroughly described (§3.2, Fig. 1): demographic markers from Table 1, three augmentation procedures, and how prompts are generated from WinoBias base sentences." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Dedicated 'Limitations and Future Work' subsection in §5 and a separate 'Ethical Considerations' section (§6) with substantive discussion." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Specific threats discussed: limited to 50 identities and 1575 sentences (§5), combinatorial explosion problem (§5), US-specific cultural context (§6), difficulty translating mathematical unfairness to practical social harm (§6)." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Paper explicitly states scope is limited to the US context (§6), binary gender intersected with 10 attributes (§3.2), coreference resolution task only (§5), and identifies specific expansions needed (other evaluation corpora, more markers)." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The WinoIdentity dataset is released via GitHub (https://github.com/apple/ml-winoidentity), enabling independent reproduction and verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Data construction is fully described: demographic markers drawn from the Wheel of Power and Privilege (Table 1), three augmentation procedures defined (§3.2), with the source WinoBias dataset clearly referenced." 
201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. The data is synthetically constructed from WinoBias and demographic markers." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The full pipeline is documented in §3.2 and Fig. 1: 1575 base WinoBias prompts × 2 pronouns × (25+1) demographic markers × 3 augmentation types = 245,700 prompts." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding or acknowledgments section is present in the paper." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All authors disclosed as affiliated with Apple (header: '1Work done while at Apple, 2Apple')." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "Apple is not evaluating its own models. The evaluated models are Mistral, Mixtral, Llama3, Pythia, and Falcon — none are Apple products. Apple has no financial stake in the bias findings for these models." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for any of the 5 evaluated models." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": true, 244 "justification": "The paper explicitly discusses that WinoBias is likely in LLM training data and may be memorized (§1, §5: 'existing fairness benchmarks are likely to be included in LLM training data and potentially memorized'). This is central to the validity argument." 
245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": true, 249 "justification": "Contamination is a core theme. The paper argues WinoBias (2018) is likely memorized, and designs augmented prompts specifically to test whether models rely on memorization. The non-demographic augmentation baseline (§A.6) and the confidence drop for hegemonic markers both serve as contamination probes." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, latency, or computational requirements reported despite running 245,700 prompts across 5 models." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No total compute budget, GPU hours, or hardware specifications reported." 
299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": true, 305 "justification": "Greedy decoding is used, making results deterministic (§4: 'We use greedy decoding as this ensures deterministic predictions for reproducibility'). No seed sensitivity because there is no randomness." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Single deterministic run per model due to greedy decoding — this is implicitly stated and appropriate since results are reproducible." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": false, 314 "answer": false, 315 "justification": "No hyperparameter tuning is performed. Models are evaluated off-the-shelf with greedy decoding." 316 }, 317 "best_config_selection_justified": { 318 "applies": false, 319 "answer": false, 320 "justification": "No configuration selection; all models evaluated with the same fixed setup." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "Many comparisons across 10 attributes, 5 models, 3 augmentations, and 2 sentence types, but no multiple comparison correction applied. No statistical tests performed at all." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": false, 329 "answer": false, 330 "justification": "The paper evaluates third-party models, not their own system. No self-comparison bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": false, 334 "answer": false, 335 "justification": "Not applicable; the study evaluates bias, not system performance improvement. Compute differences between models are not the focus." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "The paper extensively discusses what WinoBias and WinoIdentity actually measure vs. what is claimed. The validity framework from Coston et al. 
(2023) is applied throughout (§1, §5), and the paper distinguishes validity concerns from value misalignment concerns." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is involved; models are queried directly for next-token probabilities." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "The paper acknowledges WinoBias (2018) predates all evaluated models and is likely memorized (§1, §5). The augmentation framework is specifically designed to test beyond memorized patterns." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": true, 357 "justification": "The augmentation design tests whether models are affected by information that should be irrelevant (demographic markers), effectively probing for a form of feature leakage from identity markers to coreference predictions." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether the WinoBias prompts are structurally independent or whether augmented variants of the same base prompt create non-independence issues in the analysis." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": true, 367 "justification": "The augmentation framework itself serves as a leakage detection method: if models memorized WinoBias answers, augmented versions should show confidence drops, which is exactly what is observed and analyzed (§4.1)." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Coreference confidence disparities reach as high as 40% along demographic attributes including body type, sexual orientation, and socio-economic status.", 374 "evidence": "Table 2 shows Mistral has 0.4 disparity for socio-economic status (Type-2), 0.392 for body type (Type-2), and multiple models exceed 0.2 across attributes. 
Table 4 extends this to all augmentation types.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Models are most uncertain about doubly-disadvantaged identities in anti-stereotypical settings.", 379 "evidence": "§4.2 and Fig. 4 show that for mechanic, fem coreference confidence is -0.065, transgender fem is -0.11, and gay fem is -0.24, demonstrating compounding disadvantage.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "LLMs rely more on memorization than reasoning, as evidenced by confidence drops even for hegemonic markers.", 384 "evidence": "§4.1, Figs. 9-10 show coreference confidence consistently decreases after referent augmentation even for hegemonic markers like 'White' and 'cisgender'. The non-demographic baseline (§A.6) also shows drops but to a lesser extent.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Uncertainty-based evaluation detects bias that accuracy-based evaluation misses.", 389 "evidence": "§A.3.1 and Table 3 show accuracy increases under non-referent augmentation (suggesting improvement) while confidence analysis reveals this is due to bias (models less likely to pick the augmented non-referent), correctly identifying the mechanism.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Chain-of-Thought prompting reduces intersectional disparities but at the cost of overall confidence.", 394 "evidence": "Tables 6-9 (§A.5) show CoT generally reduces disparities but also decreases coreference confidence for both Type-1 and Type-2 sentences across most attributes.", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "No statistical significance tests", 401 "detail": "Despite comparing disparities across 10 demographic attributes, 5 models, 3 augmentation types, and 2 sentence types, no statistical significance tests are performed. All comparisons are based on raw numerical differences, making it unclear which observed disparities are statistically meaningful versus noise." 
402 }, 403 { 404 "flag": "Causal claims from observational evidence", 405 "detail": "The memorization claim is central to the paper's narrative but is inferred from confidence drops upon augmentation. Alternative explanations (distributional shift, syntactic complexity from added tokens) are partially but not fully addressed." 406 }, 407 { 408 "flag": "Company evaluating fairness of competitors' models", 409 "detail": "Apple researchers evaluate Mistral, Meta (Llama), and other companies' models for bias. While the evaluated models are not Apple products (reducing direct conflict), the finding that competitors' models are biased could serve Apple's interests." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "On the dangers of stochastic parrots: Can language models be too big?", 415 "authors": ["Emily M Bender", "Timnit Gebru", "Angelina McMillan-Major", "Shmargaret Shmitchell"], 416 "year": 2021, 417 "relevance": "Foundational work on risks of large language models, relevant to AI safety and alignment research." 418 }, 419 { 420 "title": "Are emergent abilities of large language models a mirage?", 421 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 422 "year": 2023, 423 "relevance": "Challenges emergent abilities narrative in LLMs, relevant to LLM capability evaluation methodology." 424 }, 425 { 426 "title": "GSM-Symbolic: Understanding the limitations of mathematical reasoning in large language models", 427 "authors": ["Iman Mirzadeh", "Keivan Alizadeh", "Hooman Shahrokhi"], 428 "year": 2024, 429 "arxiv_id": "2410.05229", 430 "relevance": "Demonstrates LLM reasoning limitations through symbolic math tasks, directly relevant to AI capability assessment." 
431 }, 432 { 433 "title": "Gender bias in coreference resolution: Evaluation and debiasing methods", 434 "authors": ["Jieyu Zhao", "Tianlu Wang", "Mark Yatskar", "Vicente Ordonez", "Kai-Wei Chang"], 435 "year": 2018, 436 "relevance": "Original WinoBias paper that this work extends; foundational benchmark for bias evaluation in NLP." 437 }, 438 { 439 "title": "A validity perspective on evaluating the justified use of data-driven decision-making algorithms", 440 "authors": ["Amanda Coston", "Anna Kawakami", "Haiyi Zhu", "Ken Holstein", "Hoda Heidari"], 441 "year": 2023, 442 "relevance": "Validity framework applied in this paper to distinguish validity concerns from value alignment in AI systems." 443 }, 444 { 445 "title": "Uncertainty as a fairness measure", 446 "authors": ["Selim Kuzucu", "Jiaee Cheong", "Hatice Gunes", "Sinan Kalkan"], 447 "year": 2024, 448 "relevance": "Formalizes uncertainty-based fairness evaluation, directly inspiring the methodology of this paper." 449 }, 450 { 451 "title": "Gender, race, and intersectional bias in resume screening via language model retrieval", 452 "authors": ["Kyra Wilson", "Aylin Caliskan"], 453 "year": 2024, 454 "relevance": "Studies intersectional bias in LLM-based hiring, directly relevant to AI fairness and LLM evaluation." 455 }, 456 { 457 "title": "BBQ: A hand-built bias benchmark for question answering", 458 "authors": ["Alicia Parrish", "Angelica Chen", "Nikita Nangia"], 459 "year": 2022, 460 "relevance": "Major bias benchmark for LLM evaluation across multiple demographic attributes." 461 }, 462 { 463 "title": "The silicon ceiling: Auditing GPT's race and gender biases in hiring", 464 "authors": ["Lena Armstrong", "Abbey Liu", "Stephen MacNeil", "Danaë Metaxa"], 465 "year": 2024, 466 "arxiv_id": "2405.04412", 467 "relevance": "Audits GPT for hiring bias, relevant to LLM fairness evaluation methodology." 
468 }, 469 { 470 "title": "A survey on fairness in large language models", 471 "authors": ["Yingji Li", "Mengnan Du", "Rui Song", "Xin Wang", "Ying Wang"], 472 "year": 2023, 473 "arxiv_id": "2308.10149", 474 "relevance": "Comprehensive survey of LLM fairness evaluations, relevant as meta-research on AI evaluation methodology." 475 } 476 ] 477 }