scan-v5.json (19071B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "HearSay Benchmark: Do Audio LLMs Leak What They Hear?", 6 "authors": [ 7 "Jin Wang", 8 "Liang Lin", 9 "Kaiwen Luo", 10 "Weiliu Wang", 11 "Yitian Chen" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.03783", 16 "doi": "10.48550/arXiv.2601.03783" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Key abstract claims — 92.89% gender accuracy (Table 1), near-zero refusal rates for physiological traits (Table 1), and CoT amplifying risk in capable models (Figure 4) — are all supported; the CoT claim is appropriately qualified to 'capable models' matching the mixed evidence.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Claims that CoT 'boosts Accent inference by 22.1%' are based on direct controlled comparisons with and without CoT prompting; the paper frames these as observational effects rather than mechanism claims, which is adequate for the study design.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper claims ALLMs 'inherently' encode private attributes as an 'emergent property of pre-training,' but the entire dataset derives from public figures giving academic lectures — a highly unrepresentative population — and no caveat qualifies these sweeping generalizations.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "RQ3 explicitly investigates whether high IAR reflects genuine acoustic reasoning vs. statistical hallucination using the Blind Bias Rate (BBR) metric, and Figure 5 visualizes model-level separation between these two explanations.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "IAR directly measures accuracy against ground-truth attribute labels; the paper is clear it measures model inference accuracy on labeled clips, not real-world privacy harm, though the gap between these is not fully theorized.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 6 'Limitation' is present and substantive, identifying English-centrism, lack of audio-level defense testing, and unexplored adversarial attack vectors.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "The limitations section omits the most critical validity threats: all speakers are public figures who are likely already in models' training data (inflating IAR via memorization), and the validity of inferring social constructs like income and social stratum from voice is never questioned.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper acknowledges English-centrism and unexplored attack vectors but never explicitly states that results cannot generalize to non-public-figure speakers or that acoustic social attribute inference may reflect corpus stereotypes rather than real-world acoustic correlates.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No acknowledgments or funding disclosure section appears anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are listed on the title page: XDU, NTU, NCEPU, BUPT, SHU, UAEU, UCAS-IIE, and Squirrel AI.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed; NA since no funder to assess, though one author's affiliation with Squirrel AI (a commercial AI company) is undisclosed in terms of potential conflict.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or financial disclosure of any kind appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "ALLM architecture is formally defined in Section 2.1 with mathematical notation; IAR, ARR, and BBR are all precisely defined in Section 4.1 with equations and operational criteria.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper explicitly positions HearSay as 'the first comprehensive benchmark investigating the privacy inference capabilities of ALLMs solely from voiceprint' with clear evaluation goals across eight attributes.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2.2 distinguishes HearSay from text-based privacy inference (Staab et al.), vision-based approaches (Li et al. 2025b), and prior task-specific audio classifiers, explaining the specific gap this benchmark fills.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "benchmark-creation": { 120 "construct_design": { 121 "construct_validity_argued": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper motivates that pre-training may encode voice-attribute correlations but provides no systematic argument for why each of the 8 attributes is validly measurable from voiceprint alone, particularly abstract social constructs like income and social stratum.", 125 "source": "haiku" 126 }, 127 "difficulty_distribution_characterized": { 128 "applies": true, 129 "answer": false, 130 "justification": "Figure 2 shows attribute class distributions but no difficulty tiers are defined or measured; varying IAR across attributes implicitly shows difficulty variation but this is not analyzed or designed as a benchmark property.", 131 "source": "haiku" 132 }, 133 "ceiling_floor_effects_checked": { 134 "applies": true, 135 "answer": false, 136 "justification": "Gender achieves 92.89% average IAR — a near-ceiling result — but the paper treats this as a finding rather than a discriminability problem; no explicit ceiling/floor effect analysis is performed or flagged as a benchmark design concern.", 137 "source": "haiku" 138 }, 139 "human_baseline_included": { 140 "applies": true, 141 "answer": false, 142 "justification": "No human baseline is reported for any attribute; models are only compared against random guessing, leaving open whether results represent superhuman, human-level, or sub-human inference capability.", 143 "source": "haiku" 144 }, 145 "scoring_rubric_justified": { 146 "applies": true, 147 "answer": false, 148 "justification": "GPT-4o-mini is used as automated judge for IAR but no validation against human annotations is reported, no inter-annotator reliability is measured, and no edge-case handling for ambiguous responses is described.", 149 "source": "haiku" 150 } 151 }, 152 "robustness": { 153 "contamination_resistance_designed": { 154 "applies": true, 155 "answer": false, 156 "justification": "The dataset is built from public figures' lecture audio that is very likely in evaluated models' training corpora; no temporal split, canary strings, held-out test partition, or contamination check is designed or discussed.", 157 "source": "haiku" 158 }, 159 "temporal_robustness_discussed": { 160 "applies": true, 161 "answer": false, 162 "justification": "No discussion of how the benchmark will remain useful as models improve; with gender already at 92.89%, physiological attributes may saturate rapidly, and no update plan is mentioned.", 163 "source": "haiku" 164 }, 165 "failure_modes_discussed": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper discusses model failure modes (BBR, CoT degradation) but not benchmark failure modes — there is no analysis of what HearSay itself cannot measure or how it could be systematically gamed.", 169 "source": "haiku" 170 }, 171 "baseline_implementations_provided": { 172 "applies": true, 173 "answer": true, 174 "justification": "Code is available at https://github.com/JinWang79/HearSay_Benchmark and all evaluation prompts are provided verbatim in Appendix C.", 175 "source": "haiku" 176 } 177 }, 178 "documentation": { 179 "dataset_documentation_complete": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 3.2 describes the four-stage data curation pipeline; Appendix A documents all three external sources (NISP, VocalSound, Age-Vox-Celeb) with their collection methodology and preprocessing steps.", 183 "source": "haiku" 184 }, 185 "licensing_and_access_clear": { 186 "applies": true, 187 "answer": true, 188 "justification": "Appendix A.2 explicitly states the license for each external dataset (CC BY 4.0, CC BY-SA 4.0, NTT Software Evaluation License) and explains the controlled-access mechanism requiring a formal research application.", 189 "source": "haiku" 190 }, 191 "intended_use_specified": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 7 (Ethical Statement) explicitly states the research-only intent, acknowledges dual-use risk from malicious profiling applications, and restricts dataset use to academic non-commercial purposes.", 195 "source": "haiku" 196 } 197 } 198 } 199 }, 200 "claims": [ 201 { 202 "claim": "ALLMs achieve average gender inference accuracy of 92.89% from voiceprint alone", 203 "evidence": "Table 1 shows average IAR for Gender across 9 open-source models is 92.89%, with MERaLION reaching 96.44%", 204 "supported": "strong" 205 }, 206 { 207 "claim": "Audio-based inference surpasses random guessing baseline by 46.4%", 208 "evidence": "Figure 3 compares With-Audio, Transcribed-Text, and Random Guessing settings; the 46.4% gap is labeled on the figure", 209 "supported": "strong" 210 }, 211 { 212 "claim": "Most open-source models have near-zero Answer Refusal Rates for physiological attributes", 213 "evidence": "Table 1 shows ARR near 0% for Gender across nearly all open-source models despite high IAR; only GPT-4o-Audio achieves 100% ARR on Weight", 214 "supported": "strong" 215 }, 216 { 217 "claim": "CoT prompting boosts Accent inference accuracy by 22.1% in capable models (Qwen3-Omni-Flash)", 218 "evidence": "Figure 4 shows CoT vs. no-CoT comparison; 22.1% gain on Accent for Qwen3-Omni-Flash is reported, with mixed effects across other models", 219 "supported": "moderate" 220 }, 221 { 222 "claim": "Privacy leakage stems from genuine acoustic inference, not statistical hallucination", 223 "evidence": "Figure 5 BBR analysis shows capable models (Qwen3-Omni-Flash, MiniCPM-o-2.6) systematically deviate from prior distributions when given audio, placing them in the 'Correction Zone'", 224 "supported": "moderate" 225 }, 226 { 227 "claim": "Lightweight prompt defense significantly increases refusal rates in responsive models", 228 "evidence": "Table 2 shows Kimi-Audio ARR for Age jumps from 0.46% to 97.96% with a safety system prompt; MiniCPM-o-2.6 reaches 61.42% overall ARR under defense", 229 "supported": "strong" 230 }, 231 { 232 "claim": "Social attribute inference (Income, Social Stratum) is effectively possible with capable ALLMs", 233 "evidence": "Qwen3-Omni-Flash achieves 61.19% on Income (Table 1), but average IAR for social attributes across all models is only 25-43%, barely above random for many", 234 "supported": "weak" 235 } 236 ], 237 "methodology_tags": [ 238 "benchmark-creation", 239 "benchmark-eval" 240 ], 241 "key_findings": "HearSay is a 22,064-clip benchmark demonstrating that Audio LLMs can infer private attributes from voiceprint with alarming accuracy — 92.89% average for gender, and up to 61.19% for abstract social attributes like income in capable models. Nearly all open-source models have near-zero answer refusal rates despite high inference accuracy, while GPT-4o-Audio shows strong defensive behavior. Chain-of-Thought reasoning amplifies privacy risk in capable models (+22.1% Accent accuracy for Qwen3-Omni-Flash) but degrades performance in weaker models. BBR analysis confirms capable models extract genuine acoustic signal rather than hallucinating from statistical priors.", 242 "red_flags": [ 243 { 244 "flag": "Public figure selection bias", 245 "detail": "The dataset is entirely constructed from academic public lecturers — an extreme population skew for social attributes like income, education, and social stratum. All speakers self-selected into public roles, making results ungeneralizable to the general population while the paper makes sweeping claims about ALLM privacy risk for ordinary users." 246 }, 247 { 248 "flag": "Training data contamination unaddressed", 249 "detail": "Public figures whose lectures constitute the benchmark are very likely in the training corpora of the evaluated models, creating a memorization confound that could inflate IAR substantially. No contamination check, temporal split, or speaker de-duplication against known training sets is performed." 250 }, 251 { 252 "flag": "No human baseline", 253 "detail": "Without human performance on the same eight attributes, there is no way to calibrate whether model accuracy represents superhuman inference or simply replicates what any person could infer from voice — a standard requirement for 'emergent capability' claims." 254 }, 255 { 256 "flag": "LLM judge unvalidated", 257 "detail": "GPT-4o-mini is used as automated judge for all IAR calculations with no validation against human annotations, no inter-rater reliability metrics, and no analysis of judge failure modes for ambiguous responses." 258 }, 259 { 260 "flag": "Construct validity of social attributes questionable", 261 "detail": "The claim that income, social stratum, and education can be validly inferred from voiceprint is scientifically contested and potentially encodes harmful stereotypes; the paper provides no literature grounding for these specific voice-attribute correlations and no discussion of whether the associations in the dataset reflect real acoustic signals or sociolinguistic confounds." 262 }, 263 { 264 "flag": "Gender ceiling effect unaddressed as benchmark design flaw", 265 "detail": "At 92.89% average accuracy, gender is immediately saturated and provides minimal discriminative power for ranking models, yet this is celebrated as a finding rather than identified as a benchmark design limitation that undermines longitudinal usefulness." 266 } 267 ], 268 "cited_papers": [ 269 { 270 "title": "Beyond memorization: Violating privacy via inference with large language models", 271 "relevance": "Direct predecessor establishing that LLMs can infer private attributes from text; HearSay extends this to the audio modality" 272 }, 273 { 274 "title": "The man behind the sound: Demystifying audio private attribute profiling via multimodal large language model agents", 275 "relevance": "Concurrent work on audio privacy inference using multimodal agents with synthetic/TV-series audio; HearSay claims improvement via real-world public lecture audio with ground-truth labels" 276 }, 277 { 278 "title": "Auditing M-LLMs for privacy risks: A synthetic benchmark and evaluation framework", 279 "relevance": "Related benchmark for multimodal LLM privacy evaluation using synthetic data, contrasted with HearSay's real audio approach" 280 }, 281 { 282 "title": "AudioTrust: Benchmarking the multifaceted trustworthiness of audio large language models", 283 "relevance": "Concurrent comprehensive ALLM safety benchmark; HearSay focuses specifically on voiceprint privacy inference as one trustworthiness dimension" 284 }, 285 { 286 "title": "Extracting training data from large language models", 287 "relevance": "Foundational privacy work motivating the shift from memorization-based to inference-based privacy risk analysis in LLMs" 288 }, 289 { 290 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 291 "relevance": "Methodological basis for the LLM-as-judge evaluation paradigm used for IAR scoring throughout HearSay experiments" 292 }, 293 { 294 "title": "Do-not-answer: A dataset for evaluating safeguards in LLMs", 295 "relevance": "Related work on safety refusal evaluation methodology, informing the ARR metric design and safety alignment assessment" 296 } 297 ], 298 "engagement_factors": { 299 "practical_relevance": { 300 "score": 2, 301 "justification": "Practitioners deploying audio LLMs can use HearSay to audit privacy alignment before deployment, though controlled-access limits immediate usability." 302 }, 303 "surprise_contrarian": { 304 "score": 2, 305 "justification": "The finding that CoT reasoning amplifies privacy risk in capable models — rather than improving safety — challenges the default assumption that better reasoning equals safer behavior." 306 }, 307 "fear_safety": { 308 "score": 3, 309 "justification": "Directly frames ALLM capabilities as concrete privacy threats with alarming accuracy numbers (92.89% gender, 61.19% income), linking to real-world risks of voice-based mass profiling." 310 }, 311 "drama_conflict": { 312 "score": 2, 313 "justification": "The stark contrast between GPT-4o-Audio's 100% ARR on Weight vs. open-source models' near-zero ARR creates a commercial-vs-open-source safety narrative." 314 }, 315 "demo_ability": { 316 "score": 1, 317 "justification": "Code is on GitHub but the dataset requires a formal research application for controlled access, preventing easy public demonstration." 318 }, 319 "brand_recognition": { 320 "score": 2, 321 "justification": "Evaluates GPT-4o-Audio, Gemini-2.5-Pro, and multiple Qwen models, leveraging high name recognition of major commercial AI systems." 322 } 323 }, 324 "hn_data": { 325 "threads": [], 326 "top_points": 0, 327 "total_points": 0, 328 "total_comments": 0 329 } 330 }