scan.json (30827B)
1 { 2 "paper": { 3 "title": "HearSay Benchmark: Do Audio LLMs Leak What They Hear?", 4 "authors": [ 5 "Jin Wang", 6 "Liang Lin", 7 "Kaiwen Luo", 8 "Weiliu Wang", 9 "Yitian Chen", 10 "Moayad Aloqaily", 11 "Xuehai Tang", 12 "Zhenhong Zhou", 13 "Kun Wang", 14 "Li Sun", 15 "Qingsong Wen" 16 ], 17 "year": 2026, 18 "venue": "arXiv", 19 "arxiv_id": "2601.03783", 20 "doi": "10.48550/arXiv.2601.03783" 21 }, 22 "scan_version": 2, 23 "active_modules": ["experimental_rigor", "data_leakage"], 24 "methodology_tags": ["benchmark-eval"], 25 "key_findings": "ALLMs inherently extract private attributes from voiceprints, achieving 92.89% average accuracy on gender across 13 models, while exhibiting near-zero refusal rates for physiological attributes. Chain-of-Thought prompting amplifies privacy risks in capable models (22.1% boost on accent inference for Qwen3-Omni-Flash) but degrades performance in weaker models. Capable models leverage genuine acoustic evidence to correct statistical biases, while weaker models hallucinate based on prior distributions. Lightweight prompt defense improves refusal rates overall but remains ineffective for deeply ingrained physiological inferences like gender.", 26 "checklist": { 27 "artifacts": { 28 "code_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "GitHub repository URL provided in abstract: https://github.com/JinWang79/HearSay_Benchmark. 'The codes and dataset are available at' this URL." 32 }, 33 "data_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "Dataset is available through controlled access at the GitHub URL, plus external public datasets (NISP, VocalSound, VoxCeleb2/Age-Vox-Celeb) are all publicly available. Appendix A.2 describes licensing terms." 37 }, 38 "environment_specified": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper mentions 'four H20 GPUs' for inference but provides no requirements.txt, Dockerfile, conda environment, or library version details." 42 }, 43 "reproduction_instructions": { 44 "applies": true, 45 "answer": false, 46 "justification": "No step-by-step reproduction instructions are provided in the paper. No README with commands or a 'Reproducing Results' section is described." 47 } 48 }, 49 "statistical_methodology": { 50 "confidence_intervals_or_error_bars": { 51 "applies": true, 52 "answer": false, 53 "justification": "Tables 1 and 2 report only point estimates (IAR%, ARR%) with no confidence intervals, error bars, or ± notation anywhere in the paper." 54 }, 55 "significance_tests": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper makes numerous comparative claims (e.g., 'surpasses random baselines by 46.4%', 'CoT boosts Accent inference accuracy by 22.1%') based solely on comparing numbers without any statistical tests (no p-values, t-tests, or bootstrap tests)." 59 }, 60 "effect_sizes_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "Percentage improvements are reported with baseline context throughout: Figure 3 shows 46.4% above random guessing baseline with both values visible, Table 1 shows absolute IAR/ARR values for all models, and CoT improvements are given with both before and after values (e.g., Accent 'surges by 22.1%')." 64 }, 65 "sample_size_justified": { 66 "applies": true, 67 "answer": false, 68 "justification": "The dataset comprises 22,064 audio clips but no justification is given for this sample size. No power analysis or discussion of whether this size is sufficient for the claims made." 69 }, 70 "variance_reported": { 71 "applies": true, 72 "answer": false, 73 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers." 74 } 75 }, 76 "evaluation_design": { 77 "baselines_included": { 78 "applies": true, 79 "answer": true, 80 "justification": "Random Guessing Baseline is included (Figure 3). Additionally, a Transcribed-Text control setting isolates semantic from acoustic features. 13 models are compared against each other." 81 }, 82 "baselines_contemporary": { 83 "applies": true, 84 "answer": true, 85 "justification": "Evaluated models include recent 2024-2025 releases: Qwen3-Omni-Flash, Gemini-2.5-Pro, GPT-4o-Audio, Step-Audio-2, Kimi-Audio. These represent the current state of the art in audio LLMs." 86 }, 87 "ablation_study": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple ablation-like comparisons: With-Audio vs Transcribed-Text (Section 4.2), direct answering vs CoT prompting (Section 4.3), base vs defense prompt (Section 4.5). Each isolates a specific factor." 91 }, 92 "multiple_metrics": { 93 "applies": true, 94 "answer": true, 95 "justification": "Three distinct metrics are defined and used: Inference Accuracy Rate (IAR), Answer Refusal Rate (ARR), and Blind Bias Rate (BBR), each measuring different aspects of privacy risk." 96 }, 97 "human_evaluation": { 98 "applies": true, 99 "answer": false, 100 "justification": "Evaluation of model outputs is entirely automated using GPT-4o-mini as judge (Section 4.1). Human verification was used only for dataset construction (Stage III), not for evaluating model outputs." 101 }, 102 "held_out_test_set": { 103 "applies": true, 104 "answer": true, 105 "justification": "HearSay is a newly constructed benchmark that the evaluated models were not fine-tuned on. The entire benchmark serves as a held-out test set since no model was trained or tuned on this data." 106 }, 107 "per_category_breakdown": { 108 "applies": true, 109 "answer": true, 110 "justification": "Table 1 provides per-attribute breakdowns across all 8 privacy dimensions for each of 13 models. Figure 4 gives per-attribute CoT comparisons. Table 2 shows per-attribute defense effectiveness." 111 }, 112 "failure_cases_discussed": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 4.4 discusses models that fail to use acoustic evidence (MERaLION, Kimi-Audio 'cluster tightly around the diagonal line'). Section 4.3 discusses CoT hurting models (Kimi-Audio Income drops 13.7%). Section 4.5 shows defense failure on Gender." 116 }, 117 "negative_results_reported": { 118 "applies": true, 119 "answer": true, 120 "justification": "CoT degrades performance for Kimi-Audio and Qwen2.5-Omni (Obs 3). Prompt defense is 'largely ineffective against deeply ingrained physiological inferences' with Gender ARR remaining near 0% (Obs 8). MERaLION shows 'limited responsiveness even to general defense prompts.'" 121 } 122 }, 123 "claims_and_evidence": { 124 "abstract_claims_supported": { 125 "applies": true, 126 "answer": true, 127 "justification": "Abstract claims are verified in results: 92.89% gender accuracy (Table 1 average), near-zero refusal rates (Table 1 ARR columns), CoT boosts accent by 22.1% (Figure 4 Qwen3-Omni-Flash panel). All specific numbers in the abstract match the experimental results." 128 }, 129 "causal_claims_justified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The main causal claim — 'CoT reasoning exacerbates privacy risks' — is supported by controlled comparison (same models, same data, with/without CoT). The Transcribed-Text control (Section 4.2) isolates voiceprint from semantic content through controlled manipulation. These are adequate single-variable manipulations." 133 }, 134 "generalization_bounded": { 135 "applies": true, 136 "answer": false, 137 "justification": "Title and abstract frame findings about 'Audio LLMs' broadly, but the benchmark is English-centric (acknowledged only in Limitations Section 6) and speakers are predominantly from academic/professional backgrounds. The paper generalizes from 13 specific models to 'ALLMs' throughout without adequately bounding this." 138 }, 139 "alternative_explanations_discussed": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.4 (RQ3) directly addresses the alternative explanation that models hallucinate based on statistical priors rather than using acoustic evidence, using BBR analysis and Figure 5. Section 4.2 addresses semantic leakage as an alternative explanation using the Transcribed-Text control." 143 }, 144 "proxy_outcome_distinction": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper measures IAR (accuracy of attribute inference from audio) and frames this as 'privacy leakage.' The measurement closely matches the claim — successfully inferring private attributes from voiceprints IS the privacy leak being studied. No significant proxy gap exists between measurement and framing." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": false, 154 "justification": "Some models have size specifications (e.g., 'Qwen2-Audio-7B-Instruct', 'Qwen2.5-Omni-7B') but others use marketing names without snapshot dates: 'GPT-4o-Audio', 'Gemini-2.5-Flash', 'Gemini-2.5-Pro'. No API version or snapshot date is provided for any closed-source model." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": true, 159 "justification": "Full prompt text for all experimental conditions is provided in Appendix C: Profiling ALLM prompt, Privacy Judge prompt, Transcribed-Text control prompt, Chain-of-Thought profiling prompt, and Defense prompt. Actual text, not just descriptions." 160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": false, 164 "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the 13 models or for the GPT-4o-mini judge. Only hardware is mentioned ('four H20 GPUs')." 165 }, 166 "scaffolding_described": { 167 "applies": false, 168 "answer": false, 169 "justification": "No agentic scaffolding is used. Models receive audio clips and prompts directly for inference." 170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 3.2 describes a four-stage data curation pipeline: (I) profiling with confidence tags, (II) audio extraction from lectures, (III) human review discarding 'inferred' attributes, (IV) supplementation with external datasets for dynamic traits. Each stage's purpose and filtering logic is described." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6 'Limitation' is a dedicated section with substantive discussion of three specific limitation areas." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 6 identifies specific threats: (1) defense exploration limited to textual prompt engineering, not audio-level defenses like encoder-stage de-identification, (2) dataset is predominantly English-centric limiting cross-lingual generalization, (3) evaluation limited to standard inference and CoT, excluding jailbreaking and adversarial audio perturbations." 187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 6 explicitly states what was NOT tested: audio-level defenses, multilingual corpora for cross-lingual privacy risks, and aggressive attack vectors including privacy-targeted jailbreaking and adversarial audio perturbations." 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": true, 198 "justification": "Dataset available through controlled access mechanism (Appendix A.2). External datasets (NISP, VocalSound, VoxCeleb2) are publicly available. 'HearSay is available exclusively through a controlled access mechanism, requiring researchers to submit a formal application.'" 199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 3.2 details the four-stage collection pipeline with time period consideration (Stage IV addresses time-sensitive attributes), instruments used (profiling agent with confidence tags), and inclusion/exclusion criteria (only 'verified' attributes retained, 'inferred' discarded)." 204 }, 205 "recruitment_methods_described": { 206 "applies": true, 207 "answer": false, 208 "justification": "The paper states speakers come from 'public lecture videos' and external datasets but does not describe how specific lectures or speakers were selected from the pool of available sources. No selection criteria for which public lectures were included, which could introduce demographic and socioeconomic bias toward academic/professional speakers." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": true, 213 "justification": "The four-stage pipeline (Section 3.2) documents each transformation: profiling → audio extraction → human verification → external supplementation. Section 3.3 reports final statistics (22,064 clips). Figure 2 shows distribution across attributes and categories." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": false, 220 "justification": "No funding information, acknowledgments section, or grant disclosures appear anywhere in the paper." 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Author affiliations are listed: XDU, NTU, NCEPU, BUPT, SHU, UAEU, UCAS-IIE, Squirrel AI. None of the authors are affiliated with the companies whose models are being evaluated (OpenAI, Google, etc.)." 226 }, 227 "funder_independent_of_outcome": { 228 "applies": true, 229 "answer": false, 230 "justification": "No funding is disclosed, making it impossible to assess funder independence. Absence of disclosure is not evidence of absence of conflict." 231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests or financial interests statement appears in the paper. One author is from 'Squirrel AI' (an AI company) but no financial interests declaration is made." 236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": true, 241 "answer": false, 242 "justification": "No training data cutoff dates are stated for any of the 13 evaluated models, despite benchmark audio clips coming from public sources that models may have encountered during training." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": true, 246 "answer": false, 247 "justification": "The benchmark uses audio from public lectures and public datasets (VoxCeleb2 published 2018, NISP 2021, VocalSound 2022). Models trained after these dates may have encountered this audio. No discussion of potential overlap." 248 }, 249 "benchmark_contamination_addressed": { 250 "applies": true, 251 "answer": false, 252 "justification": "HearSay uses audio from publicly available datasets and lectures that pre-date most evaluated models' training. No contamination analysis is performed despite the risk that models may have been exposed to these audio clips or related speaker information during pre-training." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in the evaluation. Speakers in audio clips are not study participants — they are subjects of public recordings used to construct a benchmark." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants. The paper includes an ethical statement (Section 7) addressing data ethics and dual-use risks but no IRB approval is needed or mentioned." 265 }, 266 "demographics_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants. Speaker demographics are the attributes being predicted, not participant demographics." 270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the study. Data inclusion criteria for the benchmark are covered under data_integrity." 275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants and no experimental randomization required for benchmark evaluation." 280 }, 281 "blinding_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants. Blinding is not applicable to automated benchmark evaluation." 285 }, 286 "attrition_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants. No attrition concept applies to benchmark evaluation." 290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No inference costs, API costs, tokens consumed, or per-example latency reported despite evaluating 13 models across 22,064 clips × 8 attributes (over 175,000 inference calls minimum)." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "Only 'four H20 GPUs' is mentioned for open-source model inference. No total GPU hours, API spend, or training time is stated. Closed-source API costs are not reported." 302 } 303 }, 304 "experimental_rigor": { 305 "seed_sensitivity_reported": { 306 "applies": true, 307 "answer": false, 308 "justification": "No multi-seed results reported. All results appear to be from single runs without any seed sensitivity analysis." 309 }, 310 "number_of_runs_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "The number of experimental runs is not stated for the main evaluation (Table 1). BBR uses N independent trials (Equation 6) but the specific N value is not reported, and the main IAR/ARR results don't specify run count." 314 }, 315 "hyperparameter_search_budget": { 316 "applies": true, 317 "answer": false, 318 "justification": "No hyperparameter search budget reported. Sampling parameters (temperature, top-p) are not even disclosed, let alone any search over them." 319 }, 320 "best_config_selection_justified": { 321 "applies": true, 322 "answer": false, 323 "justification": "No description of how configuration choices were made for each model. Default settings appear to be used but this is not stated or justified." 324 }, 325 "multiple_comparison_correction": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper makes many comparisons across 13 models × 8 attributes but performs no statistical tests at all, let alone corrections for multiple comparisons." 329 }, 330 "self_comparison_bias_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "The authors created the HearSay benchmark and evaluate models on it. No acknowledgment of potential bias in benchmark design favoring their conclusions. No independent evaluation or discussion of author-evaluation bias." 334 }, 335 "compute_budget_vs_performance": { 336 "applies": true, 337 "answer": false, 338 "justification": "Models of vastly different sizes and compute requirements are compared (7B open-source vs large closed-source APIs) without any discussion of compute budget differences or matched-compute comparisons." 339 }, 340 "benchmark_construct_validity": { 341 "applies": true, 342 "answer": true, 343 "justification": "The paper includes two construct validity checks: (1) BBR analysis (Section 4.4) verifying models use acoustic evidence rather than statistical priors, and (2) Transcribed-Text control (Section 4.2) confirming privacy leakage stems from voiceprints rather than semantic content. Both directly validate that the benchmark measures what it claims." 344 }, 345 "scaffold_confound_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "No scaffolding is involved. Models receive audio clips directly for inference." 349 } 350 }, 351 "data_leakage": { 352 "temporal_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "Benchmark audio comes from public sources including VoxCeleb2 (2018), NISP (2021), and VocalSound (2022), all predating the evaluated models' training. No discussion of whether models encountered this audio during pre-training." 356 }, 357 "feature_leakage_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "The profiling prompt explicitly instructs models to 'analyze a voice based ONLY on its physical and paralinguistic qualities' and specifies the target dimension. No discussion of whether the prompt itself cues models toward certain responses or leaks information about expected answers." 361 }, 362 "non_independence_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "Multiple audio clips may come from the same speaker (22,064 clips from an unspecified number of unique speakers, though 'broad spectrum of unique vocal identities' is mentioned). Non-independence between clips from the same speaker is not addressed." 366 }, 367 "leakage_detection_method": { 368 "applies": true, 369 "answer": false, 370 "justification": "No concrete leakage detection method is applied. No canary strings, membership inference tests, or decontamination pipelines are used to verify the benchmark data wasn't in model training sets." 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "ALLMs inherently extract private attributes from voiceprints, with audio-based inference surpassing random baselines by 46.4% on average and achieving 92.89% accuracy on gender.", 377 "evidence": "Table 1 shows average Gender IAR of 92.89% across open-source models. Figure 3 shows the 46.4% gap between With-Audio and Random Guessing Baseline across 8 attributes. Results from 13 models across 8 attributes.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Current ALLMs exhibit near-zero refusal rates for privacy-intruding requests on physiological attributes, with only GPT-4o-Audio demonstrating appropriate defensive behavior.", 382 "evidence": "Table 1 shows most open-source models have 0.00% ARR for Gender and other physiological attributes. GPT-4o-Audio achieves 100% ARR on Weight. Average ARR across open-source models is below 1% for Gender.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Chain-of-Thought reasoning amplifies privacy risks in capable models, boosting Accent inference accuracy by 22.1% for Qwen3-Omni-Flash.", 387 "evidence": "Figure 4 shows CoT boosts Qwen3-Omni-Flash on Accent by 22.1%, Education by 7.0%, and Income by 10.8%. However, the effect is mixed — Kimi-Audio's Income drops 13.7% and Qwen2.5-Omni's Weight drops 23.5% with CoT.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Capable models leverage acoustic evidence to correct inherent statistical biases, rather than hallucinating based on prior distributions.", 392 "evidence": "Section 4.4 and Figure 5 show high-performing models (Qwen3-Omni-Flash, MiniCPM-o-2.6) fall in the 'Correction Zone' while weaker models (MERaLION, Kimi-Audio) cluster near the diagonal. BBR analysis across N trials with empty audio input.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Lightweight prompt defense significantly boosts refusal rates for responsive models but fails for physiological attributes, especially gender.", 397 "evidence": "Table 2 shows MiniCPM-o-2.6 overall ARR rises from near-zero to 61.42% with defense prompt. However, Gender ARR remains near 0% across all models even with defense. Average improvement varies from +0.01% (Gender) to +72.2% (Income).", 398 "supported": "moderate" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "No error bars or uncertainty quantification", 404 "detail": "All results across Tables 1, 2 and Figures 3, 4 are single-run point estimates with no error bars, confidence intervals, variance, or any measure of uncertainty. With stochastic model outputs, results could vary substantially across runs." 405 }, 406 { 407 "flag": "Unvalidated LLM-as-judge methodology", 408 "detail": "GPT-4o-mini is used as the automated judge for determining inference accuracy and refusal behavior, but no validation of judge agreement with human judgment is provided. The accuracy of the automated evaluation is assumed without verification." 409 }, 410 { 411 "flag": "No statistical tests for comparative claims", 412 "detail": "Numerous comparative claims are made (model A outperforms model B, CoT improves/degrades performance, defense boosts refusal rates) based solely on comparing point estimates without any statistical tests to determine if differences are significant." 413 }, 414 { 415 "flag": "Ecological validity concerns", 416 "detail": "The profiling prompts explicitly instruct models as 'Forensic Vocal Profilers' to analyze voice for private attributes. This differs substantially from natural user interactions and may overestimate real-world privacy risk. The attack model assumes adversarial prompting, which is acknowledged but the gap between benchmark and real-world privacy risk is not quantified." 417 }, 418 { 419 "flag": "Speaker demographic bias in benchmark construction", 420 "detail": "Speakers are primarily drawn from public lecture videos, which overrepresents academic/professional speakers. The benchmark claims to be 'comprehensive' but this selection process creates demographic skew that could affect generalizability of privacy risk findings." 421 }, 422 { 423 "flag": "Contamination risk unaddressed", 424 "detail": "Audio from public datasets (VoxCeleb2, NISP, VocalSound) and public lectures likely appears in training data of large multimodal models. Models may recognize specific speakers or audio clips rather than performing genuine acoustic inference. This is completely unaddressed." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "Beyond memorization: Violating privacy via inference with large language models", 430 "authors": ["Robin Staab", "Mark Vero", "Mislav Balunović", "Martin T. Vechev"], 431 "year": 2023, 432 "arxiv_id": "2310.07298", 433 "relevance": "Foundational work on LLM privacy inference showing models can infer personal attributes from unstructured text, directly motivating HearSay's extension to audio." 434 }, 435 { 436 "title": "Extracting training data from large language models", 437 "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace"], 438 "year": 2021, 439 "relevance": "Seminal work on LLM memorization and privacy risks from training data extraction, establishing the broader privacy threat model for language models." 440 }, 441 { 442 "title": "GPT-4 technical report", 443 "authors": ["Josh Achiam"], 444 "year": 2023, 445 "arxiv_id": "2303.08774", 446 "relevance": "Technical report for GPT-4, one of the foundation models whose audio variant (GPT-4o-Audio) is evaluated in this benchmark." 447 }, 448 { 449 "title": "AudioTrust: Benchmarking the Multifaceted Trustworthiness of Audio Large Language Models", 450 "authors": ["Kai Li", "Can Shen", "Yile Liu"], 451 "year": 2025, 452 "arxiv_id": "2505.16211", 453 "relevance": "Concurrent work on trustworthiness benchmarking for audio LLMs, establishing the broader safety evaluation landscape for ALLMs." 454 }, 455 { 456 "title": "A Comprehensive Survey in LLM (-Agent) Full Stack Safety: Data, Training and Deployment", 457 "authors": ["Kun Wang", "Guibin Zhang", "Zhenhong Zhou"], 458 "year": 2025, 459 "arxiv_id": "2504.15585", 460 "relevance": "Comprehensive survey of LLM and agent safety covering data, training, and deployment risks relevant to understanding privacy implications." 461 }, 462 { 463 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 464 "authors": ["Lianmin Zheng", "Wei-Lin Chiang"], 465 "year": 2023, 466 "relevance": "Establishes the LLM-as-judge evaluation paradigm used by HearSay for automated assessment of model inference accuracy and refusal behavior." 467 }, 468 { 469 "title": "Do-not-answer: A dataset for evaluating safeguards in LLMs", 470 "authors": ["Yuxia Wang", "Haonan Li", "Xudong Han", "Preslav Nakov", "Timothy Baldwin"], 471 "year": 2023, 472 "arxiv_id": "2308.13387", 473 "relevance": "Dataset for evaluating LLM safety refusal capabilities, directly related to HearSay's Answer Refusal Rate metric." 474 }, 475 { 476 "title": "A survey on code generation with LLM-based agents", 477 "authors": ["Yihong Dong", "Xue Jiang"], 478 "year": 2025, 479 "arxiv_id": "2508.00083", 480 "relevance": "Survey on LLM-based agent capabilities in code generation, part of the broader LLM capability assessment landscape." 481 }, 482 { 483 "title": "Gemini: a family of highly capable multimodal models", 484 "authors": ["Gemini Team"], 485 "year": 2023, 486 "arxiv_id": "2312.11805", 487 "relevance": "Technical report for the Gemini model family, whose 2.5 variants (Flash-Lite, Flash, Pro) are evaluated in the HearSay benchmark." 488 }, 489 { 490 "title": "Calibrate before use: Improving few-shot performance of language models", 491 "authors": ["Zihao Zhao", "Eric Wallace", "Shi Feng", "Dan Klein", "Sameer Singh"], 492 "year": 2021, 493 "relevance": "Content-free calibration methodology that inspired HearSay's Blind Bias Rate metric for distinguishing genuine inference from statistical priors." 494 }, 495 { 496 "title": "Towards understanding chain-of-thought prompting: An empirical study of what matters", 497 "authors": ["Boshi Wang", "Sewon Min", "Xiang Deng"], 498 "year": 2023, 499 "relevance": "Empirical analysis of CoT prompting mechanisms, relevant to HearSay's finding that CoT amplifies privacy risks in capable models." 500 } 501 ] 502 }