scan.json (26262B)
1 { 2 "paper": { 3 "title": "SocialVeil: Probing Social Intelligence of Language Agents under Communication Barriers", 4 "authors": ["Keyang Xuan", "Pengda Wang", "Chongrui Ye", "Haofei Yu", "Tal August", "Jiaxuan You"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.05115" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Communication barriers consistently impair LLM agents' social intelligence, with mutual understanding reduced by over 45% and confusion elevated by nearly 50%. Semantic vagueness most severely disrupts mutual understanding (avg -58%), emotional interference disproportionately damages relationship quality (avg -49%), and sociocultural mismatch induces persistent confusion (avg -49%). Adaptation strategies (repair instruction and interactive learning) yield only modest improvements, with performance remaining far below barrier-free baselines.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper states 'Our code and data are available at https://github.com/ulab-uiuc/social-veil' in a footnote on page 1." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states code and data are available at the GitHub link. Episodes are adapted from SOTOPIA, which is publicly available." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Training details mention '4 × A6000 80G GPUs' and QLoRA parameters (Appendix D.3), but no requirements.txt, Dockerfile, or dependency specifications are provided in the paper." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-style instructions for replicating experiments." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": true, 40 "justification": "95% confidence intervals are reported throughout: barrier identification accuracy (Table 4, Figure 6), ICC values, Pearson correlations, and bootstrap CIs for human evaluation metrics." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "Statistical significance is reported: ICC F-tests with p < .001, Pearson correlations with significance levels (*p < .05, **p < .01, ***p < .001) in Figure 4, and bootstrap resampling for stability estimates." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Effect sizes are reported as percentage changes from baseline: 'mutual understanding reduced by over 45%', 'avg −58%' for semantic vagueness on mutual understanding. Figure 5 shows percentage deviations with significance markers." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is given for why 180 episodes per barrier type were chosen, nor is a power analysis discussed. The 720 total scenarios and 120 human evaluation samples are stated without justification." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Table 2 reports standard deviations (subscript notation) for all metrics across all models and conditions. For example, 'GPT-4o-m Base BEL 8.78_{.11}' indicates mean 8.78 with std 0.11." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "A barrier-free baseline condition is included for all experiments, allowing direct comparison of performance with and without communication barriers (Table 2)." 
68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The evaluated models include contemporary LLMs: GPT-4o-mini, Qwen2.5-7B-Instruct, Qwen3-4B-Instruct, and Mistral-8B-Instruct. The framework builds on SOTOPIA (2023), which is the current standard." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper tests three distinct barrier types independently and a barrier-free baseline, effectively ablating the barrier dimension. Table 3 ablates adaptation strategies (Base vs Repair Instruction vs BC+SR)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Six evaluation metrics are used: Believability, Relationship Quality, Knowledge, Goal Completion, Unresolved Confusion, and Mutual Understanding (Table 2)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Six human annotators evaluated 120 scenarios across barrier identification accuracy, inter-rater reliability (Fleiss's Kappa, ICC), and metric alignment with automated scores (Section 5.2, Table 4)." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "No explicit separation of dev and test sets is described. The 180 episodes per condition appear to be used both for development and reporting. Interactive learning uses filtered trajectories from the same scenario pool." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by barrier type (semantic, sociocultural, emotional), by model, and by difficulty split (All vs Hard) in Table 2. Figure 5 shows per-barrier-type deviations." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 4.3 discusses the failure of repair instruction strategies, and Section 5.1 analyzes behavioral patterns of communication breakdown. 
The paper discusses when and why agents fail to adapt." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that repair instruction yields 'trivial performance improvements' and that neither adaptation strategy restores baseline performance (Section 4.3, Table 3)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims of '45% reduction in mutual understanding' and '50% elevation in confusion' are supported by Table 2 data. Human evaluation ICC≈0.78 and Pearson r≈0.80 are confirmed in Section 5.2." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper's causal claims (barriers cause performance degradation) are supported by controlled experimental design: the barrier is the sole manipulated variable between conditions, with partner agents held constant." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper bounds claims to the tested models and the SOTOPIA scenario set. Appendix B explicitly discusses limitations: text-only interactions, short-term scenarios, and the gap to multimodal communication." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper validates barriers are not artifacts of specific models (Appendix F.1 tests alternative evaluator backbone, F.2 tests alternative barrier backbone). Section 5.1 provides linguistic feature analysis as independent validation." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper explicitly introduces barrier-aware metrics (Unresolved Confusion, Mutual Understanding) alongside goal-oriented metrics, distinguishing between task success and social competence. 
The evaluation protocol (Appendix G) precisely defines what each metric measures." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Appendix D.2 lists HuggingFace model IDs for open-source models (e.g., 'Qwen/Qwen2.5-7B-Instruct', 'mistralai/Mistral-8B-Instruct') but GPT-4o-mini and GPT-4o lack snapshot dates or API versions." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt text is provided in Appendix H: agent prompt (H.2), barrier evaluation prompt (H.3), social goal evaluation prompt (H.4), and neutralization prompt (H.1)." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Temperature 0.7 for agents, 0 for evaluator. Training details: learning rate 5.0e-5, batch size 4, 20 epochs, cutoff length 4096, QLoRA rank 8, alpha 16, dropout 0.05 (Appendix D.3)." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The agent scaffolding is described in detail: two-layer barrier design with style prompts and parameterization, episode structure with private goals and role profiles, turn-based dialogue with 20-turn cap (Sections 2.2-2.3)." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The scenario neutralization process is described: GPT-4o rewrites SOTOPIA scenarios to remove goal-related hints (Section 2.3), with the exact neutralization prompt provided (Appendix H.1)." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Appendix B provides a dedicated limitations and future directions section discussing text-only focus, short-term interaction limitation, and episode-level evaluation constraints." 
169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Appendix B discusses specific limitations: text-only modality misses prosody/facial expressions, scenarios are discrete/short-term rather than longitudinal, and the evaluation protocol operates at episode level rather than capturing proactive barrier mitigation." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper explicitly states scope boundaries: focuses on cognitive-factor-induced barriers (not physical), text-only interactions, and three specific barrier types. Appendix B states what the framework does NOT cover." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper states code and data are available at the GitHub repository, which would include the generated dialogues and evaluation results." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Episode generation is described in detail: scenarios adapted from SOTOPIA, neutralized using GPT-4o, barrier injection via style prompts and parameterization, with 180 episodes per condition (Sections 2.2-2.3)." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "For human evaluation: 'Six human annotators were recruited from two universities and received research credit, identified as 50% women and 50% men' (Appendix E). However, the paper does not discuss whether this recruitment introduces selection bias." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: SOTOPIA scenarios → neutralization → barrier injection → multi-turn dialogue simulation → automated evaluation + human evaluation. Interactive learning pipeline is also documented (behavior cloning → self-reinforcement)." 
201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding source or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: UIUC (Siebel School of Computing and Data Science) and Rice University (Department of Psychological Sciences)." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial disclosure statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state training data cutoff dates for any of the evaluated models (GPT-4o-mini, Qwen2.5-7B, Qwen3-4B, Mistral-8B). This matters because SOTOPIA scenarios may be in training data." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether SOTOPIA scenarios (published 2023) may have been seen during training of the evaluated models. Since SOTOPIA is public and all models were trained after 2023, this is a relevant concern." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "SOTOPIA scenarios were published in 2023 and are publicly available. All evaluated models were likely trained after this date. No contamination analysis is performed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": false, 246 "justification": "The human evaluation study involving 6 annotators rating 120 scenarios is not pre-registered. 
No link to a pre-registration is provided." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": false, 251 "justification": "The Ethics Statement discusses ethical guidelines but does not mention IRB or ethics board approval for the human evaluation study." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": false, 256 "justification": "Only gender distribution is reported (50% women, 50% men) and institutional affiliation (two universities). No information on age, experience level, cultural background, or language proficiency — all relevant for evaluating communication barriers." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": false, 261 "justification": "No inclusion or exclusion criteria for annotators are stated beyond being from two universities." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "The human evaluation is an annotation task, not an experimental study with treatment conditions requiring randomization." 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": false, 271 "justification": "It is not stated whether annotators knew the barrier conditions when rating metrics. The annotation UI (Figure 7) shows barrier definitions alongside the dialogue, which may bias ratings." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": false, 276 "justification": "No information on whether all 6 annotators completed all assigned scenarios or whether any dropped out." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference costs, API costs, or latency figures are reported despite running 720 scenarios across multiple models plus GPT-4o evaluation." 
284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Training used '4 × A6000 80G GPUs, across 20 epochs' (Appendix D.3). However, total GPU hours and API costs for the simulation and evaluation phases are not stated." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No mention of multiple random seeds for the simulation experiments. Temperature 0.7 introduces stochasticity but seed sensitivity is not analyzed." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "It is not stated whether results in Table 2 are from single runs or averaged across multiple runs. Standard deviations are shown but their source (across scenarios vs across runs) is ambiguous." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is reported for the training experiments (BC+SR). The learning rate, batch size, and QLoRA parameters are stated but no search process is described." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "No justification for how the reported configuration was selected. Training hyperparameters appear chosen without documenting the selection process." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "Multiple significance tests are performed across models, barrier types, and metrics (Figures 4-5) without mention of correction for multiple comparisons." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors evaluate their own SocialVeil framework and barrier implementations without discussing potential bias in their own evaluation design." 
321 }, 322 "compute_budget_vs_performance": { 323 "applies": false, 324 "answer": false, 325 "justification": "Compute differences between conditions are negligible — all use the same models and same number of dialogue turns." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "Section 4.1 validates barrier construct validity through t-SNE visualization of model representations, showing barriers form distinct clusters. Section 5.1-5.2 validates through linguistic signatures and human evaluation alignment." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "The evaluation framework (SocialVeil) IS the thing being tested. The same scaffolding is used across all model comparisons." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "SOTOPIA scenarios (2023) predate all evaluated models' training. No temporal leakage analysis is performed." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether evaluation setup leaks information. The barrier agent's style prompts could provide hints about expected partner behavior." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "All 180 episodes per condition are adapted from SOTOPIA scenarios. No discussion of potential dependencies between scenarios or overlap with training data." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No concrete leakage detection or prevention methods are applied." 
358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "Communication barriers consistently impair LLM agents' social intelligence, with mutual understanding reduced by over 45% and confusion elevated by nearly 50%.", 364 "evidence": "Table 2 shows consistent performance degradation across all four models and six metrics under all three barrier types compared to barrier-free baseline.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Each barrier type produces a characteristic degradation pattern: semantic vagueness impairs mutual understanding most (avg -58%), emotional interference damages relationships most (avg -49%), and sociocultural mismatch induces persistent confusion (avg -49%).", 369 "evidence": "Table 2 per-barrier results and Figure 5 showing per-barrier-type percentage deviations with statistical significance markers.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Human evaluations validate the fidelity of simulated barriers with ICC≈0.78, Pearson r≈0.80, and barrier identification accuracy of 68%.", 374 "evidence": "Section 5.2: ICC(1,k) = 0.77 for confusion, 0.79 for mutual understanding. Pearson r = 0.80 for confusion, 0.79 for mutual understanding. 
Bootstrap accuracy 0.68 [0.63, 0.73].", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Repair instruction yields trivial performance improvements while interactive learning (BC+SR) offers consistent but modest gains, with neither restoring baseline performance.", 379 "evidence": "Table 3 shows repair instruction gains are negligible (e.g., GOAL 5.99→6.07 for semantic barrier) and BC+SR gains are modest (avg 10-20% improvement).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Social reasoning is more fragile than goal pursuit, with relationship quality dropping by 45% and mutual understanding by 52% versus 20-30% for goal completion.", 384 "evidence": "Section 4.2 finding 3, derived from Table 2 comparing degradation magnitudes across metric categories.", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "No contamination analysis", 391 "detail": "SOTOPIA scenarios have been publicly available since 2023 and all evaluated models were trained after this date. The paper does not analyze whether models have seen these scenarios during training, which could inflate baseline performance." 392 }, 393 { 394 "flag": "Same model family (GPT-4o-mini / GPT-4o) as both barrier agent backbone and evaluator", 395 "detail": "GPT-4o-mini serves as the barrier agent and GPT-4o serves as the evaluator. Using models from the same family for both generation and evaluation creates potential circularity in the evaluation pipeline." 396 }, 397 { 398 "flag": "No multiple comparison correction", 399 "detail": "The paper performs many statistical tests across 4 models × 3 barrier types × 6 metrics and reports significance without any correction for multiple comparisons." 400 }, 401 { 402 "flag": "Limited annotator demographics", 403 "detail": "Only 6 annotators from two universities, with minimal demographic reporting. For a study about sociocultural communication barriers, annotator cultural background and language proficiency are highly relevant but unreported." 
404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "SOTOPIA: Interactive Evaluation for Social Intelligence in Language Agents", 409 "authors": ["Xuhui Zhou", "Hao Zhu", "Leena Mathur"], 410 "year": 2023, 411 "arxiv_id": "2310.11667", 412 "relevance": "Core benchmark from which SocialVeil adapts scenarios; key prior work on interactive evaluation of LLM social intelligence." 413 }, 414 { 415 "title": "SOTOPIA-π: Interactive Learning of Socially Intelligent Language Agents", 416 "authors": ["Ruiyi Wang", "Haofei Yu", "Wenxin Zhang"], 417 "year": 2024, 418 "arxiv_id": "2403.08715", 419 "relevance": "Interactive learning framework adapted for SocialVeil's BC+SR adaptation strategy." 420 }, 421 { 422 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 423 "authors": ["Joon Sung Park", "Joseph O'Brien"], 424 "year": 2023, 425 "relevance": "Foundational work on LLM-based social simulation and agent behavior." 426 }, 427 { 428 "title": "SocialBench: Sociality Evaluation of Role-Playing Conversational Agents", 429 "authors": ["Hongzhan Chen"], 430 "year": 2024, 431 "arxiv_id": "2403.13679", 432 "relevance": "Related benchmark for evaluating social intelligence in conversational AI agents." 433 }, 434 { 435 "title": "AgentSense: Benchmarking Social Intelligence of Language Agents through Interactive Scenarios", 436 "authors": ["Xinyi Mou", "Jingcong Liang"], 437 "year": 2024, 438 "arxiv_id": "2410.19346", 439 "relevance": "Independent interaction benchmark used for cross-benchmark transfer validation of SocialVeil." 440 }, 441 { 442 "title": "Why do Multi-Agent LLM Systems Fail?", 443 "authors": ["Mert Cemri"], 444 "year": 2025, 445 "arxiv_id": "2503.13657", 446 "relevance": "Empirical analysis of multi-agent system failures, including communicative misalignment relevant to the survey scope." 
447 }, 448 { 449 "title": "Communicative Agents for Software Development", 450 "authors": ["Chen Qian", "Xin Cong"], 451 "year": 2023, 452 "arxiv_id": "2307.07924", 453 "relevance": "Multi-agent collaborative software development using LLMs — relevant to agentic workflow evaluation." 454 }, 455 { 456 "title": "Using Large Language Models to Simulate Multiple Humans and Replicate Human Subject Studies", 457 "authors": ["Gati V Aher", "Rosa I Arriaga", "Adam Tauman Kalai"], 458 "year": 2023, 459 "relevance": "Methodological work on using LLMs to simulate human subjects, relevant to agent evaluation methodology." 460 }, 461 { 462 "title": "QLoRA: Efficient Finetuning of Quantized LLMs", 463 "authors": ["Tim Dettmers", "Artidoro Pagnoni"], 464 "year": 2023, 465 "relevance": "Training method used for the interactive learning adaptation strategy in SocialVeil." 466 } 467 ] 468 }