scan.json (28556B)
1 { 2 "paper": { 3 "title": "AutoToM: Scaling Model-based Mental Inference via Automated Agent Modeling", 4 "authors": [ 5 "Zhining Zhang", 6 "Chuanyang Jin", 7 "Mung Yao Jia", 8 "Shunchi Zhang", 9 "Tianmin Shu" 10 ], 11 "year": 2025, 12 "venue": "NeurIPS 2025 (Spotlight)", 13 "arxiv_id": "2502.15676" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper states 'Link: Project Page | Code' directly below the author list, indicating code is released. A project page is also referenced." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper evaluates on publicly available benchmarks (ToMi, BigToM, MMToM-QA, MuMA-ToM, Hi-ToM), all of which are publicly accessible datasets. The cognitive study scenarios are adapted from published works (Baker et al. 2009, 2017). The O-WAH benchmark is also public." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or detailed library versions. The paper specifies LLM API model versions but not the software environment for running the code." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "No step-by-step reproduction instructions, README with commands, or 'Reproducing Results' section are provided in the paper. The paper provides the algorithm (Algorithm 1) and hyperparameters but not a concrete reproduction guide." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": true, 42 "justification": "Error bars (standard errors) are shown in Figure 7 for the embodied assistance experiment. Additionally, Section 4.1 reports 'a mean accuracy of 82.56% with a standard error of 0.45%' for the statistical reliability analysis on MMToM-QA." 43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": true, 47 "justification": "Table 3 reports Pearson correlation coefficients with p-values (e.g., '0.93**' with '**: p ≤ .001'), testing significance of correlations between model and human judgments. However, no significance tests are reported for the main benchmark accuracy comparisons in Table 1." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": true, 52 "justification": "The paper reports absolute accuracy scores for all methods across all benchmarks (Table 1), enabling readers to compute effect sizes. For example, AutoToM achieves 82.43% overall vs. GPT-4o's 63.39%, a 19.04 percentage point improvement. The embodied assistance experiment reports speedup percentages with baselines (27.7% vs 6.8% for GPT-4o)." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "No justification is given for the sample sizes used. For example, the Hi-ToM evaluation uses a subset of 200 questions chosen 'due to the high cost of testing the full dataset' and the FANToM evaluation uses a 'subset of 200 false-belief first-order questions with short contexts due to budget constraints,' but no power analysis or formal justification is provided." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Section 4.1 under 'Statistical Reliability' reports: 'Across three different random seeds, AutoToM achieved a mean accuracy of 82.56% with a standard error of 0.45%' and 'o3-mini-high achieved a mean accuracy of 65.94% with a standard error of 0.59%.' Error bars showing standard errors are also in Figure 7. However, variance is only reported for MMToM-QA and the embodied task, not all five benchmarks." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "Extensive baselines are included: LLMs (Llama 3.1 70B, GPT-4o, Gemini 2.0 Flash, Gemini 2.0 Pro), ToM prompting methods (SymbolicToM, SimToM), large reasoning models (DeepSeek-R1, Gemini 2.0 Flash Thinking, o3-mini-high), and model-based inference methods (BIP-ALM, LIMP). See Table 1." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "Baselines include state-of-the-art models as of early 2025: o3-mini-high, DeepSeek-R1, Gemini 2.0 Flash Thinking, Gemini 2.0 Pro. These are recent and competitive. Section A3.7 lists specific model versions (e.g., 'gpt-4o-2024-08-06', 'o3-mini-2025-01-31')." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": true, 79 "justification": "Section 4.1 presents an ablation study evaluating five variants: no hypothesis reduction, always POMDP, no variable adjustment, last timestep only, and all timesteps. Results in Figure 6 and Tables A8-A10 show both accuracy and compute tradeoffs." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": true, 84 "justification": "The paper uses accuracy across five benchmarks as the primary metric, Pearson correlation with human judgments for cognitive studies (Table 3), speedup percentage for embodied assistance (Figure 7), Macro-F1 for OpenToM (Section A3.3.2), and computational efficiency metrics (tokens consumed, inference time, API calls in Tables A2, A9, A10)." 85 }, 86 "human_evaluation": { 87 "applies": true, 88 "answer": true, 89 "justification": "Experiment 2 (Section 4.2) compares AutoToM's outputs against human judgments from published cognitive studies (Baker et al. 2009, 2017), measuring Pearson correlation between model and human confidence estimates. This is an evaluation against human ground truth rather than human evaluation of outputs per se, but the benchmarks themselves include human-created test sets." 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "All five benchmarks (ToMi, BigToM, MMToM-QA, MuMA-ToM, Hi-ToM) are established test sets. The method does not involve training or tuning on these benchmarks—it uses a fixed algorithm with the same prompts across all settings. No dev/test split issue arises since this is a zero-shot prompting approach." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "Per-question-type breakdowns are provided for all benchmarks in Tables A3-A7 (e.g., first/second order for ToMi, forward/backward true/false belief for BigToM, belief/goal for MMToM-QA). Figure 4 provides breakdowns by question type, context length, number of agents, and recursion level." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "The Limitations section discusses specific failure modes: 'model adjustments may sometimes fail to recognize the relevance of certain mental variables, resulting in an insufficient model.' The qualitative examples (Section A3.6) show how the model can initially fail and then self-correct. The paper also discusses weaker performance on Hi-ToM higher orders." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The ablation study (Figure 6, Table A8) shows variants that hurt performance. For example, 'w/ last timestep' drops overall accuracy from 82.43% to 69.11%. On Hi-ToM, AutoToM (72.50%) does not outperform o3-mini-high (75.00%). The FANToM results show imperfect 72.7% accuracy." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims AutoToM 'outperforms existing ToM methods and even large reasoning models.' Table 1 shows AutoToM achieves 82.43% overall vs. 74.18% for the best reasoning model (Gemini 2.0 Flash Thinking). The abstract's claims about 'human-like confidence estimates' are supported by Table 3, and 'online mental inference for embodied decision-making' is supported by Figure 7." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper makes causal claims through ablation studies (e.g., removing hypothesis reduction, variable adjustment, timestep adjustment degrades performance). These are controlled single-variable manipulations. The claim that AutoToM's framework 'enables' better performance is supported by systematic comparisons. The ablation design is adequate for these component-level causal claims." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper tests on five diverse benchmarks spanning different contexts, modalities, and reasoning depths. The title 'Scaling Model-based Mental Inference' is supported by evaluations across these diverse settings. The Limitations section explicitly bounds generalization: 'AutoToM currently requires a separate process to first fuse information from different modalities into text before inference.' The scope is clearly Theory of Mind, not general reasoning." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper does not substantively discuss alternative explanations for the results. It does not consider whether the improvements could be due to increased token budget (more LLM calls), prompt engineering advantages, or whether the specific benchmarks favor structured approaches. The Limitations section discusses practical limitations but not alternative explanations for observed performance gains." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": true, 138 "justification": "Section A3.7 specifies exact model versions: 'gpt-4o-2024-08-06 for GPT-4o', 'meta-llama/Llama-3.1-70B-Instruct', 'gemini-2.0-flash', 'gemini-2.0-pro-exp-02-05', 'gemini-2.0-flash-thinking-exp-01-21', 'o3-mini-2025-01-31', and 'deepseek-r1'. These include snapshot dates or specific version identifiers." 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": true, 143 "justification": "Appendix A6 provides the full prompts used in AutoToM, including prompts for information extraction (A6.1), hypothesis sampling (A6.2), and likelihood estimation (A6.3). These are the actual prompt texts with placeholders clearly indicated." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper reports algorithm-level hyperparameters (α = 0.02, Umin = −0.693 in Section A1.3, K = 20 particles and Pmin = 10% in Algorithm 2) but does not report LLM API hyperparameters such as temperature, top-p, or max tokens settings used when calling GPT-4o or other models." 149 }, 150 "scaffolding_described": { 151 "applies": true, 152 "answer": true, 153 "justification": "The agentic scaffolding is described in detail: Algorithm 1 provides the full AutoToM algorithm, Section 3.3 describes automated Bayesian inverse planning, Section 3.4 describes automated agent model discovery, and the appendix provides implementation details including hypothesis sampling, reduction, and likelihood estimation procedures." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "For multimodal benchmarks, the paper describes the fusion methods used (Section 4.1: 'we adopt the information fusion methods proposed by Jin et al. [20] and Shi et al. [39]'). For cognitive studies, Section A4.2 describes scenario selection, stimuli translation principles, and the rationality assumption. For Hi-ToM, the subset selection is described ('we choose the length 1 subset consisting of 200 questions')." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "A dedicated 'Limitations and Future Work' section appears at the end of Section 5 (Conclusion), discussing two specific limitations: the requirement for separate multimodal fusion and occasional failures in model adjustment." 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": false, 170 "justification": "The limitations discussed are specific ('AutoToM currently requires a separate process to first fuse information from different modalities' and 'model adjustments may sometimes fail to recognize the relevance of certain mental variables'), but there is no threats-to-validity discussion addressing issues such as sensitivity to LLM backend quality, benchmark-specific biases, or the reliability of LLM-estimated probabilities. The budget-constrained subset evaluations (FANToM, Hi-ToM) also lack discussion of whether the subsets are representative." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": false, 175 "justification": "The paper does not explicitly state what the results do NOT show. For example, it does not clarify that results are limited to text-based or text-fused ToM scenarios, that the approach has only been tested on controlled benchmark settings rather than real human interaction, or that the cognitive study correlations are based on small scenario samples." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": false, 182 "justification": "No raw experimental data (e.g., per-question predictions, model outputs) are released. Only aggregate results are reported in tables. The code release may include data, but the paper itself does not provide or reference raw data availability." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper uses established benchmarks and describes them in detail (Table A11, Section A3.8). For the cognitive studies, Section A4.2 describes scenario selection ('selected all 6 usable scenarios from [4]', 'selected one representative scenario from each type, resulting in 7 unique experimental scenarios') and translation procedures." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants were recruited for this study. The paper uses existing benchmarks and previously collected human data from published cognitive studies." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The pipeline from input to output is documented: information extraction → initial model proposal → Bayesian inference → model adjustment → final answer (Algorithm 1). For multimodal inputs, the fusion pipeline is described. For cognitive studies, the translation from visual stimuli to text captions is documented (Section A4.2, Figure A4)." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "The Acknowledgments section states: 'This work was supported by a grant from Amazon.'" 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly listed: Peking University and Johns Hopkins University. The Amazon funding is disclosed but no Amazon employee is listed as an author." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": true, 214 "justification": "The work is funded by Amazon. Amazon does not have a direct product stake in Theory of Mind benchmarks. The paper evaluates OpenAI, Google, and Meta models, not Amazon products. The funder appears independent of the specific outcomes." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests or financial interests statement is present in the paper. While no obvious conflicts exist, the absence of an explicit declaration means this criterion is not satisfied." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper uses GPT-4o, DeepSeek-R1, Gemini 2.0, and other models on benchmarks but does not state the training data cutoff dates for any of these models. Since the benchmarks (e.g., ToMi from 2019, BigToM from 2024) could potentially be in the training data, this is relevant." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": true, 230 "answer": false, 231 "justification": "No discussion of whether the benchmark examples appeared in the LLMs' training data. Given that ToMi (2019), Hi-ToM (2023), and BigToM (2024) are publicly available and could be in GPT-4o's training data, this is a significant omission." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": true, 235 "answer": false, 236 "justification": "The benchmarks (ToMi published 2019, BigToM published 2024) were available online before GPT-4o's training cutoff. No discussion of contamination risk is provided. While AutoToM's performance improvement over base GPT-4o suggests the framework adds value beyond memorization, the absolute numbers for baseline LLMs could be inflated by contamination." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants were recruited for this study. The paper uses existing benchmarks and previously published human data." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants were recruited. The cognitive study data comes from previously published studies." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants were recruited for this study." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants were recruited for this study." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants were recruited for this study." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants were recruited for this study." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants were recruited for this study." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": true, 280 "justification": "Table A2 reports average tokens per question (8.0K for AutoToM) and average inference time (8.5s) on MMToM-QA. Tables A9 and A10 report token costs and API call counts for AutoToM and all ablation variants across all benchmarks." 281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": false, 285 "justification": "No total computational budget (total API spend, total cost of running all experiments) is stated. The paper reports per-question costs but not the aggregate expense of the full experimental campaign across all benchmarks and baselines." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "AutoToM outperforms all existing ToM methods and large reasoning models with an overall accuracy of 82.43% across five benchmarks.", 292 "evidence": "Table 1 shows AutoToM (82.43%) outperforming the next best methods: o3-mini-high (73.94%) and Gemini 2.0 Flash Thinking (74.18%). Per-benchmark results are detailed in Tables A3-A7.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "AutoToM outperforms its LLM backend (GPT-4o) by a large margin.", 297 "evidence": "Table 1 shows AutoToM achieves 82.43% vs. GPT-4o's 63.39%, a 19.04 percentage point improvement. Table 2 shows this holds across multiple LLM backends on MMToM-QA (e.g., GPT-4o: 44.0 → 83.0, DeepSeek-V3: 34.8 → 71.1).", 298 "supported": "strong" 299 }, 300 { 301 "claim": "AutoToM produces human-like confidence estimates in classic cognitive studies.", 302 "evidence": "Table 3 shows Pearson correlations: 0.93 (p ≤ .001) for online goal inference, 0.88 (p ≤ .001) for desire inference, and 0.73 (p ≤ .001) for belief inference. AutoToM significantly outperforms GPT-4o and o3-mini-high on partial observability tasks.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "AutoToM enables effective embodied assistance with 27.7% speedup.", 307 "evidence": "Figure 7 shows AutoToM achieves 27.7% speedup on O-WAH benchmark, compared to 6.8% for GPT-4o and 6.3% for random goal. Error bars indicate standard errors. Results averaged over 3 runs per episode across 20 episodes.", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "AutoToM achieves higher performance with comparable or lower computational cost than large reasoning models.", 312 "evidence": "Table A2 shows AutoToM uses 8.0K tokens and 8.5s per question on MMToM-QA, while o3-mini-high uses 10.9K tokens and 21.6s. AutoToM achieves 83.0% vs. 64.67% for o3-mini-high on the same benchmark.", 313 "supported": "strong" 314 }, 315 { 316 "claim": "The evaluation is stable across runs with low variance.", 317 "evidence": "Section 4.1 reports 'Across three different random seeds, AutoToM achieved a mean accuracy of 82.56% with a standard error of 0.45%' on MMToM-QA. However, this is only verified on one benchmark.", 318 "supported": "moderate" 319 } 320 ], 321 "methodology_tags": [ 322 "benchmark-eval" 323 ], 324 "key_findings": "AutoToM introduces an automated framework for model-based Theory of Mind that combines automated Bayesian inverse planning with automated agent model discovery, using LLMs as a computational backend. Across five diverse ToM benchmarks, AutoToM achieves 82.43% overall accuracy, outperforming both state-of-the-art LLMs and large reasoning models including o3-mini-high (73.94%). The framework produces posterior distributions that correlate strongly with human confidence judgments in cognitive studies (r = 0.73-0.93) and enables effective embodied assistance with 27.7% task speedup. The approach generalizes across different LLM backends, consistently boosting performance over the base model without additional prompt engineering.", 325 "red_flags": [ 326 { 327 "flag": "No benchmark contamination analysis", 328 "detail": "Several benchmarks (ToMi, 2019; BigToM, 2024) were publicly available before the training cutoff of models tested. No analysis of whether LLM performance is inflated by training data contamination is provided. While AutoToM's improvement over base models suggests the framework adds value, absolute baseline numbers may be unreliable." 329 }, 330 { 331 "flag": "Variance only reported for one benchmark", 332 "detail": "Multi-run variance (standard error across random seeds) is reported only for MMToM-QA. The other four benchmark results and the cognitive study correlations appear to be from single runs, making it impossible to assess result stability across the full evaluation suite." 333 }, 334 { 335 "flag": "LLM API temperature not reported", 336 "detail": "The paper does not report temperature, top-p, or other sampling parameters used when calling the LLM APIs. These settings significantly affect output, and without them, exact reproduction is difficult even with the same model versions." 337 }, 338 { 339 "flag": "Cognitive study evaluation uses small scenario samples", 340 "detail": "The cognitive study correlation with humans (Table 3) is based on 6 scenarios for goal inference and 7 scenarios for desire/belief inference. High correlations on such small samples should be interpreted cautiously, though p-values are reported." 341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "Understanding social reasoning in language models with language models", 346 "authors": ["Kanishk Gandhi", "Jan-Philipp Fränken", "Tobias Gerstenberg", "Noah Goodman"], 347 "year": 2024, 348 "relevance": "BigToM benchmark used for evaluating LLM Theory of Mind capabilities, relevant to assessing LLM reasoning quality." 349 }, 350 { 351 "title": "MMToM-QA: Multimodal Theory of Mind Question Answering", 352 "authors": ["Chuanyang Jin", "Yutong Wu", "Jing Cao"], 353 "year": 2024, 354 "relevance": "Multimodal Theory of Mind benchmark combining vision and language, relevant to evaluating LLM capability in complex reasoning." 355 }, 356 { 357 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 358 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 359 "year": 2025, 360 "arxiv_id": "2501.12948", 361 "relevance": "Large reasoning model used as baseline, relevant to understanding LLM reasoning capabilities and training approaches." 362 }, 363 { 364 "title": "GPT-4 Technical Report", 365 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 366 "year": 2023, 367 "arxiv_id": "2303.08774", 368 "relevance": "Foundation model used as the LLM backend in AutoToM and as a baseline, central to understanding LLM capabilities." 369 }, 370 { 371 "title": "The Llama 3 Herd of Models", 372 "authors": ["Abhimanyu Dubey", "Abhinav Jauhri", "Abhinav Pandey"], 373 "year": 2024, 374 "arxiv_id": "2407.21783", 375 "relevance": "Open-source LLM used as baseline, relevant to evaluating open vs. closed model capabilities." 376 }, 377 { 378 "title": "Hypothesis-driven Theory-of-Mind Reasoning for Large Language Models", 379 "authors": ["Hyunwoo Kim", "Melanie Sclar", "Tan Zhi-Xuan"], 380 "year": 2025, 381 "arxiv_id": "2502.11881", 382 "relevance": "Closely related work on hypothesis-based ToM reasoning with LLMs, relevant to understanding LLM reasoning approaches." 383 }, 384 { 385 "title": "Hypothetical Minds: Scaffolding Theory of Mind for Multi-Agent Tasks with Large Language Models", 386 "authors": ["Logan Cross", "Violet Xiang", "Agam Bhatia"], 387 "year": 2024, 388 "arxiv_id": "2407.07086", 389 "relevance": "LLM-based multi-agent reasoning with ToM scaffolding, relevant to understanding agentic AI architectures." 390 }, 391 { 392 "title": "NOPA: Neurally-Guided Online Probabilistic Assistance for Building Socially Intelligent Home Assistants", 393 "authors": ["Xavier Puig", "Tianmin Shu", "Joshua B Tenenbaum", "Antonio Torralba"], 394 "year": 2023, 395 "relevance": "Online Watch-And-Help benchmark used for embodied assistance evaluation, relevant to evaluating AI agent cooperation." 396 }, 397 { 398 "title": "FANToM: A Benchmark for Stress-Testing Machine Theory of Mind in Interactions", 399 "authors": ["Hyunwoo Kim", "Melanie Sclar", "Xuhui Zhou"], 400 "year": 2023, 401 "arxiv_id": "2310.15421", 402 "relevance": "Challenging ToM benchmark used for additional evaluation, relevant to stress-testing LLM reasoning capabilities." 403 }, 404 { 405 "title": "Large Language Models Fail on Trivial Alterations to Theory-of-Mind Tasks", 406 "authors": ["Tomer Ullman"], 407 "year": 2023, 408 "arxiv_id": "2302.08399", 409 "relevance": "Demonstrates systematic LLM failures in ToM reasoning, motivating the need for structured approaches like AutoToM." 410 }, 411 { 412 "title": "MuMA-ToM: Multi-modal Multi-agent Theory of Mind", 413 "authors": ["Haojun Shi", "Suyu Ye", "Xinyu Fang"], 414 "year": 2024, 415 "arxiv_id": "2408.12574", 416 "relevance": "Multimodal multi-agent ToM benchmark and LIMP baseline method, relevant to evaluating LLM multi-agent reasoning." 417 }, 418 { 419 "title": "SoMi-ToM: Evaluating Multi-perspective Theory of Mind in Embodied Social Interactions", 420 "authors": ["Xianzhe Fan", "Xuhui Zhou", "Chuanyang Jin"], 421 "year": 2025, 422 "arxiv_id": "2506.23046", 423 "relevance": "Evaluates LLM Theory of Mind in embodied social settings, relevant to understanding AI social intelligence." 424 } 425 ] 426 }