scan.json (21137B)
1 { 2 "paper": { 3 "title": "Challenges in Human-Agent Communication", 4 "authors": [ 5 "Gagan Bansal", 6 "Jennifer Wortman Vaughan", 7 "Saleema Amershi", 8 "Eric Horvitz", 9 "Adam Fourney", 10 "Hussein Mozannar", 11 "Victor Dibia", 12 "Daniel S. Weld" 13 ], 14 "year": 2024, 15 "venue": "arXiv preprint", 16 "arxiv_id": "2412.10380" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": false, 22 "answer": false, 23 "justification": "This is a theoretical/position paper proposing a taxonomy of challenges. There is no code, tool, or system implementation to release." 24 }, 25 "data_released": { 26 "applies": false, 27 "answer": false, 28 "justification": "No data was collected or generated. The paper is a conceptual analysis with illustrative examples, not an empirical study with datasets." 29 }, 30 "environment_specified": { 31 "applies": false, 32 "answer": false, 33 "justification": "No computational experiments were run, so no environment specification is needed." 34 }, 35 "reproduction_instructions": { 36 "applies": false, 37 "answer": false, 38 "justification": "This is a position/perspective paper. There are no experiments to reproduce." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": false, 44 "answer": false, 45 "justification": "No quantitative results are reported. The paper is entirely qualitative/theoretical." 46 }, 47 "significance_tests": { 48 "applies": false, 49 "answer": false, 50 "justification": "No statistical comparisons are made. The paper proposes a taxonomy of challenges without empirical evaluation." 51 }, 52 "effect_sizes_reported": { 53 "applies": false, 54 "answer": false, 55 "justification": "No quantitative effects are measured. This is a conceptual paper." 56 }, 57 "sample_size_justified": { 58 "applies": false, 59 "answer": false, 60 "justification": "No samples were collected. This is a theoretical paper." 
61 }, 62 "variance_reported": { 63 "applies": false, 64 "answer": false, 65 "justification": "No experiments with multiple runs were conducted." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": false, 71 "answer": false, 72 "justification": "This is a position paper proposing challenges, not evaluating a system or method against baselines." 73 }, 74 "baselines_contemporary": { 75 "applies": false, 76 "answer": false, 77 "justification": "No system evaluation is performed, so baseline contemporaneity is not applicable." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "No system with components to ablate. This is a conceptual taxonomy paper." 83 }, 84 "multiple_metrics": { 85 "applies": false, 86 "answer": false, 87 "justification": "No system evaluation with metrics is performed." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "No system outputs are produced to evaluate. The paper proposes challenges but does not evaluate solutions." 93 }, 94 "held_out_test_set": { 95 "applies": false, 96 "answer": false, 97 "justification": "No dataset or test set is used. This is a theoretical paper." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper organizes its 12 challenges into three categories (agent-to-user: A1-A5, user-to-agent: U1-U3, and overarching: X1-X4) and discusses each challenge individually with dedicated subsections and examples." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper is fundamentally about failure modes in human-agent communication. Each challenge (X1-X4, U1-U3, A1-A5) is illustrated with concrete failure examples, such as an agent misinterpreting a business trip as leisure (U1), an agent deleting critical files without permission (A2), and an agent filing a FOIA request autonomously (A3)." 
108 }, 109 "negative_results_reported": { 110 "applies": false, 111 "answer": false, 112 "justification": "No experiments were run, so there are no positive or negative results to report." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims to identify and analyze twelve key communication challenges, which is exactly what the paper does in Sections 3-5. The abstract's claim of providing 'concrete examples and open directions of research' is supported throughout." 120 }, 121 "causal_claims_justified": { 122 "applies": false, 123 "answer": false, 124 "justification": "The paper makes no causal claims. It identifies challenges and proposes directions, using language like 'may lead to' and 'can result in' to discuss potential consequences, which are framed as possibilities rather than causal assertions." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper explicitly states in the introduction that this list 'isn't exhaustive' (Section 1) and that they 'consider these [technical, ethical, safety, and fairness challenges] to be outside the scope of this paper' (Section 1.2). They scope the work to communication challenges specifically." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": false, 133 "answer": false, 134 "justification": "The paper presents no empirical results, so there are no findings requiring alternative explanations. It is a taxonomy/position paper." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "No models are evaluated in experiments. The paper mentions GPT-4 and ChatGPT in illustrative examples but does not run systematic experiments with them." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "No prompting experiments are conducted. 
The paper uses narrative examples, not systematic prompt-based evaluations." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No experiments requiring hyperparameter settings are conducted." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is implemented or tested in this paper. The paper discusses agent architectures conceptually." 157 }, 158 "data_preprocessing_documented": { 159 "applies": false, 160 "answer": false, 161 "justification": "No data was collected or preprocessed. This is a position paper." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The paper does not discuss the limitations of its own analysis or taxonomy in a structured way." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed. The paper does not address potential weaknesses of its taxonomy, such as whether the 12 challenges are comprehensive, whether the categorization is the best possible, or whether the challenges are weighted equally in importance." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 1.2 explicitly states: 'While the rise of new degrees of autonomy and use of tools by agents poses many other important challenges including technical, ethical, safety, and fairness challenges, we consider these to be outside the scope of this paper.' The introduction also notes the list 'isn't exhaustive.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": false, 184 "answer": false, 185 "justification": "No data was collected. This is a theoretical/position paper." 
186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper states the challenges were arrived at 'based on our experiences building and experimenting with complex AI agents and multi-agent systems, drawing on the literature on human-AI interaction and collaboration' (Section 1). However, the methodology for how these specific 12 challenges were identified and selected is not described in detail — there is no systematic process documented." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No participants were recruited. This is a position paper based on the authors' expertise and literature review." 196 }, 197 "data_pipeline_documented": { 198 "applies": false, 199 "answer": false, 200 "justification": "No data pipeline exists. This is a conceptual paper." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding source is disclosed. The acknowledgments section thanks colleagues for feedback but does not mention any grants or funding." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: seven authors from Microsoft and one from the Allen Institute for Artificial Intelligence. The paper header states '1Microsoft' and '2Allen Institute for Artificial Intelligence.'" 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Most authors are from Microsoft, which develops and sells AI agent products (Copilot, AutoGen). The paper's examples reference commercial products including Microsoft Copilot and OpenAI's ChatGPT. While not an evaluation of Microsoft products per se, the framing of challenges in human-agent communication directly relates to Microsoft's commercial interests, and funding independence is not addressed." 
218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is included. Several authors work at Microsoft, which has significant commercial interests in AI agent technology." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "No pre-trained model is evaluated on any benchmark. This is a theoretical paper." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "No benchmark evaluation is performed." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "No benchmark evaluation is performed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human subjects study was conducted. This is a position paper." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants were involved." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants were involved." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants were involved." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants were involved." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants were involved." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants were involved." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is a theoretical/position paper. No method with inference costs is proposed or evaluated." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "No computational experiments were conducted." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "Modern AI agents introduce twelve key communication challenges grouped into three categories: agent-to-user information flow (A1-A5), user-to-agent information flow (U1-U3), and overarching communication challenges (X1-X4).", 295 "evidence": "The twelve challenges are enumerated in Figure 1 and developed in Sections 3-5, each with detailed discussion and concrete examples.", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "Two primary technical advances distinguish today's AI agents: (1) generative foundation models with broad capabilities including natural language communication, and (2) the ability to invoke tools via APIs that enable real-world actions.", 300 "evidence": "Discussed in Section 1.1 with references to existing systems like Devin, Microsoft Copilot, and ChatGPT as concrete examples of these capabilities.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Establishing common ground between users and AI agents is critical for effective collaboration, requiring mechanisms for verifiability, consistency, appropriate detail, and contextual awareness.", 305 "evidence": "Section 3 develops four overarching challenges (X1-X4) with references to grounding theory from Clark and Brennan (1991) and prior HCI research.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "The complexity and tool-using capabilities of modern agents make communication challenges qualitatively different from those faced with earlier AI systems.", 310 "evidence": "Section 1.1 argues this based on the broad capabilities of foundation 
models and tool-use, illustrated with examples of agents that can browse the web, execute code, and make purchases. However, this is argued conceptually rather than demonstrated empirically.", 311 "supported": "moderate" 312 } 313 ], 314 "methodology_tags": [ 315 "theoretical", 316 "qualitative" 317 ], 318 "key_findings": "The paper identifies twelve key challenges in human-agent communication, organized into three categories: conveying information from agent to user (what can the agent do, what is it about to do, what is it currently doing, were there side effects, was the goal achieved), enabling users to convey information to the agent (goals, preferences, feedback), and overarching challenges (verifiability, consistency, appropriate detail level, contextual memory). The authors ground these challenges in communication theory (particularly Clark and Brennan's grounding framework) and illustrate each with concrete scenarios drawn from modern agent systems. The paper serves as a research agenda rather than presenting empirical findings.", 319 "red_flags": [ 320 { 321 "flag": "No empirical validation of the taxonomy", 322 "detail": "The twelve challenges are derived from the authors' experience rather than a systematic methodology (e.g., grounded theory, structured literature review, or user studies). The paper states challenges were identified 'based on our experiences building and experimenting with complex AI agents' but does not describe how completeness or importance was assessed." 323 }, 324 { 325 "flag": "Potential conflict of interest not explicitly discussed", 326 "detail": "Seven of eight authors are from Microsoft, which develops major AI agent products (Copilot, AutoGen). The paper references and uses Microsoft products in examples. While the paper is not a product evaluation, the framing of challenges could be influenced by Microsoft's product roadmap and commercial interests. No conflict of interest statement is provided." 
327 }, 328 { 329 "flag": "No structured quality assessment of cited prior work", 330 "detail": "The paper draws on prior HCI and cognitive science literature but does not systematically evaluate the strength of evidence from these cited works. Conclusions about what has and hasn't been addressed are asserted rather than derived from structured analysis." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 336 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 337 "year": 2024, 338 "relevance": "Directly relevant as a multi-agent system framework that exemplifies the types of agent architectures whose communication challenges the paper discusses." 339 }, 340 { 341 "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks", 342 "authors": ["Adam Fourney", "Gagan Bansal", "Hussein Mozannar"], 343 "year": 2024, 344 "arxiv_id": "2411.04468", 345 "relevance": "Multi-agent system from the same research group, directly relevant to agentic AI evaluation and capabilities." 346 }, 347 { 348 "title": "Practices for Governing Agentic AI Systems", 349 "authors": ["Yonadav Shavit", "Sandhini Agarwal", "Miles Brundage"], 350 "year": 2024, 351 "relevance": "Proposes governance practices for agentic AI including legibility and action-space constraints, directly intersecting with communication challenges discussed in this paper." 352 }, 353 { 354 "title": "The Ethics of Advanced AI Assistants", 355 "authors": ["Iason Gabriel", "Arianna Manzini", "Geoff Keeling"], 356 "year": 2024, 357 "arxiv_id": "2404.16244", 358 "relevance": "Examines ethical challenges of advanced AI assistants including safety and control issues relevant to human-agent interaction." 
359 }, 360 { 361 "title": "From Interaction to Impact: Towards Safer AI Agents through Understanding and Evaluating UI Operation Impacts", 362 "authors": ["Zhuohao Jerry Zhang", "Eldon Schoop", "Jeffrey Nichols"], 363 "year": 2024, 364 "relevance": "Studies safety implications of AI agent actions through UI operations, directly related to side effects challenge (A4)." 365 }, 366 { 367 "title": "Harms from Increasingly Agentic Algorithmic Systems", 368 "authors": ["Alan Chan", "Rebecca Salganik", "Alva Markelius"], 369 "year": 2023, 370 "relevance": "Analyzes harms from agentic AI systems, providing context for the safety and transparency challenges discussed in this paper." 371 }, 372 { 373 "title": "AI Transparency in the Age of LLMs: A Human-Centered Research Roadmap", 374 "authors": ["Q. Vera Liao", "Jennifer Wortman Vaughan"], 375 "year": 2024, 376 "relevance": "Research roadmap for AI transparency with LLMs, directly related to the transparency challenges this paper identifies." 377 }, 378 { 379 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 380 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 381 "year": 2024, 382 "relevance": "Open platform for AI software development agents, exemplifying the class of agentic systems whose communication challenges this paper addresses." 383 }, 384 { 385 "title": "Can Large Language Models Reason and Plan?", 386 "authors": ["Subbarao Kambhampati"], 387 "year": 2024, 388 "relevance": "Critical investigation of LLM planning abilities, directly relevant to understanding agent capabilities and limitations discussed in challenges A1 and A2." 389 }, 390 { 391 "title": "Concrete Problems in AI Safety", 392 "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt"], 393 "year": 2016, 394 "arxiv_id": "1606.06565", 395 "relevance": "Foundational AI safety paper identifying problems like reward hacking and side effects that are central to the communication challenges discussed here." 
396 }, 397 { 398 "title": "In Search of Verifiability: Explanations Rarely Enable Complementary Performance in AI-Advised Decision Making", 399 "authors": ["Raymond Fok", "Daniel S. Weld"], 400 "year": 2024, 401 "relevance": "Empirical study on AI explainability and verifiability, directly relevant to challenge X1 (helping users verify agent behavior)." 402 }, 403 { 404 "title": "Agent Workflow Memory", 405 "authors": ["Zora Zhiruo Wang", "Jiayuan Mao", "Daniel Fried", "Graham Neubig"], 406 "year": 2024, 407 "arxiv_id": "2409.07429", 408 "relevance": "Proposes agent memory mechanisms for learning from past interactions, directly relevant to challenge X4 about contextual memory in agent communication." 409 } 410 ] 411 }