scan.json (26701B)
1 { 2 "paper": { 3 "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains", 4 "authors": ["Shunyu Yao", "Noah Shinn", "Pedram Razavi", "Karthik Narasimhan"], 5 "year": 2024, 6 "venue": "Preprint (under review)", 7 "arxiv_id": "2406.12045" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper provides a GitHub link: https://github.com/sierra-research/tau-bench (mentioned in the abstract footnote and in the paper body)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The benchmark data (database JSON files, task instances, domain policies) is released as part of the GitHub repository. The paper states 'We release our codebase publicly to encourage the community to create new tasks and domains for τ-bench.'" 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing with versions is mentioned in the paper. The paper mentions Python API tools and various LLM APIs but does not specify environment setup details." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper does not include step-by-step reproduction instructions. While the code is released, the paper itself does not describe how to run the benchmark experiments, what commands to use, or how to replicate the main results." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "The main results in Table 2 report only point estimates (e.g., 'gpt-4o: 61.2% retail, 35.2% airline'). No confidence intervals or error bars are reported for these numbers. The pass^k curves in Figure 4 also lack error bars." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., 'gpt-4o is the best model') but provides no statistical significance tests. All comparisons are based on raw number differences." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports absolute performance levels and differences in context. For example, 'even state-of-the-art LMs like gpt-4o achieve low task success rates (∼61% on τ-retail and ∼35% on τ-airline)' and 'pass^8 drops to < 25%'. The ablation in Table 3 shows specific degradation: 'gpt-4o: 61.2 → 56.8' and '33.2 → 10.8', providing both baseline and changed values." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The benchmark has 115 tasks (τ-retail) and 50 tasks (τ-airline) with at least 3 trials per task, but no justification is given for why these sizes are sufficient for the claims made. No power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper reports 'at least 3 trials per task' for main results but does not report standard deviations, variance, or any spread measure across these trials. Only aggregate pass^1 numbers are reported in Table 2." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares multiple models (gpt-4o, gpt-4-turbo, gpt-4-32k, gpt-3.5-turbo, claude-3-opus, claude-3-sonnet, claude-3-haiku, gemini-1.5-pro, gemini-1.5-flash, mistral-large, mixtral-8x22b, meta-llama-3-70B) and multiple methods (Function Calling, ReAct, Act-only) as baselines." 
64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baselines include state-of-the-art models as of mid-2024 (gpt-4o, claude-3-opus, gemini-1.5-pro, meta-llama-3-70B), which were the most advanced models available at the time of publication." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper includes an ablation removing the domain policy from the agent's system prompt (Table 3), showing the impact on performance. It also compares Function Calling vs. ReAct vs. Act-only methods (Figure 3), and reports a 'think' function experiment." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports both pass^1 (average success rate) and pass^k (consistency over multiple trials), as well as pass@k for comparison. These capture different aspects of agent performance: average capability vs. reliability." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "The authors manually examined 40 failure cases from gpt-4o FC agent trajectories (Section 5.2), classifying them into 4 categories (wrong argument, wrong info, wrong decision, partially resolve). This is manual expert evaluation of system outputs." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "There is no separation of dev and test sets. The same 115 retail and 50 airline tasks are used for all experiments and analysis. The user instructions were iteratively refined using gpt-4-turbo trials (Stage III), creating a risk that the tasks are tuned to specific model behaviors." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper provides per-domain breakdown (τ-retail vs τ-airline), per-task difficulty visualization (Figure 7), failure category breakdown (Figure 5), and performance by number of write API actions (Figure 6)." 
94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 5.2 provides detailed failure analysis with three failure categories: wrong argument/info (55%), wrong decision (25%), and partial resolution (19%). Specific trajectory examples are shown in Appendix C.2." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that adding a 'think' function for function-calling agents 'did not boost performance' (Section 5.1). It also reports that even the best model (gpt-4o) solves less than 50% of tasks overall, which is itself a negative finding about current agent capabilities." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims that 'even state-of-the-art function calling agents (like gpt-4o) succeed on < 50% of the tasks' (supported by Table 2: avg 48.2%) and 'pass^8 < 25% in retail' (supported by Figure 4). These claims are well-supported by the results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims through ablation studies (e.g., removing domain policy reduces performance - Table 3, removing reasoning traces reduces performance - ReAct vs Act in Figure 3). These are single-variable controlled ablations, adequate for the claims." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper bounds its claims to the two tested domains (retail and airline customer service) and notes 'For more capable agents in the future, more advanced domains (e.g., medical, tax, or legal) with more complex data and rules can be studied.' The Discussion section acknowledges simulation limitations." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 6 (Discussion) discusses alternative explanations for results, including user simulator limitations (typos/ambiguities in instructions, limited LM reasoning capacity, potential implicit bias from using gpt-4-turbo for task curation). The paper acknowledges that some failures may be due to user simulation rather than agent issues." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper uses marketing names without specific version strings for most models: 'gpt-4o', 'gpt-4-turbo', 'claude-3-opus', 'claude-3-sonnet', 'gemini-1.5-pro-latest', 'gemini-1.5-flash-latest'. Only 'gpt-4-32k' is partially versioned as 'gpt-4-32k-0613' in Figure 4, but the main Table 2 uses 'gpt-4-32k'. The user simulator uses 'gpt-4-0613' which is properly versioned." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper provides complete domain policy documents used as system prompts (Appendix B.1 contains the full retail and airline policies), user instruction templates with concrete examples (Figure 2d, Appendix C.2), and explains that the domain policy is the agent's system prompt. The actual prompts sent to models are fully reconstructable." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 5 states: 'The LM temperature is 0.0 for agent and 1.0 for user.' The paper also specifies 'We limit each task to at most 30 agent actions.' These are the key hyperparameters for the API-based experiments." 
143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper describes the agent scaffolding approaches: Function Calling (FC) where the model autonomously decides to generate a user response or tool call, ReAct with 'Thought: {reasoning} Action: {JSON action}', and Act-only. The user simulation setup with system prompts and conversation history management is also described." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper documents the three-stage benchmark construction process (Section 4): Stage I manual schema/API design, Stage II automatic data generation with LMs (with code shown in Appendix B.2), and Stage III manual task annotation and validation with iterative agent runs. The data generation pipeline is well-documented." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 6 (Discussion) contains a substantive 'Directions for improvement' subsection that discusses specific limitations of the benchmark, including user simulator limitations, task annotation difficulty, and implicit bias from using gpt-4-turbo for task curation." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The paper discusses specific threats: (1) user instruction typos or ambiguities, (2) user instruction not containing all domain knowledge, (3) user simulation LM capacity limitations at reasoning/calculation/long-context, (4) implicit bias from using gpt-4-turbo FC agent to tune user prompts. These are specific to this study." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states scope boundaries: it focuses on customer service domains (retail and airline) only, uses simplified policies compared to real-world, and notes that results may not extend to more complex domains. The Discussion states 'more advanced domains (e.g., medical, tax, or legal) with more complex data and rules can be studied' in future work." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The benchmark data (database JSON files, task instances, Python API implementations, domain policies) is released through the GitHub repository. This allows independent verification of the benchmark construction and results." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4 describes the three-stage data collection procedure in detail: manual schema/API design (Stage I), LM-assisted data generation with code (Stage II, with generation code in Appendix B.2), and manual task annotation with iterative validation (Stage III)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants were recruited. The benchmark uses LM-simulated users and programmatic APIs. The data is synthetically generated, not collected from human subjects." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The three-stage pipeline is documented with specific details: Stage I manual design, Stage II code-based generation (with actual Python code in Appendix B.2 showing how user profiles are generated), Stage III iterative annotation and validation (with mention of 40+ gpt-4-turbo trials per task). The filtering step of fixing 4 user instruction typos/ambiguities is mentioned." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source is disclosed. The Acknowledgements section thanks individuals for feedback and help but does not mention any grants, sponsors, or funding agencies." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are listed with their affiliation: Sierra. The first author (Shunyu Yao) has a footnote noting 'Work done during internship.' Author affiliations are clearly stated." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "The authors are from Sierra, an AI agent company. Sierra has a direct commercial interest in the agent evaluation space. While no explicit funding is disclosed, the work was done at/during an internship at Sierra, which has a financial stake in demonstrating that current agents are not reliable enough (motivating demand for better agent products). The funder independence is not addressed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is included in the paper. Sierra is an AI agent startup, and the authors' equity or other financial interests related to the agent evaluation space are not disclosed." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates multiple pre-trained models on the τ-bench benchmark but does not state the training data cutoff dates for any of the models tested. While τ-bench uses synthetic data that is unlikely to be in training sets, the paper does not discuss this." 
221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper does not discuss whether any of the benchmark components (domain policies, database schemas, API designs) could have been influenced by or similar to data in the training sets of the tested models. The synthetic data makes contamination less likely but this is not discussed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The benchmark is newly created with synthetic data, making direct contamination unlikely, but the paper does not discuss this. The domain policies and task designs could share patterns with customer service data in training sets, and this is not addressed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved. The benchmark uses LM-simulated users, not real humans." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in the study. The NeurIPS checklist also marks human subjects items as N/A." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants; users are simulated by language models." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants; all users are LM-simulated." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants; not an experimental study with human subjects." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants; blinding is not applicable." 
263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants; attrition is not applicable." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 5.1 'Cost analysis' reports: 'the agent / user simulation costs are $0.38 / $0.23 per task respectively, so running one trial per task costs around 200 dollars.' It also breaks down costs: 'the input prompt / completion output take up 95.9% / 4.1% of the price.'" 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "While per-task API costs are reported, the total computational budget for all experiments is not stated. The paper does not mention total API spend across all models and trials, GPU hours, or total compute used." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Even state-of-the-art function calling agents (like gpt-4o) succeed on less than 50% of τ-bench tasks overall.", 286 "evidence": "Table 2 shows gpt-4o achieves 61.2% on τ-retail and 35.2% on τ-airline, with a simple (unweighted) average of 48.2% across the two domains. This is the highest among all tested models.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Agent consistency drops rapidly with increasing trials: pass^8 < 25% for gpt-4o in retail.", 291 "evidence": "Figure 4 shows the pass^k curve for gpt-4o FC in τ-retail, with pass^8 dropping below 25% despite pass^1 being above 60%.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Function calling consistently outperforms text-formatted agent methods (ReAct, Act-only) on state-of-the-art models.", 296 "evidence": "Figure 3 shows FC outperforming ReAct and Act across gpt-4o, gpt-4-turbo, and gpt-4-32k on τ-retail. However, gpt-3.5-turbo shows a different pattern.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Adding reasoning traces (ReAct vs. 
Act-only) consistently helps bridge the gap between observations and actions.", 301 "evidence": "Figure 3 shows ReAct outperforming Act-only across all four tested models on τ-retail.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Removing domain policy from the agent significantly hurts gpt-4o on τ-airline (−22.4%) but only slightly on τ-retail (−4.4%).", 306 "evidence": "Table 3 shows performance degradation with policy removal: gpt-4o retail 61.2→56.8, airline 33.2→10.8. gpt-3.5-turbo shows minimal degradation on airline (10.8→9.6).", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Current agents struggle most with complex database reasoning (~55% of failures) and domain rule following (~25% of failures).", 311 "evidence": "Section 5.2 and Figure 5 provide a breakdown of 36 failed gpt-4o FC trajectories in τ-retail: wrong argument (19.4%), wrong info (25.0%), wrong decision (22.2%), partially resolve (33.3%). Wrong argument + wrong info = ~44%, and wrong decision = ~22%.", 312 "supported": "moderate" 313 } 314 ], 315 "methodology_tags": ["benchmark-eval"], 316 "key_findings": "τ-bench introduces a benchmark for evaluating language agents on realistic tool-agent-user interaction with domain-specific policies. Even the best model (gpt-4o with function calling) achieves only 48.2% average task success, and consistency drops sharply with repeated trials (pass^8 < 25% on retail). The main failure modes are complex database reasoning (wrong arguments/info, ~55% of failures), domain rule following (wrong decisions, ~25%), and compound request handling (~19%). 
Removing domain policies shows that agents rely heavily on commonsense in simpler domains but benefit significantly from policy documents in complex domains.", 317 "red_flags": [ 318 { 319 "flag": "Task design bias from model-in-the-loop annotation", 320 "detail": "Task instances were iteratively refined using gpt-4-turbo FC agent runs (Stage III), creating potential bias where tasks are tuned to the behavioral patterns of one specific model. The paper acknowledges this: 'there is also some element of implicit bias during the task curation process since we use the gpt-4-turbo FC agent to tune the user's system prompt.'" 321 }, 322 { 323 "flag": "No confidence intervals on main results", 324 "detail": "All results in Table 2 are point estimates without error bars or confidence intervals, despite running only 3+ trials per task. With 115 tasks and 3 trials each, the estimates have non-trivial sampling uncertainty that is not quantified." 325 }, 326 { 327 "flag": "Company affiliation not discussed as conflict", 328 "detail": "All authors are from Sierra, an AI agent company. The finding that 'current agents are not reliable enough for real-world deployment' directly motivates demand for Sierra's products. This conflict of interest is not acknowledged in the paper." 329 }, 330 { 331 "flag": "No held-out test set", 332 "detail": "All 165 tasks are used for both development (iterative refinement with gpt-4-turbo) and evaluation. There is no held-out split to guard against overfitting the benchmark to specific model behaviors." 333 } 334 ], 335 "cited_papers": [ 336 { 337 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 338 "authors": ["Carlos E. 
Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 339 "year": 2023, 340 "arxiv_id": "2310.06770", 341 "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks, directly relevant to the survey's scope of agent evaluation methodology." 342 }, 343 { 344 "title": "AgentBench: Evaluating LLMs as Agents", 345 "authors": ["Xiao Liu"], 346 "year": 2023, 347 "arxiv_id": "2308.03688", 348 "relevance": "Benchmark for evaluating LLMs as agents across multiple environments, directly comparable to τ-bench as an agent evaluation methodology." 349 }, 350 { 351 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 352 "authors": ["Shuyan Zhou"], 353 "year": 2023, 354 "arxiv_id": "2307.13854", 355 "relevance": "Realistic web-based agent benchmark that τ-bench extends by adding human-in-the-loop interaction, relevant to agent evaluation methodology quality." 356 }, 357 { 358 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 359 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], 360 "year": 2023, 361 "arxiv_id": "2210.03629", 362 "relevance": "Foundational agent framework used as a baseline method in τ-bench, key to understanding agent scaffolding approaches." 363 }, 364 { 365 "title": "Evaluating Large Language Models Trained on Code", 366 "authors": ["Mark Chen"], 367 "year": 2021, 368 "relevance": "Introduced pass@k metric for code generation, which τ-bench extends to pass^k for agent reliability evaluation." 369 }, 370 { 371 "title": "Reflexion: An Autonomous Agent with Dynamic Memory and Self-Reflection", 372 "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"], 373 "year": 2023, 374 "relevance": "Agent self-reflection method discussed in context of why it's unsuitable for real-world user-facing settings, relevant to agent methodology evaluation." 
375 }, 376 { 377 "title": "Identifying the Risks of LM Agents with an LM-Emulated Sandbox", 378 "authors": ["Yangjun Ruan"], 379 "year": 2023, 380 "arxiv_id": "2309.15817", 381 "relevance": "ToolEmu benchmark that uses LMs to emulate tool execution for safety evaluation, relevant to agent safety and evaluation methodology." 382 }, 383 { 384 "title": "Berkeley Function Calling Leaderboard", 385 "authors": ["Fanjia Yan"], 386 "year": 2024, 387 "relevance": "Major benchmark for evaluating LLM function calling capabilities, directly relevant to tool-use evaluation methodology." 388 }, 389 { 390 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework", 391 "authors": ["Qingyun Wu"], 392 "year": 2023, 393 "arxiv_id": "2308.08155", 394 "relevance": "Multi-agent conversation framework relevant to agentic AI architectures and evaluation." 395 }, 396 { 397 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 398 "authors": ["Joon Sung Park"], 399 "year": 2023, 400 "arxiv_id": "2304.03442", 401 "relevance": "Foundational work on LM-based simulation of human behavior, directly relevant to τ-bench's user simulation approach." 402 }, 403 { 404 "title": "Cognitive Architectures for Language Agents", 405 "authors": ["Theodore R. Sumers", "Shunyu Yao", "Karthik Narasimhan", "Thomas L. Griffiths"], 406 "year": 2023, 407 "arxiv_id": "2309.02427", 408 "relevance": "Survey of cognitive architectures for language agents, providing theoretical grounding for agent evaluation." 409 }, 410 { 411 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 412 "authors": ["Timo Schick"], 413 "year": 2023, 414 "arxiv_id": "2302.04761", 415 "relevance": "Foundational work on LLM tool use, relevant to understanding the tool-use capabilities evaluated by τ-bench." 416 } 417 ] 418 }