scan.json (28365B)
1 { 2 "paper": { 3 "title": "Benchmark Test-Time Scaling of General LLM Agents", 4 "authors": [ 5 "Xiaochuan Li", 6 "Ryan Ming", 7 "Pranav Setlur", 8 "Abhijay Paladugu", 9 "Andy Tang", 10 "Hao Kang", 11 "Shuai Shao", 12 "Rong Jin", 13 "Chenyan Xiong" 14 ], 15 "year": 2026, 16 "venue": "arXiv preprint", 17 "arxiv_id": "2602.18998" 18 }, 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract states 'Code is publicly available at https://github.com/cxcscmu/General-AgentBench.' A working URL is provided." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The benchmark is constructed from publicly available datasets (SWE-Bench Verified, BrowseComp, WebVoyager, MathHay, Tau2-Bench, MCP-Bench). Table 1 lists the original and sampled sizes. The GitHub repo is provided for access." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile for the evaluation framework, conda environment, or detailed library version listing is provided in the paper. The paper mentions Docker-based environments for coding tasks but does not provide environment setup specifications for reproducing the benchmark framework itself." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are provided in the paper itself. Appendix D describes implementation details (tool management, Docker bridge mode, evaluators) but does not provide a README-style guide with specific commands to reproduce the experiments." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Results in Table 2 and throughout the paper are reported as point estimates (e.g., '48.0' accuracy) with no confidence intervals, error bars, or uncertainty quantification." 
47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper makes comparative claims (e.g., 'Claude Sonnet 4.5 achieves the strongest overall performance,' 'most other models experience performance drops of approximately 30%') without any statistical significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports relative performance changes with baselines throughout. For example, Table 3 reports baseline vs. general setting scores with percentage changes (e.g., 'Claude Sonnet 4.5: -0.2% average degradation'). Figure 4 reports mean relative degradation percentages per model. These provide context for the magnitude of differences." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The benchmark samples subsets from each original dataset (e.g., 50 from SWE-Bench Verified's 500, 124 from BrowseComp's 1266) as shown in Table 1. No justification is given for why these specific sample sizes were chosen or whether they are sufficient for the claims made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No standard deviations, variances, or spread measures are reported across experimental runs. Results appear to be single-run numbers. The parallel scaling experiments sample K=1 to K=4 trajectories but do not report variance across multiple repetitions of the entire experiment." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper compares general-agent performance against domain-specific baseline performance for each model (Table 3, Figures 3-4). It also compares ten different LLM agents against each other." 
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The evaluated models are highly contemporary, including GPT-5, Claude Sonnet 4.5, Claude Haiku 4.5, Gemini 2.5-Pro, Gemini 2.5-Flash, DeepSeek-R1, DeepSeek-V3.2, Qwen3-235B, and Qwen3-Next. These represent frontier models as of early 2026." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": false, 83 "justification": "The paper does not include ablation studies of its benchmark design choices (e.g., effect of tool pool size, impact of unified vs. domain-specific policies, impact of sampling strategy). The attention analysis in Appendix B compares two model architectures but is not an ablation of the benchmark or methodology." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses multiple metrics: task accuracy/success rate, pass@K for parallel scaling, self-choice accuracy (point-wise and pair-wise), relative performance degradation from baseline, and per-domain breakdowns across four domains." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "No human evaluation of agent outputs is included. All evaluation is automated through benchmark-specific evaluators (test suites, LLM-based semantic equivalence, DB state matching). Given claims about agent quality and robustness in real-world settings, human evaluation of trajectories or outputs would strengthen the claims." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "The benchmark samples from existing test sets (SWE-Bench Verified, BrowseComp, etc.) that serve as held-out evaluation data. The models are not tuned on this data; they are evaluated zero-shot via API calls." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 2 provides per-benchmark and per-domain breakdowns (Search, Code, Reason, Tool-use). 
Figures 3 and 5 show per-domain performance changes and scaling behaviors. Table 3 gives detailed per-benchmark scores." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 4.2 discusses failure modes in sequential scaling (stagnant fluctuation, saturation and degradation). Figure 6 shows instance-level correctness dynamics. Section 4.3 discusses the verification gap as a failure mode. Section 3.2 includes a case study contrasting successful and less effective tool usage strategies." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper's central findings are largely negative: test-time scaling does not yield effective improvements due to context ceiling (sequential) and verification gap (parallel). Section 4.3 reports that GPT-5 as an external verifier 'generally underperforms models' own self-judgment,' a negative finding about external verification." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims substantial performance degradation when moving to general-agent setting (supported by Table 2, Table 3, Figure 4), context ceiling in sequential scaling (supported by Figures 5, 6, 7), and verification gap in parallel scaling (supported by Figures 5, 8). All abstract claims have corresponding evidence in the results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper makes causal claims like 'the accumulated history eventually overwhelms the agent's reasoning capacity, leading to instability' (Section 1) and attributes performance degradation to the unified tool interface. These causal explanations are not adequately tested through controlled experiments. 
The performance drop could stem from multiple confounding factors (longer prompts, different tool descriptions, different policies) that are not isolated." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims to benchmark 'General LLM Agents' but tests on a specific subset of tasks sampled from seven benchmarks across four domains. The paper does not bound its generalizations to the specific tasks tested. The claim that this 'more closely reflects real-world user interactions' (Section 2.2) is not adequately bounded -- real-world usage includes many more domains and interaction patterns." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not substantively discuss alternative explanations for the observed performance degradation. For example, the drop from domain-specific to general settings could be due to prompt differences, tool description length consuming context, or policy differences rather than the 'general agent' framing. The verification gap could reflect prompt engineering quality rather than fundamental model limitations." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper uses marketing names without specific API versions or snapshot dates: 'Claude Sonnet 4.5', 'GPT-5', 'Gemini 2.5-Pro', 'Gemini 2.5-Flash'. No API version strings (e.g., 'gpt-5-2025-08-01') or snapshot dates are provided. Section 2.3 says they 'access these models via Amazon Bedrock and the Hugging Face Inference API' but does not specify API versions." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix E provides the full universal agent system prompt, point-wise self-choice system prompt, and pair-wise self-choice system prompt. 
The templates include placeholders that are described (e.g., '{{Task Description}}', '{{Trajectory}}'). Section 2.2 notes 'Detailed prompt templates, tool specifications, and the unified policy are deferred to the Appendix E.'" 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section 2.3 states 'we fix the decoding temperature to 0.7' but does not report other important hyperparameters such as top-p, max tokens, or stop sequences. Temperature alone is insufficient." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 2.2 and Appendix D describe the agentic scaffolding in detail: the MCP-based Host-Client-Server architecture, tool routing, interaction flow, sequential scaling via injecting additional feedback rounds, and parallel scaling via independent trajectory sampling. Figure 2 provides a framework diagram." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "Table 1 shows original and sampled sizes (e.g., BrowseComp: 1266 original, 124 sampled) but does not describe the sampling procedure or criteria used to select the subsets. No filtering criteria are stated." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper has an 'Impact Statement' section (after Section 6) that briefly mentions risks ('may favor computationally intensive or proprietary models,' 'unreliable self-choice under parallel scaling') but there is no dedicated limitations or threats-to-validity section with substantive methodological discussion." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The Impact Statement contains only generic risk statements, not specific threats to the validity of the study's conclusions. 
No specific methodological limitations are discussed (e.g., small sample sizes per benchmark, single-run results, potential confounds in the domain-specific vs. general comparison)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. It positions General AgentBench as 'an initial step toward evaluating general-purpose agents' (Section 2.1) but does not bound its claims to the tested domains, models, or configurations." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw evaluation logs, execution traces, or per-instance results are released. The paper shows aggregate numbers and selected visualizations (e.g., Figure 6 shows 10 sampled instances) but does not provide the full underlying data for verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 2.1 and Appendix A.1 describe each data source in detail: the seven benchmarks used, their domains, what they test, and how they are structured. Table 1 shows the composition. Appendix D describes the execution process and evaluation procedures." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The study evaluates LLM agents on benchmarks, not human subjects." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "While the framework architecture is well-described (Appendix D), the data pipeline from raw benchmark data to final reported results is not fully documented. The sampling procedure from original benchmarks to the subsets used (Table 1) is not described. The paper goes from '1266 BrowseComp examples' to '124 sampled' without explaining the selection." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding acknowledgments section is present. The authors are from Carnegie Mellon University and Meta, but no specific funding sources are disclosed." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: 'Language Technologies Institute, School of Computer Science, Carnegie Mellon University' and 'Meta.' The paper notes 'All experiments, data collection, and processing activities were conducted by Carnegie Mellon University. Meta was involved solely in an advisory role.'" 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Meta is listed as a co-affiliation and the paper evaluates models that compete with Meta's offerings. No disclosure is made about Meta's financial interest in the outcomes. The paper does not evaluate Meta's own models (Llama), which could also indicate selection bias." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the ten evaluated models, despite using benchmarks like SWE-Bench Verified and WebVoyager that have been publicly available since 2023-2024." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the benchmark tasks could have appeared in the training data of the evaluated models. SWE-Bench issues are from public GitHub repositories; BrowseComp and WebVoyager tasks are also public." 
236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Several of the included benchmarks (SWE-Bench, WebVoyager) were published in 2023-2024, well before the training cutoffs of models like GPT-5 and Claude Sonnet 4.5. No discussion of contamination risk is provided." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are involved in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Appendix C provides detailed cost tables (Tables 6-9) with per-model, per-domain, and per-scaling-setting cost breakdowns in USD. Table 6 lists unit API prices per 1M tokens. Total costs are reported for parallel scaling ($29,576), sequential scaling ($24,392), and general setting ($7,768)." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix C provides the total computational budget across all three evaluation settings. The total API cost is approximately $61,736 across all experiments. Token counts and pricing formulas are also provided." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "LLM agents experience substantial performance degradation when moving from domain-specific to general-agent evaluation, with average relative drops ranging from 10% to 30%.", 296 "evidence": "Table 3 and Figure 4 show per-model degradation. Claude Sonnet 4.5 shows only -0.2% while Gemini 2.5-Flash shows -31.2%. Detailed per-domain breakdowns in Figure 3 and Table 3.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Claude Sonnet 4.5 exhibits the strongest robustness with only 0.2% average degradation when moving to the general-agent setting.", 301 "evidence": "Table 3 reports Claude Sonnet 4.5 baseline average of 45.1 vs general average of 45.0, yielding -0.2% relative change. Some domains show gains (+33.0% Search, +12.5% Reason) while others show drops (-1.4% Code, -16.8% Tool-Call).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Sequential scaling is bounded by a 'context ceiling' beyond which performance fluctuates or degrades.", 306 "evidence": "Figure 5 (bottom) and Figure 7 show performance plateauing or degrading after context exceeds ~96K-112K tokens. Figure 6 shows instance-level oscillation. Results shown for 5 models across 4 domains.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Parallel scaling reveals a 'verification gap' where models cannot reliably select correct solutions from their sampled trajectories.", 311 "evidence": "Figure 8 shows consistent gap between pass@K and self-choice accuracy across all models and domains. 
GPT-5 as external verifier also fails to close the gap, sometimes underperforming self-judgment.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Increasing parallel sampling K from 1 to 4 yields an average improvement of roughly 50% in pass@K.", 316 "evidence": "Figure 5 (top) shows monotonic increase in pass@K. Specific numbers cited for DeepSeek-V3.2 approaching twofold improvement in coding and reasoning domains.", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "Performance on static long-context benchmarks exhibits weak correlation with agentic performance.", 321 "evidence": "Figure 9 shows Pearson correlations between LongBench, HELMET, MRCR and four agentic domains. Correlations range from 0.04 to 0.71; most are weak, with the exception of reasoning-MRCR (0.71).", 322 "supported": "moderate" 323 } 324 ], 325 "methodology_tags": [ 326 "benchmark-eval" 327 ], 328 "key_findings": "General AgentBench reveals that LLM agents suffer 10-30% performance degradation when transitioning from domain-specific to general-agent evaluation settings, with Claude Sonnet 4.5 being notably robust (-0.2%). Sequential test-time scaling hits a context ceiling where extended interaction histories lead to stagnation or degradation rather than improvement. Parallel test-time scaling increases the theoretical solution space (pass@K) but a persistent verification gap prevents models from reliably selecting correct solutions, limiting practical gains. Static long-context benchmark performance shows weak correlation with agentic task performance.", 329 "red_flags": [ 330 { 331 "flag": "No statistical significance testing", 332 "detail": "All comparisons between models and settings are made on point estimates without confidence intervals, significance tests, or multiple-run variance. With small sample sizes per benchmark (50-124 examples) and single runs, observed differences may not be statistically significant." 
333 }, 334 { 335 "flag": "Unexplained benchmark sampling", 336 "detail": "Table 1 shows substantial subsampling from original benchmarks (e.g., 50 from 500 SWE-Bench instances, 124 from 1266 BrowseComp) without explaining the sampling criteria. If the subset is non-representative, it could bias results." 337 }, 338 { 339 "flag": "No limitations section", 340 "detail": "The paper lacks a dedicated limitations or threats-to-validity section. The Impact Statement mentions only deployment-level risks, not methodological limitations of the study design." 341 }, 342 { 343 "flag": "Confounded comparison between domain-specific and general settings", 344 "detail": "The baseline vs. general comparison conflates multiple variables: tool pool size, prompt differences, context length, and domain isolation. The performance drop attributed to 'general agent' setting could be due to any of these factors, which are not independently controlled." 345 }, 346 { 347 "flag": "Missing contamination analysis", 348 "detail": "Several benchmarks (SWE-Bench, WebVoyager) were published in 2023-2024, well before the training cutoffs of the evaluated 2025-2026 frontier models. No contamination discussion is provided despite this being a benchmark paper." 349 }, 350 { 351 "flag": "Model version opacity", 352 "detail": "Models are identified by marketing names (GPT-5, Claude Sonnet 4.5) without API version strings or snapshot dates. Model behavior can change across API versions, making exact reproduction difficult." 353 } 354 ], 355 "cited_papers": [ 356 { 357 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 358 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"], 359 "year": 2023, 360 "arxiv_id": "2310.06770", 361 "relevance": "Foundational benchmark for evaluating LLM coding agents on real-world software engineering tasks." 
362 }, 363 { 364 "title": "SWE-bench+: Enhanced Coding Benchmark for LLMs", 365 "authors": ["R. Aleithan"], 366 "year": 2024, 367 "arxiv_id": "2410.06992", 368 "relevance": "Extended version of SWE-bench addressing issues in the original benchmark's evaluation methodology." 369 }, 370 { 371 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 372 "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig", "K. Lieret", "S. Yao", "K. Narasimhan", "O. Press"], 373 "year": 2024, 374 "relevance": "Agent framework for software engineering that defines agent-computer interfaces for code editing tasks." 375 }, 376 { 377 "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models", 378 "authors": ["H. He", "W. Yao", "K. Ma", "W. Yu", "Y. Dai", "H. Zhang", "Z. Lan", "D. Yu"], 379 "year": 2024, 380 "arxiv_id": "2401.13919", 381 "relevance": "Web navigation benchmark used as one of the component benchmarks in General AgentBench." 382 }, 383 { 384 "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents", 385 "authors": ["J. Wei", "Z. Sun", "S. Papay", "S. McKinney", "J. Han", "I. Fulford"], 386 "year": 2025, 387 "arxiv_id": "2504.12516", 388 "relevance": "Search benchmark requiring multi-step web investigation, used as a component of General AgentBench." 389 }, 390 { 391 "title": "tau2-bench: Evaluating Conversational Agents in a Dual-Control Environment", 392 "authors": ["V. Barres", "H. Dong", "S. Ray", "X. Si", "K. Narasimhan"], 393 "year": 2025, 394 "arxiv_id": "2506.07982", 395 "relevance": "Tool-use benchmark for conversational agents in service scenarios, used as a component benchmark." 396 }, 397 { 398 "title": "MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers", 399 "authors": ["Z. Wang", "Q. Chang", "H. 
Patel"], 400 "year": 2025, 401 "arxiv_id": "2508.20453", 402 "relevance": "Benchmark for MCP-based tool-use evaluation, directly integrated into General AgentBench." 403 }, 404 { 405 "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective than Scaling Model Parameters", 406 "authors": ["C. Snell", "J. Lee", "K. Xu", "A. Kumar"], 407 "year": 2024, 408 "arxiv_id": "2408.03314", 409 "relevance": "Foundational study on test-time compute scaling that this paper extends to agentic settings." 410 }, 411 { 412 "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling", 413 "authors": ["B. Brown", "J. Juravsky", "R. Ehrlich", "R. Clark", "Q. V. Le", "C. Re", "A. Mirhoseini"], 414 "year": 2024, 415 "arxiv_id": "2407.21787", 416 "relevance": "Studies repeated sampling as a test-time scaling strategy, directly relevant to this paper's parallel scaling analysis." 417 }, 418 { 419 "title": "s1: Simple Test-Time Scaling", 420 "authors": ["N. Muennighoff", "Z. Yang", "W. Shi", "X. L. Li"], 421 "year": 2025, 422 "relevance": "Recent work on test-time scaling in non-agentic settings that this paper contrasts with agentic findings." 423 }, 424 { 425 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 426 "authors": ["D. Guo", "D. Yang", "H. Zhang"], 427 "year": 2025, 428 "arxiv_id": "2501.12948", 429 "relevance": "One of the frontier reasoning models evaluated in the benchmark, representing internal scaling approaches." 430 }, 431 { 432 "title": "GAIA: A Benchmark for General AI Assistants", 433 "authors": ["G. Mialon", "C. Fourrier", "C. Swift", "T. Wolf", "Y. LeCun", "T. Scialom"], 434 "year": 2023, 435 "arxiv_id": "2311.12983", 436 "relevance": "Related general-purpose agent benchmark that evaluates multi-step reasoning and tool use." 437 }, 438 { 439 "title": "Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models", 440 "authors": ["A. Zhou", "K. Yan", "M. 
Shlapentokh-Rothman", "H. Wang", "Y.-X. Wang"], 441 "year": 2023, 442 "arxiv_id": "2310.04406", 443 "relevance": "MCTS-based agent planning approach representing more sophisticated test-time scaling beyond the basic strategies studied here." 444 }, 445 { 446 "title": "AgentBench: Evaluating LLMs as Agents", 447 "authors": ["X. Liu", "H. Yu", "H. Zhang"], 448 "year": 2023, 449 "relevance": "Related multi-task agent benchmark that evaluates LLMs across multiple task categories." 450 } 451 ] 452 }