scan-v4.json (17555B)
1 { 2 "scan_version": 4, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Evaluation and Benchmarking of LLM Agents: A Survey", 6 "authors": ["Mahmoud Mohammadi", "Yipeng Li", "Jane Lo", "Wendy Yip"], 7 "year": 2025, 8 "venue": "KDD '25", 9 "arxiv_id": "2507.21504", 10 "doi": "10.1145/3711896.3736570" 11 }, 12 "checklist": { 13 "claims_and_evidence": { 14 "abstract_claims_supported": { 15 "applies": true, 16 "answer": true, 17 "justification": "The abstract claims a two-dimensional taxonomy, enterprise challenge coverage, and future directions — all delivered in the paper's six sections.", 18 "source": "haiku" 19 }, 20 "causal_claims_justified": { 21 "applies": false, 22 "answer": false, 23 "justification": "The paper is a taxonomy/survey and makes no causal claims requiring study designs.", 24 "source": "haiku" 25 }, 26 "generalization_bounded": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper presents its taxonomy as a comprehensive framework for 'the field' without bounding which papers were reviewed, what time range was covered, or what domains are excluded.", 30 "source": "haiku" 31 }, 32 "alternative_explanations_discussed": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper presents its two-dimensional taxonomy without discussing alternative taxonomic frameworks or why other organizational schemes were not adopted.", 36 "source": "haiku" 37 }, 38 "proxy_outcome_distinction": { 39 "applies": false, 40 "answer": false, 41 "justification": "The paper is a conceptual taxonomy — it does not measure outcomes or report experimental results.", 42 "source": "haiku" 43 } 44 }, 45 "limitations_and_scope": { 46 "limitations_section_present": { 47 "applies": true, 48 "answer": false, 49 "justification": "Section 6 discusses 'Future Research Directions' but there is no dedicated limitations or threats-to-validity section anywhere in the paper.", 50 "source": "haiku" 51 }, 52 "threats_to_validity_specific": { 53 "applies": true, 54 "answer": false, 55 "justification": "No threats to validity are discussed; the paper does not acknowledge potential gaps in coverage, selection bias, or limitations of the taxonomy.", 56 "source": "haiku" 57 }, 58 "scope_boundaries_stated": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper does not state explicit scope boundaries — no time range, venue filter, paper type, or excluded topic areas are defined.", 62 "source": "haiku" 63 } 64 }, 65 "conflicts_of_interest": { 66 "funding_disclosed": { 67 "applies": true, 68 "answer": false, 69 "justification": "There is no funding acknowledgment or disclosure anywhere in the paper.", 70 "source": "haiku" 71 }, 72 "affiliations_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "All four authors are explicitly listed with SAP Labs affiliations (Bellevue and Palo Alto) on the first page.", 76 "source": "haiku" 77 }, 78 "funder_independent_of_outcome": { 79 "applies": true, 80 "answer": false, 81 "justification": "All authors are from SAP Labs, an enterprise software company, and the paper significantly amplifies enterprise-specific evaluation challenges — a framing that aligns with SAP's business interests.", 82 "source": "haiku" 83 }, 84 "financial_interests_declared": { 85 "applies": true, 86 "answer": false, 87 "justification": "No competing interests, patent disclosures, or financial interests statement is present in the paper.", 88 "source": "haiku" 89 } 90 }, 91 "scope_and_framing": { 92 "key_terms_defined": { 93 "applies": true, 94 "answer": true, 95 "justification": "LLM agents are defined as 'autonomous or semi-autonomous systems that use LLMs to reason, plan, and act'; taxonomy dimensions (behavior, capabilities, reliability, safety) are explained in Section 2.", 96 "source": "haiku" 97 }, 98 "intended_contribution_clear": { 99 "applies": true, 100 "answer": true, 101 "justification": "Two contributions are explicitly bulleted in Section 1: a two-dimensional taxonomy and identification of enterprise-specific challenges.", 102 "source": "haiku" 103 }, 104 "engagement_with_prior_work": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper positions itself against prior surveys ([107], [121]) and extensively references prior benchmarks; however comparative depth is limited to a single paragraph in the introduction.", 108 "source": "haiku" 109 } 110 } 111 }, 112 "type_checklist": { 113 "survey": { 114 "search_and_selection": { 115 "search_strategy_reproducible": { 116 "applies": true, 117 "answer": false, 118 "justification": "No search strategy is described; it is unclear how the 127 cited papers were identified or what databases were queried.", 119 "source": "haiku" 120 }, 121 "inclusion_exclusion_explicit": { 122 "applies": true, 123 "answer": false, 124 "justification": "No inclusion or exclusion criteria are stated anywhere; papers appear selected ad hoc based on author familiarity.", 125 "source": "haiku" 126 }, 127 "prisma_or_structured_protocol": { 128 "applies": true, 129 "answer": false, 130 "justification": "No PRISMA flowchart, structured review protocol, or systematic methodology is mentioned or used.", 131 "source": "haiku" 132 }, 133 "search_terms_provided": { 134 "applies": true, 135 "answer": false, 136 "justification": "No search queries or keywords used to identify papers are provided.", 137 "source": "haiku" 138 }, 139 "databases_listed": { 140 "applies": true, 141 "answer": false, 142 "justification": "No databases, repositories, or search engines used to find papers are mentioned.", 143 "source": "haiku" 144 }, 145 "screening_process_documented": { 146 "applies": true, 147 "answer": false, 148 "justification": "No screening process, stage counts, or PRISMA-style funnel is documented.", 149 "source": "haiku" 150 }, 151 "review_scope_justified": { 152 "applies": true, 153 "answer": false, 154 "justification": "The scope is not justified beyond a vague statement that prior surveys are 'too narrow'; no rationale for time range, venue selection, or topic boundaries is provided.", 155 "source": "haiku" 156 } 157 }, 158 "synthesis_quality": { 159 "conflicting_findings_acknowledged": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper presents a descriptive taxonomy without acknowledging conflicting findings or methodological debates among the benchmarks and evaluation approaches reviewed.", 163 "source": "haiku" 164 }, 165 "quality_assessment_of_sources": { 166 "applies": true, 167 "answer": false, 168 "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of reviewed papers is performed; all cited papers are treated as equally authoritative.", 169 "source": "haiku" 170 }, 171 "publication_bias_discussed": { 172 "applies": true, 173 "answer": false, 174 "justification": "Publication bias is not mentioned; the survey does not acknowledge that published evaluation benchmarks may skew toward positive results or popular approaches.", 175 "source": "haiku" 176 }, 177 "quantitative_synthesis_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "The survey is entirely narrative; there is no meta-analysis, vote counting, effect size aggregation, or even a count of how many papers cover each category.", 181 "source": "haiku" 182 }, 183 "recommendations_supported_by_evidence": { 184 "applies": true, 185 "answer": false, 186 "justification": "Future research directions (holistic frameworks, realistic settings, scalable evaluation) are stated as author opinion without citing systematic evidence of their necessity or comparing alternatives.", 187 "source": "haiku" 188 } 189 } 190 } 191 }, 192 "claims": [ 193 { 194 "claim": "Evaluating LLM agents is more complex than evaluating LLMs in isolation, requiring assessment across dynamic, interactive environments.", 195 "evidence": "Logical argument by analogy (LLM = engine, agent = car) plus enumeration of agent capabilities. No empirical support.", 196 "supported": "moderate" 197 }, 198 { 199 "claim": "Existing surveys focus narrowly on LLM evaluation or specific capabilities without a holistic perspective.", 200 "evidence": "Single citation to [121] plus general assertion; no systematic comparison of prior surveys.", 201 "supported": "weak" 202 }, 203 { 204 "claim": "Enterprise applications introduce requirements (RBAC, compliance, reliability, long-horizon) rarely addressed in current research.", 205 "evidence": "Section 5 describes these challenges with selective citations but no systematic analysis of what fraction of current benchmarks address them.", 206 "supported": "weak" 207 }, 208 { 209 "claim": "Current agents struggle with consistency as measured by the pass^k metric.", 210 "evidence": "Cites τ-bench [104], which showed agents fail consistency tests in retail and airline booking domains.", 211 "supported": "moderate" 212 }, 213 { 214 "claim": "Task completion (success rate) is the predominant measure of overall agent performance.", 215 "evidence": "Supported by citing multiple major benchmarks (SWE-bench, AgentBench, WebArena) all using success rate as their primary metric.", 216 "supported": "strong" 217 }, 218 { 219 "claim": "LLM-as-a-Judge has gained traction for evaluating subjective and nuanced agent responses.", 220 "evidence": "Multiple citations [30, 46, 125, 127] support uptake, though no quantitative comparison with human evaluation baselines is provided.", 221 "supported": "moderate" 222 } 223 ], 224 "methodology_tags": ["theoretical", "qualitative"], 225 "key_findings": "This survey proposes a two-dimensional taxonomy for LLM agent evaluation organized by evaluation objectives (behavior, capabilities, reliability, safety) and evaluation process (interaction modes, datasets, metrics, tooling, contexts). A significant secondary contribution is identification of enterprise-specific evaluation gaps: role-based access control, reliability guarantees, long-horizon interactions, and compliance requirements are largely unaddressed by current benchmarks. The paper is a narrative taxonomy rather than a systematic review — no search strategy, inclusion criteria, or paper quality assessment is reported, making coverage selection opaque. Future directions include holistic, realistic, and scalable evaluation frameworks, but these are opinion-based rather than evidence-driven.", 226 "red_flags": [ 227 { 228 "flag": "No systematic literature review", 229 "detail": "No search strategy, inclusion/exclusion criteria, database listing, or PRISMA protocol. Paper selection appears ad hoc based on author familiarity with the field." 230 }, 231 { 232 "flag": "Enterprise bias from SAP authors", 233 "detail": "All four authors are from SAP Labs, an enterprise software company. The paper disproportionately amplifies enterprise-specific evaluation challenges in a way that aligns with SAP's product direction, without disclosing this potential conflict." 234 }, 235 { 236 "flag": "No funding disclosure", 237 "detail": "No acknowledgment of funding sources or statement that the work is independently funded." 238 }, 239 { 240 "flag": "No quality assessment of sources", 241 "detail": "All 127 cited benchmarks and papers are treated as equally authoritative with no quality filtering, risk-of-bias assessment, or critical appraisal." 242 }, 243 { 244 "flag": "No quantitative synthesis", 245 "detail": "Purely narrative taxonomy with no paper counts per category, coverage statistics, or meta-analytic aggregation across reviewed papers." 246 }, 247 { 248 "flag": "No limitations section", 249 "detail": "The paper has no dedicated limitations or threats-to-validity discussion; coverage gaps and potential selection biases are entirely unacknowledged." 250 } 251 ], 252 "cited_papers": [ 253 { 254 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 255 "relevance": "Key benchmark for coding agent evaluation using GitHub issue resolution as ground truth; highly cited in agent evaluation literature" 256 }, 257 { 258 "title": "AgentBench: Evaluating LLMs as Agents", 259 "relevance": "Multi-environment benchmark covering coding, web, games — foundational agent evaluation framework" 260 }, 261 { 262 "title": "tau-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains", 263 "relevance": "Introduces pass^k consistency metric; demonstrates current agents fail reliability requirements in retail and airline domains" 264 }, 265 { 266 "title": "Holistic Evaluation of Language Models (HELM)", 267 "relevance": "Comprehensive evaluation framework covering accuracy, robustness, bias, toxicity — canonical holistic evaluation reference" 268 }, 269 { 270 "title": "AgentBoard: An Analytical Evaluation Board of Multi-turn LLM Agents", 271 "relevance": "Multi-turn agent evaluation with fine-grained progress rate metric; distinguishes trajectory quality from binary success" 272 }, 273 { 274 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 275 "relevance": "Canonical web agent benchmark used across many evaluation studies in the survey" 276 }, 277 { 278 "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents", 279 "relevance": "Safety evaluation benchmark specifically targeting harmful agent behaviors" 280 }, 281 { 282 "title": "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks", 283 "relevance": "Enterprise task evaluation framework requiring agents to follow organizational policy constraints" 284 }, 285 { 286 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 287 "relevance": "Foundational paper establishing the LLM-as-a-Judge evaluation paradigm discussed extensively in the survey" 288 }, 289 { 290 "title": "Survey on Evaluation of LLM-based Agents", 291 "relevance": "Prior survey explicitly positioned against as 'too narrow', motivating this paper's broader scope" 292 } 293 ], 294 "engagement_factors": { 295 "practical_relevance": { 296 "score": 3, 297 "justification": "Directly useful as a reference for practitioners and researchers designing LLM agent evaluation systems; comprehensive benchmark coverage." 298 }, 299 "surprise_contrarian": { 300 "score": 1, 301 "justification": "The taxonomy is well-organized but not surprising; the enterprise angle adds modest novelty but does not challenge conventional wisdom." 302 }, 303 "fear_safety": { 304 "score": 2, 305 "justification": "Section 3.4 covers harm, toxicity, bias, and compliance with concrete failure examples (CoSafe coreference attacks, ToolEmu failures), raising substantive safety concerns." 306 }, 307 "drama_conflict": { 308 "score": 0, 309 "justification": "No controversy or conflict angle; the paper is a straightforward organizational taxonomy." 310 }, 311 "demo_ability": { 312 "score": 0, 313 "justification": "Conceptual taxonomy paper with no demo, tool, dataset, or interactive artifact." 314 }, 315 "brand_recognition": { 316 "score": 1, 317 "justification": "SAP Labs is a recognized enterprise software company but not a top AI research lab; KDD venue adds credibility." 318 } 319 }, 320 "hn_data": { 321 "threads": [ 322 {"hn_id": "44120359", "title": "Diffusion vs. Autoregressive Language Models: A Text Embedding Perspective", "points": 19, "comments": 1, "url": "https://news.ycombinator.com/item?id=44120359", "created_at": "2025-05-28T20:27:45Z"}, 323 {"hn_id": "45472586", "title": "Physics of Learning: A Lagrangian perspective to different learning paradigms", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=45472586", "created_at": "2025-10-04T11:38:44Z"}, 324 {"hn_id": "36931866", "title": "Universal and Transferable Adversarial Attacks on LLM", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=36931866", "created_at": "2023-07-30T15:04:08Z"}, 325 {"hn_id": "45418635", "title": "Can LLMs Be Creative? Paper: Combinatorial Creativity: A New Frontier", "points": 2, "comments": 0, "url": "https://news.ycombinator.com/item?id=45418635", "created_at": "2025-09-29T20:53:22Z"}, 326 {"hn_id": "41174642", "title": "Case-Based Reasoning for Explainable Depression Detection on Twitter Using LLMs", "points": 1, "comments": 0, "url": "https://news.ycombinator.com/item?id=41174642", "created_at": "2024-08-06T19:55:38Z"}, 327 {"hn_id": "36903968", "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", "points": 1, "comments": 0, "url": "https://news.ycombinator.com/item?id=36903968", "created_at": "2023-07-28T07:30:39Z"} 328 ], 329 "top_points": 19, 330 "total_points": 29, 331 "total_comments": 1 332 } 333 }