scan-v5.json (18524B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Evaluation and Benchmarking of LLM Agents: A Survey", 6 "authors": [ 7 "Mahmoud Mohammadi", 8 "Yipeng Li", 9 "Jane Lo", 10 "Wendy Yip" 11 ], 12 "year": 2025, 13 "venue": "KDD '25", 14 "arxiv_id": "2507.21504", 15 "doi": "10.1145/3711896.3736570" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "The abstract promises a two-dimensional taxonomy, enterprise-specific challenges, and future research directions — all of which are present in the paper's structure and sections.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": false, 27 "answer": false, 28 "justification": "The paper is a taxonomy and narrative survey; it makes no causal claims about what interventions improve evaluation outcomes.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "Claims are framed as organizing existing literature and identifying gaps rather than asserting empirical findings; enterprise challenges are framed as observed gaps, not universal laws.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The two-dimensional taxonomy is presented as the natural organizing structure without acknowledging alternative taxonomic frameworks or justifying why this decomposition is superior.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": false, 45 "answer": false, 46 "justification": "This is a taxonomy survey with no empirical measurements; there is no gap between measured proxy and claimed outcome to evaluate.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": false, 54 "justification": "Section 6 on future research directions is forward-looking, not a 
self-critical limitations section; no limitations or threats-to-validity section exists.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": false, 60 "justification": "No threats to validity are discussed — the survey's non-systematic paper selection, potential coverage gaps, and enterprise framing bias are not acknowledged.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper never specifies which years, venues, or paper types were included or excluded; boundaries of the review are entirely implicit.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding source or acknowledgments section appears in the paper; all four authors are SAP Labs employees but no funding is disclosed.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "All four authors list SAP Labs with city/location explicitly in the paper header.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "All authors are SAP Labs employees; the survey devotes a full section to enterprise-specific challenges (RBAC, compliance, reliability) that align with SAP's commercial interests without disclosing this potential bias.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears anywhere in the paper.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "'LLM-based agents' are explicitly defined as 'autonomous or semi-autonomous systems that use LLMs to reason, plan, and act'; taxonomy 
dimensions and subcategories are defined with examples.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Contributions are stated as two explicit bullet points: (1) a two-dimensional evaluation taxonomy, and (2) identification of enterprise-specific challenges.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper references prior surveys ([121], [107]) and explicitly differentiates its contribution as more holistic and enterprise-focused, though the engagement is brief rather than substantive.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "survey": { 119 "search_and_selection": { 120 "search_strategy_reproducible": { 121 "applies": true, 122 "answer": false, 123 "justification": "No search strategy is described anywhere; the paper reads as a curated narrative review with no explanation of how the 127 references were identified.", 124 "source": "haiku" 125 }, 126 "inclusion_exclusion_explicit": { 127 "applies": true, 128 "answer": false, 129 "justification": "No inclusion or exclusion criteria are stated; it is impossible to determine why specific benchmarks and papers were included or why others were omitted.", 130 "source": "haiku" 131 }, 132 "prisma_or_structured_protocol": { 133 "applies": true, 134 "answer": false, 135 "justification": "No PRISMA flowchart or any other structured review protocol is mentioned or followed.", 136 "source": "haiku" 137 }, 138 "search_terms_provided": { 139 "applies": true, 140 "answer": false, 141 "justification": "No search queries, keywords, or search strings are provided anywhere in the paper.", 142 "source": "haiku" 143 }, 144 "databases_listed": { 145 "applies": true, 146 "answer": false, 147 "justification": "No databases or sources (e.g., arXiv, ACM DL, Semantic Scholar) are named as having been searched.", 148 "source": "haiku" 149 
}, 150 "screening_process_documented": { 151 "applies": true, 152 "answer": false, 153 "justification": "No screening process with paper counts at each stage is documented; the selection process is entirely opaque.", 154 "source": "haiku" 155 }, 156 "review_scope_justified": { 157 "applies": true, 158 "answer": false, 159 "justification": "Temporal scope, venue coverage, and topic boundaries are never justified; the paper claims to cover 'the emerging field' without bounding what qualifies.", 160 "source": "haiku" 161 } 162 }, 163 "synthesis_quality": { 164 "conflicting_findings_acknowledged": { 165 "applies": true, 166 "answer": false, 167 "justification": "The paper catalogs benchmarks and methods additively without acknowledging any conflicting evidence or disagreement across the reviewed literature.", 168 "source": "haiku" 169 }, 170 "quality_assessment_of_sources": { 171 "applies": true, 172 "answer": false, 173 "justification": "No quality rubric, risk-of-bias tool, or structured evaluation is applied to any reviewed paper; all cited works are treated as equally reliable.", 174 "source": "haiku" 175 }, 176 "publication_bias_discussed": { 177 "applies": true, 178 "answer": false, 179 "justification": "Publication bias is never mentioned; the survey does not acknowledge that available benchmarks and evaluation papers skew toward positive or publishable results.", 180 "source": "haiku" 181 }, 182 "quantitative_synthesis_present": { 183 "applies": true, 184 "answer": false, 185 "justification": "There is no meta-analysis, vote counting, or quantitative aggregation of findings across reviewed papers; synthesis is entirely narrative.", 186 "source": "haiku" 187 }, 188 "recommendations_supported_by_evidence": { 189 "applies": true, 190 "answer": true, 191 "justification": "The four future research directions (holistic frameworks, realistic settings, automated evaluation, time/cost-bounded protocols) are connected to gaps documented through the taxonomy review, 
though the support is qualitative.", 192 "source": "haiku" 193 } 194 } 195 } 196 }, 197 "claims": [ 198 { 199 "claim": "Evaluating LLM agents is more complex than evaluating LLMs in isolation because agents operate in dynamic, interactive environments with tools, memory, and coordination.", 200 "evidence": "Argued conceptually via analogy (engine vs. car) and supported by citing diverse agent benchmarks that require multi-step, environment-aware evaluation beyond static QA.", 201 "supported": "moderate" 202 }, 203 { 204 "claim": "Existing surveys focus narrowly on LLM evaluation or specific agent capabilities without a holistic perspective.", 205 "evidence": "References [121] and [107] as narrower prior work but does not systematically compare coverage across these surveys.", 206 "supported": "weak" 207 }, 208 { 209 "claim": "Enterprise applications require evaluation considerations (RBAC, reliability guarantees, compliance) that are rarely addressed in existing literature.", 210 "evidence": "Only IntellAgent [45] and TheAgentCompany [97] are cited as partially addressing enterprise constraints; the 'rarely' claim is asserted rather than verified through systematic coverage analysis.", 211 "supported": "weak" 212 }, 213 { 214 "claim": "Current agents struggle with consistency as measured by the pass^k metric.", 215 "evidence": "Directly supported by τ-bench [104] results showing agents fail to maintain consistent performance across repeated trials in retail and airline domains.", 216 "supported": "strong" 217 }, 218 { 219 "claim": "The two-dimensional taxonomy (evaluation objectives × evaluation process) brings clarity to the fragmented agent evaluation landscape.", 220 "evidence": "The taxonomy is mapped to 127 references and visualized in a hierarchical tree and Table 1, but no formal evaluation of the taxonomy's completeness or comparative utility is provided.", 221 "supported": "moderate" 222 } 223 ], 224 "methodology_tags": [ 225 "survey", 226 "qualitative" 
227 ], 228 "key_findings": "This KDD '25 survey proposes a two-dimensional taxonomy of LLM agent evaluation organized by evaluation objectives (agent behavior, capabilities, reliability, safety) and evaluation process (interaction modes, datasets/benchmarks, metrics computation, tooling, contexts). The paper identifies enterprise-specific evaluation gaps including role-based access control, reliability guarantees across repeated runs, long-horizon interaction assessment, and domain-specific compliance verification — areas underserved by academic benchmarks. Future research directions include holistic multi-dimensional frameworks, more realistic enterprise-like evaluation environments, automated and scalable evaluation techniques, and time/cost-bounded protocols. The survey is non-systematic with no described search methodology, making it a curated overview rather than a rigorous literature review.", 229 "red_flags": [ 230 { 231 "flag": "Non-systematic paper selection", 232 "detail": "No search strategy, inclusion/exclusion criteria, databases searched, or screening process is described; the 127 references appear hand-curated with no transparency about omissions." 233 }, 234 { 235 "flag": "No quality assessment of sources", 236 "detail": "All reviewed benchmarks and evaluation papers are treated equally with no methodological quality assessment, making it impossible to distinguish rigorous from weak evaluations." 237 }, 238 { 239 "flag": "Undisclosed enterprise conflict of interest", 240 "detail": "All four authors are SAP Labs employees; the survey devotes Section 5 to enterprise-specific challenges that align with SAP's commercial interests, with no disclosure of this potential framing bias." 241 }, 242 { 243 "flag": "No limitations section", 244 "detail": "The paper has no limitations or threats-to-validity section, omitting discussion of coverage gaps, selection bias, recency constraints, or non-systematic methodology." 
245 }, 246 { 247 "flag": "No publication bias acknowledgment", 248 "detail": "The survey does not acknowledge that the available corpus of agent evaluation benchmarks and papers skews heavily toward positive or publishable results." 249 } 250 ], 251 "cited_papers": [ 252 { 253 "title": "AgentBench: Evaluating LLMs as Agents", 254 "relevance": "Central benchmark for evaluating LLMs across diverse task environments; anchor reference for the evaluation objectives dimension" 255 }, 256 { 257 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 258 "relevance": "Representative software engineering benchmark illustrating task completion evaluation in coding domains" 259 }, 260 { 261 "title": "Holistic Evaluation of Language Models (HELM)", 262 "relevance": "Reference framework for multi-dimensional evaluation incorporating toxicity, bias, and robustness alongside task performance" 263 }, 264 { 265 "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains", 266 "relevance": "Introduces pass^k consistency metric; primary reference for reliability evaluation in enterprise-relevant domains" 267 }, 268 { 269 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 270 "relevance": "Frequently cited as a realistic dynamic evaluation environment exemplifying online/interactive evaluation mode" 271 }, 272 { 273 "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents", 274 "relevance": "Core reference for the safety/harm evaluation dimension" 275 }, 276 { 277 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 278 "relevance": "Key benchmark for adversarial robustness and security evaluation of agents" 279 }, 280 { 281 "title": "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks", 282 "relevance": "Enterprise-relevant benchmark with organizational policies; cited for both compliance evaluation and enterprise 
challenges sections" 283 }, 284 { 285 "title": "Survey on Evaluation of LLM-based Agents (Yehudai et al., 2025)", 286 "relevance": "Prior related survey that the authors explicitly contrast with their own work, characterizing it as narrower in scope" 287 }, 288 { 289 "title": "HAL: A Holistic Agent Leaderboard for Centralized and Reproducible Agent Evaluation", 290 "relevance": "Infrastructure reference for standardized leaderboard-based centralized evaluation" 291 } 292 ], 293 "engagement_factors": { 294 "practical_relevance": { 295 "score": 2, 296 "justification": "Practitioners designing LLM agent evaluation pipelines can use the taxonomy to structure coverage across behavior, capabilities, reliability, and safety dimensions." 297 }, 298 "surprise_contrarian": { 299 "score": 1, 300 "justification": "The enterprise-specific challenges section surfaces underappreciated evaluation requirements (RBAC, pass^k consistency) not commonly foregrounded in academic benchmark literature." 301 }, 302 "fear_safety": { 303 "score": 1, 304 "justification": "The safety section covers harm, toxicity, prompt injection, and compliance risks, but the paper's primary contribution is organizational rather than a safety alarm." 305 }, 306 "drama_conflict": { 307 "score": 0, 308 "justification": "No controversy, debate between competing approaches, or conflicting findings are surfaced; the paper is a neutral taxonomy." 309 }, 310 "demo_ability": { 311 "score": 0, 312 "justification": "The paper offers no artifact, tool, dataset, or interactive system that readers can immediately access or try." 313 }, 314 "brand_recognition": { 315 "score": 1, 316 "justification": "SAP is a well-known enterprise software vendor but not a top-tier AI research lab; KDD '25 venue adds credibility but is not a top-prestige AI venue for surveys." 317 } 318 }, 319 "hn_data": { 320 "threads": [ 321 { 322 "hn_id": "44120359", 323 "title": "Diffusion vs. 
Autoregressive Language Models: A Text Embedding Perspective", 324 "points": 19, 325 "comments": 1, 326 "url": "https://news.ycombinator.com/item?id=44120359", 327 "created_at": "2025-05-28T20:27:45Z" 328 }, 329 { 330 "hn_id": "45472586", 331 "title": "Physics of Learning: A Lagrangian perspective to different learning paradigms", 332 "points": 3, 333 "comments": 0, 334 "url": "https://news.ycombinator.com/item?id=45472586", 335 "created_at": "2025-10-04T11:38:44Z" 336 }, 337 { 338 "hn_id": "36931866", 339 "title": "Universal and Transferable Adversarial Attacks on LLM", 340 "points": 3, 341 "comments": 0, 342 "url": "https://news.ycombinator.com/item?id=36931866", 343 "created_at": "2023-07-30T15:04:08Z" 344 }, 345 { 346 "hn_id": "45418635", 347 "title": "Can LLMs Be Creative? Paper: Combinatorial Creativity: A New Frontier", 348 "points": 2, 349 "comments": 0, 350 "url": "https://news.ycombinator.com/item?id=45418635", 351 "created_at": "2025-09-29T20:53:22Z" 352 }, 353 { 354 "hn_id": "41174642", 355 "title": "Case-Based Reasoning for Explainable Depression Detection on Twitter Using LLMs", 356 "points": 1, 357 "comments": 0, 358 "url": "https://news.ycombinator.com/item?id=41174642", 359 "created_at": "2024-08-06T19:55:38Z" 360 }, 361 { 362 "hn_id": "36903968", 363 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 364 "points": 1, 365 "comments": 0, 366 "url": "https://news.ycombinator.com/item?id=36903968", 367 "created_at": "2023-07-28T07:30:39Z" 368 } 369 ], 370 "top_points": 19, 371 "total_points": 29, 372 "total_comments": 1 373 } 374 }