scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18524B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Evaluation and Benchmarking of LLM Agents: A Survey",
      6     "authors": [
      7       "Mahmoud Mohammadi",
      8       "Yipeng Li",
      9       "Jane Lo",
     10       "Wendy Yip"
     11     ],
     12     "year": 2025,
     13     "venue": "KDD '25",
     14     "arxiv_id": "2507.21504",
     15     "doi": "10.1145/3711896.3736570"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract promises a two-dimensional taxonomy, enterprise-specific challenges, and future research directions — all of which are present in the paper's structure and sections.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper is a taxonomy and narrative survey; it makes no causal claims about what interventions improve evaluation outcomes.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Claims are framed as organizing existing literature and identifying gaps rather than asserting empirical findings; enterprise challenges are framed as observed gaps, not universal laws.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The two-dimensional taxonomy is presented as the natural organizing structure without acknowledging alternative taxonomic frameworks or justifying why this decomposition is superior.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "This is a taxonomy survey with no empirical measurements; there is no gap between measured proxy and claimed outcome to evaluate.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Section 6 on future research directions is forward-looking, not a self-critical limitations section; no limitations or threats-to-validity section exists.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No threats to validity are discussed — the survey's non-systematic paper selection, potential coverage gaps, and enterprise framing bias are not acknowledged.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper never specifies which years, venues, or paper types were included or excluded; boundaries of the review are entirely implicit.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source or acknowledgments section appears in the paper; all four authors are SAP Labs employees but no funding is disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors list SAP Labs with city/location explicitly in the paper header.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All authors are SAP Labs employees; the survey devotes a full section to enterprise-specific challenges (RBAC, compliance, reliability) that align with SAP's commercial interests without disclosing this potential bias.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'LLM-based agents' are explicitly defined as 'autonomous or semi-autonomous systems that use LLMs to reason, plan, and act'; taxonomy dimensions and subcategories are defined with examples.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contributions are stated as two explicit bullet points: (1) a two-dimensional evaluation taxonomy, and (2) identification of enterprise-specific challenges.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper references prior surveys ([121], [107]) and explicitly differentiates its contribution as more holistic and enterprise-focused, though the engagement is brief rather than substantive.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "survey": {
    119       "search_and_selection": {
    120         "search_strategy_reproducible": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No search strategy is described anywhere; the paper reads as a curated narrative review with no explanation of how the 127 references were identified.",
    124           "source": "haiku"
    125         },
    126         "inclusion_exclusion_explicit": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "No inclusion or exclusion criteria are stated; it is impossible to determine why specific benchmarks and papers were included or why others were omitted.",
    130           "source": "haiku"
    131         },
    132         "prisma_or_structured_protocol": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No PRISMA flowchart or any other structured review protocol is mentioned or followed.",
    136           "source": "haiku"
    137         },
    138         "search_terms_provided": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No search queries, keywords, or search strings are provided anywhere in the paper.",
    142           "source": "haiku"
    143         },
    144         "databases_listed": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No databases or sources (e.g., arXiv, ACM DL, Semantic Scholar) are named as having been searched.",
    148           "source": "haiku"
    149         },
    150         "screening_process_documented": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No screening process with paper counts at each stage is documented; the selection process is entirely opaque.",
    154           "source": "haiku"
    155         },
    156         "review_scope_justified": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Temporal scope, venue coverage, and topic boundaries are never justified; the paper claims to cover 'the emerging field' without bounding what qualifies.",
    160           "source": "haiku"
    161         }
    162       },
    163       "synthesis_quality": {
    164         "conflicting_findings_acknowledged": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The paper catalogs benchmarks and methods additively without acknowledging any conflicting evidence or disagreement across the reviewed literature.",
    168           "source": "haiku"
    169         },
    170         "quality_assessment_of_sources": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No quality rubric, risk-of-bias tool, or structured evaluation is applied to any reviewed paper; all cited works are treated as equally reliable.",
    174           "source": "haiku"
    175         },
    176         "publication_bias_discussed": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Publication bias is never mentioned; the survey does not acknowledge that available benchmarks and evaluation papers skew toward positive or publishable results.",
    180           "source": "haiku"
    181         },
    182         "quantitative_synthesis_present": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "There is no meta-analysis, vote counting, or quantitative aggregation of findings across reviewed papers; synthesis is entirely narrative.",
    186           "source": "haiku"
    187         },
    188         "recommendations_supported_by_evidence": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "The four future research directions (holistic frameworks, realistic settings, automated evaluation, time/cost-bounded protocols) are connected to gaps documented through the taxonomy review, though the support is qualitative.",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "Evaluating LLM agents is more complex than evaluating LLMs in isolation because agents operate in dynamic, interactive environments with tools, memory, and coordination.",
    200       "evidence": "Argued conceptually via analogy (engine vs. car) and supported by citing diverse agent benchmarks that require multi-step, environment-aware evaluation beyond static QA.",
    201       "supported": "moderate"
    202     },
    203     {
    204       "claim": "Existing surveys focus narrowly on LLM evaluation or specific agent capabilities without a holistic perspective.",
    205       "evidence": "References [121] and [107] as narrower prior work but does not systematically compare coverage across these surveys.",
    206       "supported": "weak"
    207     },
    208     {
    209       "claim": "Enterprise applications require evaluation considerations (RBAC, reliability guarantees, compliance) that are rarely addressed in existing literature.",
    210       "evidence": "Only IntellAgent [45] and TheAgentCompany [97] are cited as partially addressing enterprise constraints; the 'rarely' claim is asserted rather than verified through systematic coverage analysis.",
    211       "supported": "weak"
    212     },
    213     {
    214       "claim": "Current agents struggle with consistency as measured by the pass^k metric.",
    215       "evidence": "Directly supported by τ-bench [104] results showing agents fail to maintain consistent performance across repeated trials in retail and airline domains.",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "The two-dimensional taxonomy (evaluation objectives × evaluation process) brings clarity to the fragmented agent evaluation landscape.",
    220       "evidence": "The taxonomy is mapped to 127 references and visualized in a hierarchical tree and Table 1, but no formal evaluation of the taxonomy's completeness or comparative utility is provided.",
    221       "supported": "moderate"
    222     }
    223   ],
    224   "methodology_tags": [
    225     "survey",
    226     "qualitative"
    227   ],
    228   "key_findings": "This KDD '25 survey proposes a two-dimensional taxonomy of LLM agent evaluation organized by evaluation objectives (agent behavior, capabilities, reliability, safety) and evaluation process (interaction modes, datasets/benchmarks, metrics computation, tooling, contexts). The paper identifies enterprise-specific evaluation gaps including role-based access control, reliability guarantees across repeated runs, long-horizon interaction assessment, and domain-specific compliance verification — areas underserved by academic benchmarks. Future research directions include holistic multi-dimensional frameworks, more realistic enterprise-like evaluation environments, automated and scalable evaluation techniques, and time/cost-bounded protocols. The survey is non-systematic with no described search methodology, making it a curated overview rather than a rigorous literature review.",
    229   "red_flags": [
    230     {
    231       "flag": "Non-systematic paper selection",
    232       "detail": "No search strategy, inclusion/exclusion criteria, databases searched, or screening process is described; the 127 references appear hand-curated with no transparency about omissions."
    233     },
    234     {
    235       "flag": "No quality assessment of sources",
    236       "detail": "All reviewed benchmarks and evaluation papers are treated equally with no methodological quality assessment, making it impossible to distinguish rigorous from weak evaluations."
    237     },
    238     {
    239       "flag": "Undisclosed enterprise conflict of interest",
    240       "detail": "All four authors are SAP Labs employees; the survey devotes Section 5 to enterprise-specific challenges that align with SAP's commercial interests, with no disclosure of this potential framing bias."
    241     },
    242     {
    243       "flag": "No limitations section",
    244       "detail": "The paper has no limitations or threats-to-validity section, omitting discussion of coverage gaps, selection bias, recency constraints, or non-systematic methodology."
    245     },
    246     {
    247       "flag": "No publication bias acknowledgment",
    248       "detail": "The survey does not acknowledge that the available corpus of agent evaluation benchmarks and papers skews heavily toward positive or publishable results."
    249     }
    250   ],
    251   "cited_papers": [
    252     {
    253       "title": "AgentBench: Evaluating LLMs as Agents",
    254       "relevance": "Central benchmark for evaluating LLMs across diverse task environments; anchor reference for the evaluation objectives dimension"
    255     },
    256     {
    257       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    258       "relevance": "Representative software engineering benchmark illustrating task completion evaluation in coding domains"
    259     },
    260     {
    261       "title": "Holistic Evaluation of Language Models (HELM)",
    262       "relevance": "Reference framework for multi-dimensional evaluation incorporating toxicity, bias, and robustness alongside task performance"
    263     },
    264     {
    265       "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    266       "relevance": "Introduces pass^k consistency metric; primary reference for reliability evaluation in enterprise-relevant domains"
    267     },
    268     {
    269       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    270       "relevance": "Frequently cited as a realistic dynamic evaluation environment exemplifying the online/interactive evaluation mode"
    271     },
    272     {
    273       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    274       "relevance": "Core reference for the safety/harm evaluation dimension"
    275     },
    276     {
    277       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    278       "relevance": "Key benchmark for adversarial robustness and security evaluation of agents"
    279     },
    280     {
    281       "title": "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks",
    282       "relevance": "Enterprise-relevant benchmark with organizational policies; cited in both the compliance evaluation and enterprise challenges sections"
    283     },
    284     {
    285       "title": "Survey on Evaluation of LLM-based Agents (Yehudai et al., 2025)",
    286       "relevance": "Prior related survey against which the authors explicitly position their work, characterizing the earlier survey as narrower in scope"
    287     },
    288     {
    289       "title": "HAL: A Holistic Agent Leaderboard for Centralized and Reproducible Agent Evaluation",
    290       "relevance": "Infrastructure reference for standardized leaderboard-based centralized evaluation"
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 2,
    296       "justification": "Practitioners designing LLM agent evaluation pipelines can use the taxonomy to structure coverage across behavior, capabilities, reliability, and safety dimensions."
    297     },
    298     "surprise_contrarian": {
    299       "score": 1,
    300       "justification": "The enterprise-specific challenges section surfaces underappreciated evaluation requirements (RBAC, pass^k consistency) not commonly foregrounded in academic benchmark literature."
    301     },
    302     "fear_safety": {
    303       "score": 1,
    304       "justification": "The safety section covers harm, toxicity, prompt injection, and compliance risks, but the paper's primary contribution is organizational rather than a safety alarm."
    305     },
    306     "drama_conflict": {
    307       "score": 0,
    308       "justification": "No controversy, debate between competing approaches, or conflicting findings are surfaced; the paper is a neutral taxonomy."
    309     },
    310     "demo_ability": {
    311       "score": 0,
    312       "justification": "The paper offers no artifact, tool, dataset, or interactive system that readers can immediately access or try."
    313     },
    314     "brand_recognition": {
    315       "score": 1,
    316       "justification": "SAP is a well-known enterprise software vendor but not a top-tier AI research lab; KDD '25 venue adds credibility but is not a top-prestige AI venue for surveys."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [
    321       {
    322         "hn_id": "44120359",
    323         "title": "Diffusion vs. Autoregressive Language Models: A Text Embedding Perspective",
    324         "points": 19,
    325         "comments": 1,
    326         "url": "https://news.ycombinator.com/item?id=44120359",
    327         "created_at": "2025-05-28T20:27:45Z"
    328       },
    329       {
    330         "hn_id": "45472586",
    331         "title": "Physics of Learning: A Lagrangian perspective to different learning paradigms",
    332         "points": 3,
    333         "comments": 0,
    334         "url": "https://news.ycombinator.com/item?id=45472586",
    335         "created_at": "2025-10-04T11:38:44Z"
    336       },
    337       {
    338         "hn_id": "36931866",
    339         "title": "Universal and Transferable Adversarial Attacks on LLM",
    340         "points": 3,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=36931866",
    343         "created_at": "2023-07-30T15:04:08Z"
    344       },
    345       {
    346         "hn_id": "45418635",
    347         "title": "Can LLMs Be Creative? Paper: Combinatorial Creativity: A New Frontier",
    348         "points": 2,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=45418635",
    351         "created_at": "2025-09-29T20:53:22Z"
    352       },
    353       {
    354         "hn_id": "41174642",
    355         "title": "Case-Based Reasoning for Explainable Depression Detection on Twitter Using LLMs",
    356         "points": 1,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=41174642",
    359         "created_at": "2024-08-06T19:55:38Z"
    360       },
    361       {
    362         "hn_id": "36903968",
    363         "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    364         "points": 1,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=36903968",
    367         "created_at": "2023-07-28T07:30:39Z"
    368       }
    369     ],
    370     "top_points": 19,
    371     "total_points": 29,
    372     "total_comments": 1
    373   }
    374 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs