ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (17555B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Evaluation and Benchmarking of LLM Agents: A Survey",
      6     "authors": ["Mahmoud Mohammadi", "Yipeng Li", "Jane Lo", "Wendy Yip"],
      7     "year": 2025,
      8     "venue": "KDD '25",
      9     "arxiv_id": "2507.21504",
     10     "doi": "10.1145/3711896.3736570"
     11   },
     12   "checklist": {
     13     "claims_and_evidence": {
     14       "abstract_claims_supported": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The abstract claims a two-dimensional taxonomy, enterprise challenge coverage, and future directions — all delivered in the paper's six sections.",
     18         "source": "haiku"
     19       },
     20       "causal_claims_justified": {
     21         "applies": false,
     22         "answer": false,
     23         "justification": "The paper is a taxonomy/survey and makes no causal claims requiring study designs.",
     24         "source": "haiku"
     25       },
     26       "generalization_bounded": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper presents its taxonomy as a comprehensive framework for 'the field' without bounding which papers were reviewed, what time range was covered, or what domains are excluded.",
     30         "source": "haiku"
     31       },
     32       "alternative_explanations_discussed": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper presents its two-dimensional taxonomy without discussing alternative taxonomic frameworks or why other organizational schemes were not adopted.",
     36         "source": "haiku"
     37       },
     38       "proxy_outcome_distinction": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper is a conceptual taxonomy — it does not measure outcomes or report experimental results.",
     42         "source": "haiku"
     43       }
     44     },
     45     "limitations_and_scope": {
     46       "limitations_section_present": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Section 6 discusses 'Future Research Directions' but there is no dedicated limitations or threats-to-validity section anywhere in the paper.",
     50         "source": "haiku"
     51       },
     52       "threats_to_validity_specific": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No threats to validity are discussed; the paper does not acknowledge potential gaps in coverage, selection bias, or limitations of the taxonomy.",
     56         "source": "haiku"
     57       },
     58       "scope_boundaries_stated": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper does not state explicit scope boundaries — no time range, venue filter, paper type, or excluded topic areas are defined.",
     62         "source": "haiku"
     63       }
     64     },
     65     "conflicts_of_interest": {
     66       "funding_disclosed": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "There is no funding acknowledgment or disclosure anywhere in the paper.",
     70         "source": "haiku"
     71       },
     72       "affiliations_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "All four authors are explicitly listed with SAP Labs affiliations (Bellevue and Palo Alto) on the first page.",
     76         "source": "haiku"
     77       },
     78       "funder_independent_of_outcome": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "All authors are from SAP Labs, an enterprise software company, and the paper significantly amplifies enterprise-specific evaluation challenges — a framing that aligns with SAP's business interests.",
     82         "source": "haiku"
     83       },
     84       "financial_interests_declared": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No competing interests, patent disclosures, or financial interests statement is present in the paper.",
     88         "source": "haiku"
     89       }
     90     },
     91     "scope_and_framing": {
     92       "key_terms_defined": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "LLM agents are defined as 'autonomous or semi-autonomous systems that use LLMs to reason, plan, and act'; the evaluation-objective categories of the taxonomy (behavior, capabilities, reliability, safety) are explained in Section 2.",
     96         "source": "haiku"
     97       },
     98       "intended_contribution_clear": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Two contributions are explicitly bulleted in Section 1: a two-dimensional taxonomy and identification of enterprise-specific challenges.",
    102         "source": "haiku"
    103       },
    104       "engagement_with_prior_work": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper positions itself against prior surveys ([107], [121]) and extensively references prior benchmarks; however, comparative depth is limited to a single paragraph in the introduction.",
    108         "source": "haiku"
    109       }
    110     }
    111   },
    112   "type_checklist": {
    113     "survey": {
    114       "search_and_selection": {
    115         "search_strategy_reproducible": {
    116           "applies": true,
    117           "answer": false,
    118           "justification": "No search strategy is described; it is unclear how the 127 cited papers were identified or what databases were queried.",
    119           "source": "haiku"
    120         },
    121         "inclusion_exclusion_explicit": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No inclusion or exclusion criteria are stated anywhere; papers appear selected ad hoc based on author familiarity.",
    125           "source": "haiku"
    126         },
    127         "prisma_or_structured_protocol": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No PRISMA flowchart, structured review protocol, or systematic methodology is mentioned or used.",
    131           "source": "haiku"
    132         },
    133         "search_terms_provided": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No search queries or keywords used to identify papers are provided.",
    137           "source": "haiku"
    138         },
    139         "databases_listed": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No databases, repositories, or search engines used to find papers are mentioned.",
    143           "source": "haiku"
    144         },
    145         "screening_process_documented": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No screening process, stage counts, or PRISMA-style funnel is documented.",
    149           "source": "haiku"
    150         },
    151         "review_scope_justified": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The scope is not justified beyond a vague statement that prior surveys are 'too narrow'; no rationale for time range, venue selection, or topic boundaries is provided.",
    155           "source": "haiku"
    156         }
    157       },
    158       "synthesis_quality": {
    159         "conflicting_findings_acknowledged": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper presents a descriptive taxonomy without acknowledging conflicting findings or methodological debates among the benchmarks and evaluation approaches reviewed.",
    163           "source": "haiku"
    164         },
    165         "quality_assessment_of_sources": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of reviewed papers is performed; all cited papers are treated as equally authoritative.",
    169           "source": "haiku"
    170         },
    171         "publication_bias_discussed": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Publication bias is not mentioned; the survey does not acknowledge that published evaluation benchmarks may skew toward positive results or popular approaches.",
    175           "source": "haiku"
    176         },
    177         "quantitative_synthesis_present": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The survey is entirely narrative; there is no meta-analysis, vote counting, effect size aggregation, or even a count of how many papers cover each category.",
    181           "source": "haiku"
    182         },
    183         "recommendations_supported_by_evidence": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "Future research directions (holistic frameworks, realistic settings, scalable evaluation) are stated as author opinion without citing systematic evidence of their necessity or comparing alternatives.",
    187           "source": "haiku"
    188         }
    189       }
    190     }
    191   },
    192   "claims": [
    193     {
    194       "claim": "Evaluating LLM agents is more complex than evaluating LLMs in isolation, requiring assessment across dynamic, interactive environments.",
    195       "evidence": "Logical argument by analogy (LLM = engine, agent = car) plus enumeration of agent capabilities. No empirical support.",
    196       "supported": "moderate"
    197     },
    198     {
    199       "claim": "Existing surveys focus narrowly on LLM evaluation or specific capabilities without a holistic perspective.",
    200       "evidence": "Single citation to [121] plus general assertion; no systematic comparison of prior surveys.",
    201       "supported": "weak"
    202     },
    203     {
    204       "claim": "Enterprise applications introduce requirements (RBAC, compliance, reliability, long-horizon) rarely addressed in current research.",
    205       "evidence": "Section 5 describes these challenges with selective citations but no systematic analysis of what fraction of current benchmarks address them.",
    206       "supported": "weak"
    207     },
    208     {
    209       "claim": "Current agents struggle with consistency as measured by the pass^k metric.",
    210       "evidence": "Cites τ-bench [104], which showed agents fail consistency tests in retail and airline booking domains.",
    211       "supported": "moderate"
    212     },
    213     {
    214       "claim": "Task completion (success rate) is the predominant measure of overall agent performance.",
    215       "evidence": "Supported by citing multiple major benchmarks (SWE-bench, AgentBench, WebArena) all using success rate as their primary metric.",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "LLM-as-a-Judge has gained traction for evaluating subjective and nuanced agent responses.",
    220       "evidence": "Multiple citations [30, 46, 125, 127] support uptake, though no quantitative comparison with human evaluation baselines is provided.",
    221       "supported": "moderate"
    222     }
    223   ],
    224   "methodology_tags": ["theoretical", "qualitative"],
    225   "key_findings": "This survey proposes a two-dimensional taxonomy for LLM agent evaluation organized by evaluation objectives (behavior, capabilities, reliability, safety) and evaluation process (interaction modes, datasets, metrics, tooling, contexts). A significant secondary contribution is identification of enterprise-specific evaluation gaps: role-based access control, reliability guarantees, long-horizon interactions, and compliance requirements are largely unaddressed by current benchmarks. The paper is a narrative taxonomy rather than a systematic review — no search strategy, inclusion criteria, or paper quality assessment is reported, making coverage selection opaque. Future directions include holistic, realistic, and scalable evaluation frameworks, but these are opinion-based rather than evidence-driven.",
    226   "red_flags": [
    227     {
    228       "flag": "No systematic literature review",
    229       "detail": "No search strategy, inclusion/exclusion criteria, database listing, or PRISMA protocol. Paper selection appears ad hoc based on author familiarity with the field."
    230     },
    231     {
    232       "flag": "Enterprise bias from SAP authors",
    233       "detail": "All four authors are from SAP Labs, an enterprise software company. The paper disproportionately amplifies enterprise-specific evaluation challenges in a way that aligns with SAP's product direction, without disclosing this potential conflict."
    234     },
    235     {
    236       "flag": "No funding disclosure",
    237       "detail": "No acknowledgment of funding sources or statement that the work is independently funded."
    238     },
    239     {
    240       "flag": "No quality assessment of sources",
    241       "detail": "All 127 cited benchmarks and papers are treated as equally authoritative with no quality filtering, risk-of-bias assessment, or critical appraisal."
    242     },
    243     {
    244       "flag": "No quantitative synthesis",
    245       "detail": "Purely narrative taxonomy with no paper counts per category, coverage statistics, or meta-analytic aggregation across reviewed papers."
    246     },
    247     {
    248       "flag": "No limitations section",
    249       "detail": "The paper has no dedicated limitations or threats-to-validity discussion; coverage gaps and potential selection biases are entirely unacknowledged."
    250     }
    251   ],
    252   "cited_papers": [
    253     {
    254       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    255       "relevance": "Key benchmark for coding agent evaluation using GitHub issue resolution as ground truth; highly cited in agent evaluation literature"
    256     },
    257     {
    258       "title": "AgentBench: Evaluating LLMs as Agents",
    259       "relevance": "Multi-environment benchmark covering coding, web, games — foundational agent evaluation framework"
    260     },
    261     {
    262       "title": "tau-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    263       "relevance": "Introduces pass^k consistency metric; demonstrates current agents fail reliability requirements in retail and airline domains"
    264     },
    265     {
    266       "title": "Holistic Evaluation of Language Models (HELM)",
    267       "relevance": "Comprehensive evaluation framework covering accuracy, robustness, bias, toxicity — canonical holistic evaluation reference"
    268     },
    269     {
    270       "title": "AgentBoard: An Analytical Evaluation Board of Multi-turn LLM Agents",
    271       "relevance": "Multi-turn agent evaluation with fine-grained progress rate metric; distinguishes trajectory quality from binary success"
    272     },
    273     {
    274       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    275       "relevance": "Canonical web agent benchmark used across many evaluation studies in the survey"
    276     },
    277     {
    278       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    279       "relevance": "Safety evaluation benchmark specifically targeting harmful agent behaviors"
    280     },
    281     {
    282       "title": "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks",
    283       "relevance": "Enterprise task evaluation framework requiring agents to follow organizational policy constraints"
    284     },
    285     {
    286       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    287       "relevance": "Foundational paper establishing the LLM-as-a-Judge evaluation paradigm discussed extensively in the survey"
    288     },
    289     {
    290       "title": "Survey on Evaluation of LLM-based Agents",
    291       "relevance": "Prior survey that the authors explicitly position their work against as 'too narrow', motivating this paper's broader scope"
    292     }
    293   ],
    294   "engagement_factors": {
    295     "practical_relevance": {
    296       "score": 3,
    297       "justification": "Directly useful as a reference for practitioners and researchers designing LLM agent evaluation systems; comprehensive benchmark coverage."
    298     },
    299     "surprise_contrarian": {
    300       "score": 1,
    301       "justification": "The taxonomy is well-organized but not surprising; the enterprise angle adds modest novelty but does not challenge conventional wisdom."
    302     },
    303     "fear_safety": {
    304       "score": 2,
    305       "justification": "Section 3.4 covers harm, toxicity, bias, and compliance with concrete failure examples (CoSafe coreference attacks, ToolEmu failures), raising substantive safety concerns."
    306     },
    307     "drama_conflict": {
    308       "score": 0,
    309       "justification": "No controversy or conflict angle; the paper is a straightforward organizational taxonomy."
    310     },
    311     "demo_ability": {
    312       "score": 0,
    313       "justification": "Conceptual taxonomy paper with no demo, tool, dataset, or interactive artifact."
    314     },
    315     "brand_recognition": {
    316       "score": 1,
    317       "justification": "SAP Labs is a recognized enterprise software company but not a top AI research lab; KDD venue adds credibility."
    318     }
    319   },
    320   "hn_data": {
    321     "threads": [
    322       {"hn_id": "44120359", "title": "Diffusion vs. Autoregressive Language Models: A Text Embedding Perspective", "points": 19, "comments": 1, "url": "https://news.ycombinator.com/item?id=44120359", "created_at": "2025-05-28T20:27:45Z"},
    323       {"hn_id": "45472586", "title": "Physics of Learning: A Lagrangian perspective to different learning paradigms", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=45472586", "created_at": "2025-10-04T11:38:44Z"},
    324       {"hn_id": "36931866", "title": "Universal and Transferable Adversarial Attacks on LLM", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=36931866", "created_at": "2023-07-30T15:04:08Z"},
    325       {"hn_id": "45418635", "title": "Can LLMs Be Creative? Paper: Combinatorial Creativity: A New Frontier", "points": 2, "comments": 0, "url": "https://news.ycombinator.com/item?id=45418635", "created_at": "2025-09-29T20:53:22Z"},
    326       {"hn_id": "41174642", "title": "Case-Based Reasoning for Explainable Depression Detection on Twitter Using LLMs", "points": 1, "comments": 0, "url": "https://news.ycombinator.com/item?id=41174642", "created_at": "2024-08-06T19:55:38Z"},
    327       {"hn_id": "36903968", "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", "points": 1, "comments": 0, "url": "https://news.ycombinator.com/item?id=36903968", "created_at": "2023-07-28T07:30:39Z"}
    328     ],
    329     "top_points": 19,
    330     "total_points": 29,
    331     "total_comments": 1
    332   }
    333 }

Impressum · Datenschutz