ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18585B)


      1 {
      2   "paper": {
      3     "title": "Evaluation and Benchmarking of LLM Agents: A Survey",
      4     "authors": ["Mahmoud Mohammadi", "Yipeng Li", "Jane Lo", "Wendy Yip"],
      5     "year": 2025,
      6     "venue": "KDD '25",
      7     "arxiv_id": "2507.21504",
      8     "doi": "10.1145/3711896.3736570"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This survey proposes a two-dimensional taxonomy for LLM agent evaluation organized by evaluation objectives (behavior, capabilities, reliability, safety) and evaluation process (interaction modes, data, metrics, tooling, contexts). It identifies enterprise-specific challenges including role-based access control, reliability guarantees, long-horizon interactions, and compliance requirements that are underexplored in current research. The paper catalogues metrics, benchmarks, and frameworks across each taxonomy dimension but does not perform any quantitative analysis or quality assessment of the surveyed work.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository or analysis scripts are mentioned or released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset of surveyed papers, extracted metadata, or structured corpus is released."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "Pure survey paper with no computational experiments requiring an environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No instructions for reproducing the survey search or paper selection process."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper with no quantitative experiments or statistical aggregation."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No statistical comparisons are made in this survey."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No experiments or meta-analytic effect size calculations."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No experiments with sample sizes."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs to report variance across."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not compare its taxonomy or coverage against prior surveys of LLM agent evaluation (e.g., Yehudai et al. [107] or Zhang et al. [121] are mentioned but not systematically compared)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No systematic comparison with prior surveys to assess whether baselines are contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate; this is a taxonomy/survey paper."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation of any system is performed."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No experiments requiring train/test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides a detailed breakdown of evaluation objectives, metrics, and relevant papers organized by category (Agent Behavior, Agent Capability, Reliability, Safety)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5 discusses enterprise-specific challenges where current evaluation approaches fail, and Section 6 outlines gaps in existing methods."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper identifies multiple areas where current evaluation is insufficient: lack of holistic frameworks, unrealistic evaluation settings, expensive manual evaluation, and missing enterprise considerations."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims to provide an in-depth overview and two-dimensional taxonomy, which the body delivers through Sections 2-6. The abstract claims to highlight enterprise challenges, addressed in Section 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal claims; it proposes a taxonomy and surveys existing work."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper presents itself as comprehensive ('in-depth overview of the emerging field') but does not state explicit scope boundaries for which papers or domains were included or excluded from the survey."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "Pure taxonomy/survey paper presenting no empirical results requiring alternative explanations."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No measurements are taken; this is a survey paper."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models are used in this survey."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No experiments with hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how papers were selected for inclusion in the survey. No search queries, databases searched, inclusion/exclusion criteria, or PRISMA-style filtering pipeline is documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. Section 6 discusses future research directions but does not address limitations of the survey itself."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed for the survey methodology."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what is excluded from scope. It does not define inclusion criteria, time bounds, or which types of agent evaluation work are out of scope."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No list of surveyed papers, search results, or underlying data is made available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not describe how the 127 references were identified, what databases were searched, or what search terms were used."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; the data source is published literature, so there is no recruitment to describe."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of how papers were collected, screened, or organized into the taxonomy."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding statement or acknowledgments section is present. All authors are from SAP Labs, which suggests corporate funding, but the funding source is never stated."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All four authors list SAP Labs affiliations prominently in the paper header."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "SAP Labs (the employer) has a commercial interest in enterprise AI agent evaluation. The paper highlights enterprise-specific challenges that align with SAP's business interests. No independence statement is provided."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper; does not evaluate any pre-trained model on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Survey paper; no model evaluation."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Survey paper; no model evaluation."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this survey."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Survey paper with no computational method."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Survey paper with no computational method."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No PRISMA flow diagram, structured search strategy, or review protocol is described. The paper appears to use ad-hoc paper collection."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey treats all cited papers equally without assessing their methodological quality. Benchmarks and frameworks are catalogued without evaluating whether the cited work is rigorous."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias in the surveyed literature. The paper does not consider whether published evaluation methods skew toward positive demonstrations."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "LLM agent evaluation requires a two-dimensional taxonomy organized by evaluation objectives (what to evaluate) and evaluation process (how to evaluate).",
    313       "evidence": "The taxonomy is presented in Section 2 and Figure 1, with detailed breakdowns in Sections 3-4 covering behavior, capabilities, reliability, safety, interaction modes, data, metrics, tooling, and contexts.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "Enterprise-specific challenges such as RBAC, reliability guarantees, long-horizon interactions, and compliance are often overlooked in current agent evaluation research.",
    318       "evidence": "Section 5 discusses these four challenges with examples (IntellAgent for RBAC, τ-bench for consistency, Park et al. for long-horizon, TheAgentCompany for compliance), but the claim of 'often overlooked' is not quantified.",
    319       "supported": "weak"
    320     },
    321     {
    322       "claim": "Existing surveys focus narrowly on LLM evaluation or cover specific agent capabilities without a holistic perspective.",
    323       "evidence": "Section 1 cites [121] and [107] but does not systematically compare against these or other surveys to demonstrate the gap.",
    324       "supported": "weak"
    325     }
    326   ],
    327   "red_flags": [
    328     {
    329       "flag": "No systematic review methodology",
    330       "detail": "The survey does not describe how papers were selected, what databases were searched, or what criteria determined inclusion. This makes it impossible to assess completeness or reproduce the survey."
    331     },
    332     {
    333       "flag": "No quality assessment of surveyed work",
    334       "detail": "All cited benchmarks and frameworks are presented at face value without evaluating their methodological rigor. As a result, readers cannot tell rigorous sources apart from weak ones."
    335     },
    336     {
    337       "flag": "Corporate alignment with enterprise framing",
    338       "detail": "All authors are from SAP Labs, and the paper emphasizes enterprise-specific challenges (Section 5) that align with SAP's commercial interests. This framing is not disclosed as a potential bias."
    339     },
    340     {
    341       "flag": "Shallow coverage: breadth over depth",
    342       "detail": "The paper covers a very wide range of topics (127 references) in 11 pages but provides only brief descriptions of most work. Many categories receive 1-2 paragraph treatment without critical analysis."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    348       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    349       "year": 2024,
    350       "arxiv_id": "2310.06770",
    351       "relevance": "Key benchmark for evaluating coding agents on real-world software engineering tasks."
    352     },
    353     {
    354       "title": "AgentBench: Evaluating LLMs as Agents",
    355       "authors": ["Xiao Liu"],
    356       "year": 2023,
    357       "arxiv_id": "2308.03688",
    358       "relevance": "Multi-domain benchmark for evaluating LLMs as agents across diverse tasks."
    359     },
    360     {
    361       "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    362       "authors": ["Shunyu Yao", "Noah Shinn", "Pedram Razavi", "Karthik Narasimhan"],
    363       "year": 2024,
    364       "arxiv_id": "2406.12045",
    365       "relevance": "Introduces the pass^k metric for measuring agent consistency, key for reliability evaluation."
    366     },
    367     {
    368       "title": "AgentHarm: a benchmark for measuring harmfulness of LLM agents",
    369       "authors": ["Maksym Andriushchenko"],
    370       "year": 2025,
    371       "arxiv_id": "2410.09024",
    372       "relevance": "Benchmark for evaluating safety of LLM agents against harmful behavior."
    373     },
    374     {
    375       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    376       "authors": ["Edoardo Debenedetti"],
    377       "year": 2024,
    378       "arxiv_id": "2406.13352",
    379       "relevance": "Evaluates resilience of LLM agents against prompt injection attacks."
    380     },
    381     {
    382       "title": "Holistic Evaluation of Language Models",
    383       "authors": ["Percy Liang"],
    384       "year": 2023,
    385       "arxiv_id": "2211.09110",
    386       "relevance": "HELM benchmark providing holistic evaluation framework including robustness and bias metrics."
    387     },
    388     {
    389       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    390       "authors": ["Shunyu Yao"],
    391       "year": 2023,
    392       "arxiv_id": "2210.03629",
    393       "relevance": "Foundational paradigm for agent reasoning-action interleaving that evaluation frameworks must assess."
    394     },
    395     {
    396       "title": "Survey on Evaluation of LLM-based Agents",
    397       "authors": ["Asaf Yehudai"],
    398       "year": 2025,
    399       "arxiv_id": "2503.16416",
    400       "relevance": "Closely related prior survey on LLM agent evaluation."
    401     },
    402     {
    403       "title": "TheAgentCompany: benchmarking LLM agents on consequential real world tasks",
    404       "authors": ["Frank F Xu"],
    405       "year": 2024,
    406       "relevance": "Enterprise-oriented benchmark evaluating agents under organizational policy constraints."
    407     },
    408     {
    409       "title": "Agent-as-a-judge: Evaluate agents with agents",
    410       "authors": ["Mingchen Zhuge"],
    411       "year": 2024,
    412       "relevance": "Extension of LLM-as-judge to multi-agent evaluation, relevant to scalable evaluation methods."
    413     },
    414     {
    415       "title": "ScienceAgentBench: toward rigorous assessment of language agents for data-driven scientific discovery",
    416       "authors": ["Ziru Chen"],
    417       "year": 2025,
    418       "arxiv_id": "2410.05080",
    419       "relevance": "Benchmark for evaluating scientific data analysis agents with structured evaluation."
    420     },
    421     {
    422       "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research",
    423       "authors": ["Giulio Starace"],
    424       "year": 2025,
    425       "arxiv_id": "2504.01848",
    426       "relevance": "Benchmark for evaluating AI agents on reproducing research papers."
    427     }
    428   ]
    429 }

Impressum · Datenschutz