scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28743B)
      1 {
      2   "paper": {
      3     "title": "Where LLM Agents Fail and How They Can Learn from Failures",
      4     "authors": [
      5       "Kunlun Zhu",
      6       "Zijia Liu",
      7       "Bingxuan Li",
      8       "Muxin Tian",
      9       "Yingxuan Yang",
     10       "Jiaxun Zhang",
     11       "Pengrui Han",
     12       "Qipeng Xie",
     13       "Fuyang Cui",
     14       "Weijia Zhang",
     15       "Xiaoteng Ma",
     16       "Xiaodong Yu",
     17       "Gowtham Ramesh",
     18       "Jialian Wu",
     19       "Zicheng Liu",
     20       "Pan Lu",
     21       "James Zou",
     22       "Jiaxuan You"
     23     ],
     24     "year": 2025,
     25     "venue": "arXiv preprint",
     26     "arxiv_id": "2509.25370"
     27   },
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The abstract states 'The code and data will be available at https://github.com/ulab-uiuc/AgentDebug', which is a promise of future release. Per the schema criteria, a promise of future release counts as NO."
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper promises data availability at the same GitHub URL as the code, but this is a future release promise, not a currently accessible dataset. No evidence that the AgentErrorBench dataset is currently downloadable."
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No requirements.txt, Dockerfile, or environment specification file is mentioned. The paper specifies models used (GPT-4.1, GPT-4o-mini, Qwen3-8B, Qwen3-Next-80B) and temperature (0 for GPT-4.1), but provides no dependency specifications to recreate the environment."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No step-by-step reproduction instructions are provided in the paper. Algorithm 1 describes the method at a pseudocode level but not the commands or scripts needed to replicate the experiments."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Table 1 and Figures 5-7 report only point estimates (percentage accuracy, success rates) with no confidence intervals or error bars. For example, Table 1 reports 'AgentDebug: 45.0% step accuracy' with no uncertainty quantification."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No statistical significance tests are reported. The paper claims AgentDebug 'consistently surpasses baselines' (Section 4.1) based solely on comparing point estimates without any p-values, t-tests, or other statistical tests."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper reports percentage improvements with baseline context throughout: '24.3% vs. 0.3% all-correct' (Table 1), '45.0% vs. 28.0% step accuracy' (Table 1), 'success from 21 to 55' (Section 4.2). Per the schema, 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper consistently provides this form of context."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The benchmark contains 200 annotated trajectories (100 ALFWorld, 50 WebShop, 50 GAIA), but no power analysis or justification for why this sample size is sufficient for the claims made is provided."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "Results across runs are reported as single point estimates. Figure 7a shows accumulative success rates across attempts but these are cumulative counts, not variance or standard deviation across repeated experimental runs."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 4.1 includes three baselines for error detection: Direct Prompting, Brute Force, and Binary Search. Section 4.2 includes Self-Refine, Vanilla Debugger, Tree-of-Thought (ToT), and Best-of-N as baselines for downstream task success."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The baselines are contemporary: Self-Refine and Tree-of-Thought are recent methods, and the paper explicitly states it controls for compute budget to ensure fair comparison. The baselines are appropriate for the task."
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 5.1 presents ablation studies analyzing: (1) max number of attempts, (2) different base models for AgentDebug, and (3) different rollout strategies (ReAct, Reflection, Act-only, Memory+ReAct vs Modular)."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 reports three metrics for error detection: Step accuracy (S), Step+Module accuracy (S+M), and All Correct (Step+Module+Error Type). Task success rate is also reported separately in Section 4.2."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The benchmark was annotated by human experts (10 graduate students), but there is no human evaluation of AgentDebug's outputs. Human annotation was used for dataset construction, not for evaluating the system's outputs. All system evaluation is automated."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "AgentErrorBench is used as the test set for evaluating error detection (Section 4.1). The downstream task evaluation uses ALFWorld, GAIA, and WebShop benchmarks as separate test environments. The paper does not describe any tuning decisions made using these sets."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 1 and Figures 5-6 break down results per benchmark (ALFWorld, WebShop, GAIA) separately. Appendix A.4 provides failure distribution breakdowns per module, error type, and benchmark."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 5.2 discusses error propagation patterns in detail. Appendix A.3 provides a concrete qualitative failure case comparison. Figure 8 illustrates cascading failure patterns across trajectory steps."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Figure 7b (ablation) shows that alternative base models (Llama-3.3-70B, GPT-4o-mini, Qwen3-Next-80B) perform substantially worse as detectors. The paper also reports that Brute Force performs worse than Direct Prompting in some metrics."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The abstract claims 24% higher all-correct accuracy and 17% higher step accuracy, which Table 1 confirms as absolute gains of roughly 24 and 17 percentage points (24.3% vs 0.3% all-correct; 45.0% vs 28.0% step accuracy). The 26% relative improvement in task success is shown in Figures 5-6."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper makes causal claims such as 'targeting root-cause errors...is key to efficient debugging' and 'correcting a single root-cause mistake can often flip an otherwise failing trajectory.' These are tested via ablation but there is no controlled single-variable manipulation — the comparison is between full methods, not isolated components, making causal attribution difficult."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper's conclusion states 'AgentDebug establishes debugging as a foundation for agents that can continuously learn and evolve,' which overgeneralizes from three specific benchmarks (ALFWorld, WebShop, GAIA). The title 'Where LLM Agents Fail' suggests broader scope than what is tested."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The limitations section (Appendix A.1) discusses scale and domain diversity but does not consider alternative explanations for the results — for example, whether gains could be due to additional compute (though they control for this) or whether GPT-4.1 specifically enables the gains rather than the framework design."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section 4.1 states 'We use GPT-4.1 as the base model' and Section 4.2 mentions 'GPT-4o-mini, Qwen3-8B, and Qwen3-Next-80B.' None of these include specific version snapshots, API versions, or dates, which is necessary since model behavior changes across versions."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix A.5 provides verbatim prompt content for the Detector Prompt (Figure 14), AgentDebug Prompt (Figure 15), Baseline Prompts (Figure 16), and Environment Rollout Prompts for ALFWorld (Figures 16-17). The actual prompt text with placeholders is provided along with the fill values described in context."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section 4.1 states 'temperature set to 0 for deterministic outputs' for GPT-4.1, but other hyperparameters (temperature for other models, max tokens, top-p) are not reported. For the backbone agents used in Section 4.2, no sampling hyperparameters are given."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.2 and Algorithm 1 describe AgentDebug's three-stage scaffolding in detail: Stage 1 (fine-grained analysis), Stage 2 (critical error detection), Stage 3 (iterative debugging with re-rollouts). The modular rollout design with memory, reflection, planning, and action modules is described in Section 3.1."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "Section 2.2 states 200 representative trajectories were 'curated' from 500+ failed trajectories, but the paper explains neither the selection criteria nor any filtering steps that determined which 200 of the 500+ were included."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Appendix A.1 is titled 'LIMITATION' and provides a substantive discussion of limitations, including scale/domain diversity constraints and the cost of annotation."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Appendix A.1 identifies specific threats: (1) the benchmark 'remains limited in scale and domain diversity' specifically naming multimodal environments and safety-critical applications, and (2) the annotation cost constraint preventing a trained debugging model. These are specific to this study."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The limitations section mentions scope boundaries only vaguely. The paper does not explicitly state what the results do NOT show — for example, it does not bound claims to the specific models tested or acknowledge that results may not apply to multi-agent settings or different error taxonomies."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper promises future data availability at GitHub but AgentErrorBench (the 200 annotated trajectories) is not currently available for independent verification. The 500+ failed trajectories used for taxonomy development are not released."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 2.2 describes that 200 representative trajectories were collected: 100 from ALFWorld, 50 from WebShop, 50 from GAIA, and how annotation proceeded at the decision-step level using the taxonomy schema."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 2.2 describes that 'ten expert annotators—graduate students with prior experience in NLP and LLMs agent research' performed the annotations. The qualification criteria (graduate students with relevant experience) are stated."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The paper collected 500+ failed trajectories but only used 200 in the benchmark without explaining the selection criteria. The pipeline from 500+ to 200 is not documented with explicit filtering criteria, which represents an unexplained jump."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding acknowledgment or grant source is mentioned in the paper. The acknowledgment section thanks the OpenManus team for 'discussion and providing some resources' but does not disclose any funding."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed on the first page: University of Illinois Urbana-Champaign, Stanford University, AMD, OpenManus, University of Toronto, and Likelihood Lab. The paper does not evaluate any of these institutions' products specifically."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "No funding is disclosed; this question does not apply since there is no identified funder whose independence can be assessed."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "There is no competing interests statement or financial interests declaration in the paper. Absence of disclosure is not the same as absence of conflict."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper evaluates GPT-4.1, GPT-4o-mini, Qwen3-8B, and Qwen3-Next-80B on benchmarks including GAIA and WebShop, but does not state the training data cutoff dates for any of these models."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No analysis of whether ALFWorld, GAIA, or WebShop benchmarks were in the training data of the evaluated models is provided. GAIA was published in 2023 and WebShop in 2022, making contamination plausible for models with later training cutoffs."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "ALFWorld (2020), WebShop (2022), and GAIA (2023) are public benchmarks that likely appeared in training data for GPT-4.1 and Qwen3 series models. The paper does not discuss this contamination risk."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "This paper involves human annotators for benchmark construction but is not a human subjects study in the experimental sense; it is a benchmark and system evaluation paper. No pre-registration is relevant or expected."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "The ethical statement explicitly states 'Our study does not involve human subjects, sensitive personal data, or information that could directly identify individuals.' The graduate student annotators are not study subjects; IRB approval is not applicable."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "The annotators are described as 'graduate students with prior experience in NLP and LLMs agent research,' which is sufficient characterization for their role as annotators rather than study participants."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the experimental sense; the annotation is a data construction process, not a human study. Human studies items are not applicable."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No experimental conditions involving human subjects requiring randomization. Human studies items are not applicable."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human subjects experiment requiring blinding. Human studies items are not applicable."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in the experimental sense. Human studies items are not applicable."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "AgentDebug calls LLMs for per-step error analysis plus critical error detection per trajectory. With 200 trajectories and multiple re-rollouts, this involves substantial API costs. The paper does not report API costs, tokens consumed, or cost per example."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The paper controls for total token usage between methods (Section 4.2: 'the max number of attempts of all baselines is matched to AgentDebug by total token usage'), but does not state the absolute compute budget in terms of total tokens, GPU hours, or API spend."
    299       }
    300     }
    301   },
    302   "claims": [
    303     {
    304       "claim": "AgentDebug achieves 24% higher all-correct accuracy (24.3% vs 0.3%) and 17% higher step accuracy (45.0% vs 28.0%) compared to the strongest baseline on AgentErrorBench.",
    305       "evidence": "Table 1 reports average all-correct accuracy of 24.3% for AgentDebug vs 0.3% for Direct Prompting (strongest baseline), and step accuracy of 45.0% vs 28.0%. Section 4.1 presents these results.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "AgentDebug enables LLM agents to achieve up to 26% relative improvements in task success across ALFWorld, GAIA, and WebShop.",
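    310       "evidence": "Figure 5 shows ALFWorld improvements: GPT-4o-mini from 21 to 55 (162% relative improvement), Qwen3-8B from 48 to 74, Qwen3-Next-80B from 60 to 84. Figure 6 shows cross-benchmark comparisons. The 26% figure appears to refer to relative improvement on ALFWorld specifically, although none of the relative gains listed here (roughly 162%, 54%, and 40%) equals 26%; the only matching number is the 26-point absolute gain for Qwen3-8B (48 to 74), so the exact derivation of the figure is unclear.",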
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Error propagation — early mistakes cascading into later failures — is the primary bottleneck in LLM agent reliability.",
    315       "evidence": "Section 2.1 states this as a 'Key Insight' derived from analyzing 500+ failed trajectories. Figure 8 illustrates cascading failures. Section 5.2 provides additional analysis showing memory and reflection errors cause most propagation.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "Human annotators achieved substantial inter-annotator agreement (Cohen's kappa = 0.55) on the AgentErrorTaxonomy categories.",
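    320       "evidence": "Section 2.2 explicitly reports: 'Inter-annotator agreement, measured using Cohen's κ, reached κ = 0.55 across modules, indicating substantial agreement.' Note that 'substantial' is the paper's own characterization: on the commonly used Landis and Koch scale, κ = 0.55 falls in the 'moderate' band (0.41-0.60) rather than the 'substantial' band (0.61-0.80).",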
    321       "supported": "strong"
    322     },
    323     {
    324       "claim": "GPT-4.1 substantially outperforms other models as the AgentDebug base model, achieving 42% step accuracy vs 16% for Llama-3.3-70B.",
    325       "evidence": "Figure 7b (Table) shows GPT-4.1: 42% step, 32% all-correct; Llama-3.3-70B: 16% step, 2% all-correct. Section 5.1 reports this ablation.",
    326       "supported": "strong"
    327     }
    328   ],
    329   "methodology_tags": [
    330     "benchmark-eval",
    331     "qualitative"
    332   ],
    333   "key_findings": "The paper introduces AgentErrorTaxonomy (a five-module failure taxonomy covering memory, reflection, planning, action, and system errors), AgentErrorBench (200 annotated failure trajectories from ALFWorld, GAIA, and WebShop), and AgentDebug (a three-stage debugging framework that localizes root-cause errors and provides corrective feedback). AgentDebug achieves 24.3% all-correct accuracy on error localization versus 0.3% for the strongest baseline (Direct Prompting), and improves task success rates by up to 26% in relative terms across three benchmarks. A key finding is that error propagation — where early mistakes cascade into downstream failures — is the central reliability bottleneck, with memory and reflection errors being the most common sources of cascading failure.",
    334   "red_flags": [
    335     {
    336       "flag": "No statistical significance testing",
    337       "detail": "All comparisons between AgentDebug and baselines are based on point estimates with no confidence intervals, standard deviations, or significance tests. With only 50-100 trajectories per benchmark, differences could be within noise bounds."
    338     },
    339     {
    340       "flag": "Model version not specified",
    341       "detail": "The paper uses 'GPT-4.1' and 'GPT-4o-mini' without API version numbers or snapshot dates. Since OpenAI updates these models, results may not be reproducible."
    342     },
    343     {
    344       "flag": "Selection bias in benchmark construction",
    345       "detail": "200 trajectories were 'curated' from 500+ failed trajectories without documenting the selection criteria. If the 200 were selected to make the taxonomy look clean, the benchmark may not represent the true distribution of agent failures."
    346     },
    347     {
    348       "flag": "Contamination not addressed",
    349       "detail": "ALFWorld (2020), WebShop (2022), and GAIA (2023) are public benchmarks that likely appeared in GPT-4.1 and Qwen3 training data, but contamination risk is not discussed."
    350     },
    351     {
    352       "flag": "Code and data promised but not released",
    353       "detail": "The abstract promises code and data availability at GitHub, but as a future release. Results cannot be independently verified at time of publication."
    354     },
    355     {
    356       "flag": "Overgeneralized conclusion",
    357       "detail": "The conclusion frames AgentDebug as 'a foundation for agents that can continuously learn and evolve from their mistakes,' which significantly overstates what was shown: improvement on 3 specific benchmarks with a single model (GPT-4.1) as the debugger."
    358     }
    359   ],
    360   "cited_papers": [
    361     {
    362       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    363       "authors": [
    364         "Shunyu Yao",
    365         "Jeffrey Zhao",
    366         "Dian Yu",
    367         "Nan Du",
    368         "Izhak Shafran",
    369         "Karthik Narasimhan",
    370         "Yuan Cao"
    371       ],
    372       "year": 2022,
    373       "arxiv_id": "2210.03629",
    374       "relevance": "Foundational paper on the reasoning-acting paradigm for LLM agents, directly relevant to understanding agent failure modes."
    375     },
    376     {
    377       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    378       "authors": [
    379         "Shunyu Yao",
    380         "Dian Yu",
    381         "Jeffrey Zhao",
    382         "Izhak Shafran",
    383         "Thomas L. Griffiths",
    384         "Yuan Cao",
    385         "Karthik Narasimhan"
    386       ],
    387       "year": 2023,
    388       "arxiv_id": "2305.10601",
    389       "relevance": "Test-time scaling method for LLM reasoning used as baseline for agent debugging; relevant to evaluating agent methodology quality."
    390     },
    391     {
    392       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    393       "authors": [
    394         "Noah Shinn",
    395         "Federico Cassano",
    396         "Edward Berman",
    397         "Ashwin Gopinath",
    398         "Karthik Narasimhan",
    399         "Shunyu Yao"
    400       ],
    401       "year": 2023,
    402       "arxiv_id": "2303.11366",
    403       "relevance": "Self-reflection method for LLM agents that is a key baseline and related work for agent error recovery."
    404     },
    405     {
    406       "title": "GAIA: a benchmark for general AI assistants",
    407       "authors": [
    408         "Gregoire Mialon",
    409         "Clementine Fourrier",
    410         "Thomas Wolf",
    411         "Yann LeCun",
    412         "Thomas Scialom"
    413       ],
    414       "year": 2023,
    415       "relevance": "Benchmark used to evaluate AgentDebug; relevant as an evaluation environment for LLM agent capabilities."
    416     },
    417     {
    418       "title": "WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents",
    419       "authors": [
    420         "Shunyu Yao",
    421         "Howard Chen",
    422         "John Yang",
    423         "Karthik Narasimhan"
    424       ],
    425       "year": 2022,
    426       "relevance": "E-commerce agent benchmark used to evaluate AgentDebug; important for evaluating web interaction agent methods."
    427     },
    428     {
    429       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    430       "authors": [
    431         "Mohit Shridhar",
    432         "Xingdi Yuan",
    433         "Marc-Alexandre Cote",
    434         "Yonatan Bisk",
    435         "Adam Trischler",
    436         "Matthew Hausknecht"
    437       ],
    438       "year": 2020,
    439       "arxiv_id": "2010.03768",
    440       "relevance": "Embodied agent benchmark used to evaluate AgentDebug; key testbed for long-horizon task completion."
    441     },
    442     {
    443       "title": "Why do multi-agent LLM systems fail?",
    444       "authors": [
    445         "Mert Cemri",
    446         "Melissa Z. Pan",
    447         "Shuyi Yang",
    448         "Lakshya A. Agrawal",
    449         "Bhavya Chopra",
    450         "Rishabh Tiwari",
    451         "Kurt Keutzer",
    452         "Aditya G. Parameswaran",
    453         "Dan Klein",
    454         "Kannan Ramchandran",
    455         "Matei Zaharia",
    456         "Joseph E. Gonzalez",
    457         "Ion Stoica"
    458       ],
    459       "year": 2025,
    460       "arxiv_id": "2503.13657",
    461       "relevance": "Related work on failure analysis in multi-agent LLM systems; directly relevant to understanding agent error taxonomy scope."
    462     },
    463     {
    464       "title": "Defining and Detecting the Defects of the Large Language Model-Based Autonomous Agents",
    465       "authors": [
    466         "Kaiwen Ning",
    467         "Jiachi Chen",
    468         "Jingwen Zhang",
    469         "Wei Li",
    470         "Zexu Wang",
    471         "Yuming Feng",
    472         "Weizhe Zhang",
    473         "Zibin Zheng"
    474       ],
    475       "year": 2024,
    476       "arxiv_id": "2412.18371",
    477       "relevance": "Prior work on agent defect taxonomy that this paper builds upon and differentiates from."
    478     },
    479     {
    480       "title": "VeriLA: A Human-Centered Evaluation Framework for Interpretable Verification of LLM Agent Failures",
    481       "authors": [
    482         "Yoo Yeon Sung",
    483         "Hannah Kim",
    484         "Dan Zhang"
    485       ],
    486       "year": 2025,
    487       "arxiv_id": "2503.12651",
    488       "relevance": "Related work on human-centered evaluation of LLM agent failures, relevant to the survey's interest in methodology quality."
    489     },
    490     {
    491       "title": "Testing and Understanding Erroneous Planning in LLM Agents through Synthesized User Inputs",
    492       "authors": [
    493         "Zhenlan Ji",
    494         "Daoyuan Wu",
    495         "Pingchuan Ma",
    496         "Zongjie Li",
    497         "Shuai Wang"
    498       ],
    499       "year": 2024,
    500       "arxiv_id": "2404.17833",
    501       "relevance": "Related work on LLM agent planning failures that informed this paper's taxonomy development."
    502     },
    503     {
    504       "title": "Which Agent Causes Task Failures and When? On Automated Failure Attribution of LLM Multi-Agent Systems",
    505       "authors": [
    506         "Shaokun Zhang",
    507         "Ming Yin",
    508         "Jieyu Zhang",
    509         "Jiale Liu",
    510         "Zhiguang Han",
    511         "Jingyang Zhang",
    512         "Beibin Li",
    513         "Chi Wang",
    514         "Huazheng Wang",
    515         "Yiran Chen",
    516         "Qingyun Wu"
    517       ],
    518       "year": 2025,
    519       "arxiv_id": "2505.00212",
    520       "relevance": "Related work on failure attribution in multi-agent systems, relevant to agent debugging methodology."
    521     },
    522     {
    523       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs",
    524       "authors": [
    525         "Yujia Qin",
    526         "Shihao Liang",
    527         "Yining Ye",
    528         "Kunlun Zhu"
    529       ],
    530       "year": 2023,
    531       "arxiv_id": "2307.16789",
    532       "relevance": "Tool-use LLM agent framework relevant to understanding tool-use errors in the agent error taxonomy."
    533     }
    534   ]
    535 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs