scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20072B)
      1 {
      2   "paper": {
      3     "title": "Failure Modes in LLM Systems: A System-Level Taxonomy for Reliable AI Applications",
      4     "authors": ["Vaishali Vinay"],
      5     "year": 2025,
      6     "venue": "IEEE (conference paper)",
      7     "arxiv_id": "2511.19933",
      8     "doi": "10.48550/arXiv.2511.19933"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical", "qualitative"],
     13   "key_findings": "The paper proposes a taxonomy of 15 system-level failure modes in LLM-based applications, organized into three dimensions: reasoning failures (5), input/context failures (5), and system/operational failures (5). It argues that LLM reliability should be framed as a systems-engineering problem rather than a model-centric one, highlighting gaps in current evaluation benchmarks that focus on accuracy over stability, reproducibility, and drift. Design principles proposed include input canonicalization, verification layers, semantic observability, and cost governance.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code or repository is mentioned. The paper is theoretical but could have released taxonomy data or analysis scripts."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset or structured taxonomy artifact is released. The taxonomy exists only in prose and figures within the paper."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No experiments are conducted; this is a theoretical taxonomy paper with no computational environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "No experiments to reproduce. The paper is a theoretical taxonomy and literature synthesis."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No experiments or quantitative results are produced by this paper. It cites others' numbers but generates none of its own."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No statistical comparisons are made. The paper is a theoretical taxonomy."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No experiments; no effect sizes to report."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No data collection or sampling is performed."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experiments conducted."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper references prior taxonomies (hallucination taxonomies, risk taxonomies like [12]) but does not systematically compare its taxonomy against them to show what it adds."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "While the paper cites contemporary work, it does not formally compare its taxonomy structure against specific contemporary alternatives."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The paper is a taxonomy proposal with no system components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation metrics are used; this is a theoretical contribution."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate. The paper is a taxonomy/framework proposal."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No experiments conducted."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The taxonomy is organized into three dimensions (reasoning, input/context, system/operational) with five failure modes each, and each is described individually in Section III."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The entire paper is about failure modes. Each of the 15 failure modes is described with examples of how they manifest in practice (e.g., hallucination propagation in multi-agent systems, tool invocation errors in Gorilla LLM)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper's central thesis is about what goes wrong with LLM systems. It reports on evaluation gaps, production failures, and limitations of current benchmarks."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims the paper 'presents a system-level taxonomy' and 'analyzes the growing gap in evaluation and monitoring practices.' The taxonomy is presented, but the 'analysis' of the gap is largely a narrative literature review without systematic evidence gathering or structured comparison."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims like cost-driven reductions causing performance collapse and version drift causing regression, but these are asserted based on cited literature without the paper producing its own causal evidence. Statements such as 'cost constraints further worsen the risk' and 'adjustments to the underlying versions can introduce regression' are causal claims supported only by narrative citation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper's title claims to address 'Reliable AI Applications' broadly. The taxonomy is presented as general but is derived from a selective literature review without systematic coverage. No boundaries are placed on what types of LLM systems, deployment contexts, or scales the taxonomy applies to."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper presents its taxonomy as the framing for LLM failures without considering alternative organizational schemes or acknowledging that other taxonomies might capture the same phenomena differently. No discussion of whether the 15 modes are exhaustive or whether the three-dimension grouping is the most useful."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "The paper makes no measurements; it is a theoretical taxonomy. No proxy-outcome gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models are used in experiments. The paper is theoretical."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used. The paper is a theoretical taxonomy."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No experiments conducted."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper is a literature-based taxonomy but does not describe how papers were selected for the review. No search strategy, inclusion/exclusion criteria, or databases queried are documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no limitations section. The paper goes directly from 'Design Principles' (Section VI) to 'Future Work' (Section VII) to 'Conclusion' (Section VIII) without discussing limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. No acknowledgment that the taxonomy might be incomplete, that the literature selection might be biased, or that the failure modes might overlap."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what it does NOT cover. It presents the taxonomy as comprehensive ('fifteen hidden failure modes') without bounding the scope of applicability."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data is available. The taxonomy is derived from literature but no structured dataset of papers or failure instances is provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No description of how the literature was searched or selected. The references appear to be ad-hoc rather than systematically collected."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data source is literature, but this is covered by data_collection_described."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of how the 62 cited references were found, filtered, or organized into the taxonomy. The process from literature to taxonomy is opaque."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure. The author is affiliated with Microsoft Security Research but no funding statement is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The author's affiliation with Microsoft Security Research is clearly stated in the header, along with the disclaimer that views do not reflect Microsoft's positions."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "The author works at Microsoft, which sells LLM-based products (Azure OpenAI, Copilot). A paper framing LLM failures as solvable engineering problems (rather than fundamental limitations) could serve Microsoft's commercial interests. No independence is established."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement. The author's Microsoft employment is disclosed as affiliation but not as a potential conflict of interest."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained models are evaluated on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is performed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Theoretical taxonomy paper with no method that incurs inference cost."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No computation performed."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Output divergence in multi-step reasoning tasks exceeds 20-30%",
    296       "evidence": "Cited from [5] (Chen et al., 'Two Failures of Self-Consistency'), Section I. The paper does not produce this number itself.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Existing benchmarks measure knowledge or reasoning but provide little insight into stability, reproducibility, drift, or workflow integration",
    301       "evidence": "Section IV discusses evaluation gaps citing [50]-[54], arguing benchmarks like BLEU/ROUGE don't capture operational reliability. The argument is narrative rather than systematic.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Nearly 50% of LLM-as-judge pairwise comparisons reverse verdicts when response order is mirrored",
    306       "evidence": "Cited from [51] (Anghel et al.), Section IV. The paper reports this as '48.4%' from a meta-evaluation.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "The proposed 15-mode taxonomy captures failure modes not addressed by existing taxonomies focused on hallucinations, bias, or safety",
    311       "evidence": "Section III presents the taxonomy; the gap claim appears in Sections I-II. However, no systematic comparison with existing taxonomies is provided to demonstrate the claimed novelty.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "red_flags": [
    316     {
    317       "flag": "No systematic methodology for taxonomy construction",
    318       "detail": "The 15 failure modes appear to be derived from ad-hoc literature review. No systematic search strategy, inclusion criteria, or methodology for identifying and categorizing failure modes is described. The taxonomy could be incomplete or arbitrarily organized."
    319     },
    320     {
    321       "flag": "No validation of taxonomy",
    322       "detail": "The taxonomy is not validated empirically — no case studies applying it to real systems, no expert review, no comparison with alternative frameworks. It is unclear whether the 15 modes are exhaustive, mutually exclusive, or practically useful."
    323     },
    324     {
    325       "flag": "Microsoft affiliation not declared as a potential conflict of interest",
    326       "detail": "The author works at Microsoft Security Research. Microsoft sells LLM products (Azure OpenAI, Copilot). The paper frames LLM failures as solvable engineering problems, which aligns with vendor interests. While a disclaimer is included, the potential conflict is not explicitly addressed."
    327     },
    328     {
    329       "flag": "Claims outrun evidence",
    330       "detail": "The paper claims to provide 'an analytical foundation for future research on evaluation methodology, AI system robustness, and dependable LLM deployment' but offers only a narrative literature review with a proposed taxonomy. No new data, experiments, or formal analysis supports this claim."
    331     },
    332     {
    333       "flag": "No limitations section",
    334       "detail": "The paper lacks any discussion of its own limitations, threats to validity, or scope boundaries — ironic for a paper about identifying failure modes and evaluation gaps."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs",
    340       "authors": ["A. Chen"],
    341       "year": 2024,
    342       "arxiv_id": "2305.14279",
    343       "doi": "10.48550/arXiv.2305.14279",
    344       "relevance": "Directly measures LLM reasoning inconsistency, relevant to evaluation methodology and reliability assessment."
    345     },
    346     {
    347       "title": "Why Do Multi-Agent LLM Systems Fail?",
    348       "authors": ["M. Cemri"],
    349       "year": 2025,
    350       "arxiv_id": "2503.13657",
    351       "doi": "10.48550/arXiv.2503.13657",
    352       "relevance": "Empirical study of multi-agent LLM failure patterns with failure rate data across models."
    353     },
    354     {
    355       "title": "A Taxonomy of Failures in Tool-Augmented LLMs",
    356       "authors": ["C. Winston", "R. Just"],
    357       "year": 2025,
    358       "doi": "10.1109/AST66626.2025.00019",
    359       "relevance": "Directly relevant taxonomy of tool-use failures in LLM systems with empirical Gorilla LLM data."
    360     },
    361     {
    362       "title": "Risk Taxonomy, Mitigation, and Assessment Benchmarks of Large Language Model Systems",
    363       "authors": ["T. Cui"],
    364       "year": 2024,
    365       "arxiv_id": "2401.05778",
    366       "doi": "10.48550/arXiv.2401.05778",
    367       "relevance": "Prior LLM risk taxonomy relevant to comparing taxonomy approaches."
    368     },
    369     {
    370       "title": "A Survey on Evaluation of Large Language Models",
    371       "authors": ["Y. Chang"],
    372       "year": 2024,
    373       "doi": "10.1145/3641289",
    374       "relevance": "Comprehensive survey of LLM evaluation methods, directly relevant to evaluation methodology assessment."
    375     },
    376     {
    377       "title": "An Empirical Study of the Non-Determinism of ChatGPT in Code Generation",
    378       "authors": ["S. Ouyang", "J. M. Zhang", "M. Harman", "M. Wang"],
    379       "year": 2025,
    380       "doi": "10.1145/3697010",
    381       "relevance": "Empirical study of LLM non-determinism in code generation, relevant to reproducibility and reliability."
    382     },
    383     {
    384       "title": "The Good, The Bad, and The Greedy: Evaluation of LLMs Should Not Ignore Non-Determinism",
    385       "authors": ["Y. Song", "G. Wang", "S. Li", "B. Y. Lin"],
    386       "year": 2025,
    387       "doi": "10.18653/v1/2025.naacl-long.211",
    388       "relevance": "Addresses non-determinism in LLM evaluation, relevant to evaluation methodology."
    389     },
    390     {
    391       "title": "Evaluation and Benchmarking of LLM Agents: A Survey",
    392       "authors": ["M. Mohammadi", "Y. Li", "J. Lo", "W. Yip"],
    393       "year": 2025,
    394       "doi": "10.1145/3711896.3736570",
    395       "relevance": "Survey of LLM agent evaluation methods and benchmarks."
    396     },
    397     {
    398       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    399       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    400       "year": 2023,
    401       "arxiv_id": "2305.05176",
    402       "doi": "10.48550/arXiv.2305.05176",
    403       "relevance": "Addresses cost-performance tradeoffs in LLM deployment, relevant to practical AI system design."
    404     },
    405     {
    406       "title": "The Current Challenges of Software Engineering in the Era of Large Language Models",
    407       "authors": ["C. Gao", "X. Hu", "S. Gao", "X. Xia", "Z. Jin"],
    408       "year": 2025,
    409       "doi": "10.1145/3712005",
    410       "relevance": "Discusses SE challenges with LLMs including semantic mismatch between model output and software requirements."
    411     },
    412     {
    413       "title": "Diagnosing Bias and Instability in LLM Evaluation: A Scalable Pairwise Meta-Evaluator",
    414       "authors": ["C. Anghel", "A. A. Anghel", "E. Pecheanu"],
    415       "year": 2025,
    416       "doi": "10.3390/info16080652",
    417       "relevance": "Meta-evaluation showing ~48% verdict reversal rate in LLM-as-judge setups, relevant to evaluation reliability."
    418     }
    419   ]
    420 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs