ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (19372B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Failure Modes in LLM Systems: A System-Level Taxonomy for Reliable AI Applications",
      6     "authors": [
      7       "Vaishali Vinay"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2511.19933",
     12     "doi": "10.48550/arXiv.2511.19933"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract promises a taxonomy of 15 failure modes, analysis of evaluation gaps, examination of production challenges, and design principles — all of which appear in Sections III–VI respectively.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper makes causal claims (e.g., 'cost-driven reductions cause accuracy degradation,' 'version drift causes regressions in previously stable behavior') but conducts no original empirical study; it is a taxonomy paper relying on secondary citations to support these causal assertions.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper makes sweeping statements like 'most assessments of LLMs are still anchored to static benchmarks' and 'current evaluation frameworks often overlook how LLM systems behave when deployed' without bounding scope to specific model families, deployment contexts, or application domains.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper presents only one interpretation — that system-level redesign is required — without considering alternatives such as whether model quality improvements alone might obsolete these failure modes, or whether existing MLOps practices already address them.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper explicitly and repeatedly distinguishes between benchmark accuracy (proxy) and real production reliability (actual outcome), which is central to its argument in Section IV.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "There is a 'Future Work' section (VII) but no dedicated limitations or threats-to-validity section; the future work section identifies research gaps rather than critiquing the paper's own methodology or claims.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No threats to validity are discussed anywhere in the paper — the taxonomy is presented without acknowledging that it may be incomplete, overlapping, or non-exhaustive.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper never states what its taxonomy does NOT cover or which types of LLM applications fall outside its scope; it presents the 15 failure modes as broadly applicable without qualification.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source is disclosed; there is only a disclaimer that views do not reflect Microsoft's positions, but no statement on whether or how the work was funded.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The author's affiliation (Microsoft Security Research, Redmond, WA) is clearly disclosed on the title page.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The author is employed by Microsoft, which develops and markets LLM products (Azure OpenAI, Copilot); the paper advocates for system-level reliability improvements that would directly benefit Microsoft's enterprise LLM offerings, creating a potential conflict of interest.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "There is no competing interests statement, no declaration of patents or equity, and no COI disclosure beyond the disclaimer that views are the author's own.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Central terms like 'reliability,' 'LLM system,' and 'hidden failure' are used throughout without precise operational definitions; 'drift' is partially defined (version/data/behavior drift in Section V) but 'reliability' is never formally defined.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper explicitly states in the abstract and introduction that it contributes a taxonomy of 15 failure modes, analysis of evaluation gaps, production deployment challenges, and design principles.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper contrasts its taxonomy with existing literature focused on hallucinations, bias, and abstract safety (citing [11, 12]) and positions its system-level framing as distinct from model-centric approaches, citing 62 references throughout.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "position": {
    116       "argument_quality": {
    117         "argument_internally_consistent": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "The paper consistently argues that LLM failures are system-level problems throughout all sections — the taxonomy, evaluation gap analysis, production gap, and design principles all reinforce the same central thesis without contradiction.",
    121           "source": "haiku"
    122         },
    123         "counterarguments_addressed": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The paper does not engage with any counterarguments — it does not address whether model improvements alone might eventually solve these issues, whether existing MLOps practices already handle most failure modes, or whether the proposed taxonomy is the right categorization.",
    127           "source": "haiku"
    128         },
    129         "analogies_appropriate": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper's primary analogy — contrasting LLM behavior against classical deterministic ML systems — is appropriate and accurately captures the distinction being made.",
    133           "source": "haiku"
    134         },
    135         "prescriptions_proportional": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "The design principles in Section VI are framed as high-level recommendations (input canonicalization, validation layers, semantic monitoring) proportional to the argument; no sweeping mandates are issued that exceed the paper's analytical scope.",
    139           "source": "haiku"
    140         },
    141         "evidence_for_claims_cited": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Most factual assertions are backed by citations; for example, '20-30% output divergence' cites [5], '48.4% verdict reversal in LLM-as-judge' cites [51], and 'version drift regression' cites [8, 55, 56].",
    145           "source": "haiku"
    146         },
    147         "alternatives_discussed": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper presents no alternative frameworks or taxonomies for understanding LLM reliability — it acknowledges related work on hallucinations and bias but dismisses them as insufficient without comparing the merits of alternative framings.",
    151           "source": "haiku"
    152         },
    153         "historical_context_accurate": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "The paper's historical framing — that benchmarks designed for static tasks predate the era of agentic, multi-step LLM deployments — appears accurate, and citations to LLM survey papers [9, 16] are used correctly.",
    157           "source": "haiku"
    158         }
    159       },
    160       "clarity_and_scope": {
    161         "key_terms_defined_precisely": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "'Reliability' is used throughout as the core concept but never given a precise definition; 'system-level' vs. 'model-level' failure is explained descriptively but without a formal operational boundary.",
    165           "source": "haiku"
    166         },
    167         "engages_with_existing_literature": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "The paper engages substantively with existing literature on hallucinations [11], risk taxonomies [12], evaluation surveys [9, 53], multi-agent failures [6], and tool-use failures [24], situating its contribution relative to each.",
    171           "source": "haiku"
    172         },
    173         "intended_audience_clear": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The paper's intended audience — whether systems engineers, ML researchers, or enterprise architects — is never explicitly stated; the technical vocabulary suggests practitioners but this is not declared.",
    177           "source": "haiku"
    178         },
    179         "assumptions_stated": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The paper's core assumptions (that LLMs are fundamentally non-deterministic, that production environments are qualitatively different from benchmarks, that system design can compensate for model limitations) are never explicitly stated as assumptions requiring reader acceptance.",
    183           "source": "haiku"
    184         },
    185         "scope_of_applicability_discussed": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The taxonomy is presented as applicable to all 'LLM-based applications' without discussing whether it applies equally to simple chatbots, complex multi-agent pipelines, regulated domains, or different scales of deployment.",
    189           "source": "haiku"
    190         }
    191       }
    192     }
    193   },
    194   "claims": [
    195     {
    196       "claim": "Output divergence in multi-step reasoning tasks has been reported to be greater than 20-30%",
    197       "evidence": "Cited to Chen et al. 2024 [5] on self-consistency failures in LLM multi-step reasoning",
    198       "supported": "moderate"
    199     },
    200     {
    201       "claim": "48.4% of LLM-as-judge pipelines reversed verdicts when response order was mirrored",
    202       "evidence": "Cited to Anghel et al. 2025 [51] meta-evaluation study",
    203       "supported": "moderate"
    204     },
    205     {
    206       "claim": "Benchmark-aligned improvements often fail to translate into downstream operational stability",
    207       "evidence": "Asserted with citations [21, 22] that evaluate instruction-following stability; neither directly establishes this as a systematic finding across deployments",
    208       "supported": "weak"
    209     },
    210     {
    211       "claim": "Current evaluation frameworks do not capture stability, drift, reproducibility, or cross-version consistency",
    212       "evidence": "Argued via literature synthesis pointing to gaps in BLEU/ROUGE metrics [50] and non-determinism studies [52]; no meta-analysis or systematic review conducted",
    213       "supported": "weak"
    214     },
    215     {
    216       "claim": "LLM reliability must be framed as a system-engineering problem rather than a model-centric one",
    217       "evidence": "Supported by the 15-failure-mode taxonomy and cited deployment studies, but the taxonomy itself has no empirical validation",
    218       "supported": "weak"
    219     }
    220   ],
    221   "methodology_tags": [
    222     "theoretical",
    223     "qualitative"
    224   ],
    225   "key_findings": "The paper presents a system-level taxonomy of 15 failure modes in LLM applications, organized into reasoning failures (hallucination, logical inconsistency, planning collapse, overconfidence, task-constraint violation), input/context failures (ambiguous prompts, prompt injection, context truncation, domain mismatch, conflicting instructions), and system/operational failures (tool invocation errors, external tool breakdowns, multi-agent communication failures, business-rule misalignment, cost-driven degradation). It argues that existing evaluation benchmarks fail to capture production reliability because they measure static accuracy rather than stability, reproducibility, or drift. The paper concludes that reliable LLM deployment requires system-design-level interventions — input canonicalization, validation layers, semantic monitoring, and cost-aware governance — rather than model tuning alone.",
    226   "red_flags": [
    227     {
    228       "flag": "No empirical validation of taxonomy",
    229       "detail": "The 15 failure modes are asserted based on informal observation and secondary citations — there is no systematic study, failure log analysis, or user study establishing that these modes are exhaustive, non-overlapping, or accurately categorized."
    230     },
    231     {
    232       "flag": "Undisclosed funding, potential COI",
    233       "detail": "The author is a Microsoft Security Research employee; Microsoft produces LLM products (Azure OpenAI, Copilot) that benefit from the reliability improvements advocated. Neither a funding disclosure nor a competing-interests statement is present."
    234     },
    235     {
    236       "flag": "No limitations section",
    237       "detail": "The paper presents no limitations, threats to validity, or acknowledgment that the taxonomy might be incomplete, overlapping, or inapplicable to certain deployment contexts."
    238     },
    239     {
    240       "flag": "Broad generalizations without scope bounds",
    241       "detail": "Claims like 'most assessments of LLMs are still anchored to static benchmarks' and 'current evaluation frameworks often overlook production behavior' are presented as settled facts without systematic evidence or scoping."
    242     },
    243     {
    244       "flag": "No counterarguments considered",
    245       "detail": "The paper does not engage with the possibility that model improvements alone could address these issues, that existing MLOps practices already handle many failure modes, or that alternative taxonomies may be superior."
    246     },
    247     {
    248       "flag": "Taxonomy not compared to prior taxonomies",
    249       "detail": "The paper cites prior taxonomies (hallucination-focused [11], risk taxonomies [12], tool-failure taxonomy [24]) but does not systematically compare how its 15 modes relate to or subsume categories in those works."
    250     }
    251   ],
    252   "cited_papers": [
    253     {
    254       "title": "Why Do Multi-Agent LLM Systems Fail?",
    255       "relevance": "Primary empirical source for multi-agent failure claims; cited extensively as foundational evidence for the taxonomy"
    256     },
    257     {
    258       "title": "A Taxonomy of Failures in Tool-Augmented LLMs",
    259       "relevance": "Directly related prior taxonomy of tool-use failures; a key comparison point for this paper's system-level framing"
    260     },
    261     {
    262       "title": "Risk Taxonomy, Mitigation, and Assessment Benchmarks of Large Language Model Systems",
    263       "relevance": "Prior risk taxonomy that this paper positions against, arguing existing taxonomies miss system-level failure modes"
    264     },
    265     {
    266       "title": "A Survey on Evaluation of Large Language Models",
    267       "relevance": "Cited as foundation for the evaluation gap argument; establishes what current evaluation covers and misses"
    268     },
    269     {
    270       "title": "Evaluation and Benchmarking of LLM Agents: A Survey",
    271       "relevance": "Survey evidence that 'reliability' and 'long-horizon interaction' are underaddressed in current benchmarks"
    272     },
    273     {
    274       "title": "Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs",
    275       "relevance": "Empirical basis for claims about 20-30% output divergence in multi-step reasoning"
    276     },
    277     {
    278       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    279       "relevance": "Cited in the cost-driven degradation failure mode discussion; relevant to cost-accuracy trade-off literature"
    280     },
    281     {
    282       "title": "The Good, The Bad, and The Greedy: Evaluation of LLMs Should Not Ignore Non-Determinism",
    283       "relevance": "Empirical evidence that single-run evaluation is insufficient due to LLM output stochasticity"
    284     }
    285   ],
    286   "engagement_factors": {
    287     "practical_relevance": {
    288       "score": 2,
    289       "justification": "The taxonomy gives practitioners a vocabulary for categorizing LLM deployment failures, though the design principles remain high-level and unvalidated."
    290     },
    291     "surprise_contrarian": {
    292       "score": 1,
    293       "justification": "Preferring a system-level framing to a model-centric one is a moderately contrarian but increasingly common position; it is not novel enough to be surprising."
    294     },
    295     "fear_safety": {
    296       "score": 2,
    297       "justification": "Raises concrete reliability and safety concerns for LLM deployment in healthcare, finance, and legal domains where auditability failures have regulatory consequences."
    298     },
    299     "drama_conflict": {
    300       "score": 1,
    301       "justification": "Implicitly criticizes the LLM benchmarking community and organizations deploying LLMs without adequate reliability testing, but without naming specific products or sparking overt controversy."
    302     },
    303     "demo_ability": {
    304       "score": 0,
    305       "justification": "No tool, dataset, code, or interactive demo is provided or linked."
    306     },
    307     "brand_recognition": {
    308       "score": 2,
    309       "justification": "Author is from Microsoft Security Research, a recognizable and high-credibility affiliation in the AI reliability space."
    310     }
    311   },
    312   "hn_data": {
    313     "threads": [
    314       {
    315         "hn_id": "46055177",
    316         "title": "Image Diffusion Models Exhibit Emergent Temporal Propagation in Videos",
    317         "points": 124,
    318         "comments": 22,
    319         "url": "https://news.ycombinator.com/item?id=46055177",
    320         "created_at": "2025-11-26T07:55:49Z"
    321       },
    322       {
    323         "hn_id": "44313278",
    324         "title": "S1: Simple Test-Time Scaling",
    325         "points": 3,
    326         "comments": 0,
    327         "url": "https://news.ycombinator.com/item?id=44313278",
    328         "created_at": "2025-06-18T21:06:55Z"
    329       },
    330       {
    331         "hn_id": "43005221",
    332         "title": "s1: Simple Test-Time Scaling",
    333         "points": 2,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=43005221",
    336         "created_at": "2025-02-10T21:13:00Z"
    337       },
    338       {
    339         "hn_id": "42979455",
    340         "title": "Test-time scaling new approach: extra test-time compute improves LLM reasoning",
    341         "points": 2,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=42979455",
    344         "created_at": "2025-02-08T01:23:41Z"
    345       }
    346     ],
    347     "top_points": 124,
    348     "total_points": 131,
    349     "total_comments": 22
    350   }
    351 }

Impressum · Datenschutz