scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18923B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Failure Modes in LLM Systems: A System-Level Taxonomy for Reliable AI Applications",
      6     "authors": [
      7       "Vaishali Vinay"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2511.19933",
     12     "doi": "10.48550/arXiv.2511.19933"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract claims the paper 'presents a system-level taxonomy' and 'analyzes the growing gap in evaluation and monitoring practices.' The taxonomy is presented, but the 'analysis' of the gap is largely a narrative literature review without systematic evidence gathering or structured comparison.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper makes causal claims like cost-driven reductions causing performance collapse and version drift causing regression, but these are asserted based on cited literature without the paper producing its own causal evidence. Statements such as 'cost constraints further worsen the risk' and 'adjustments to the underlying versions can introduce regression' are causal claims supported only by narrative citation.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper's title claims to address 'Reliable AI Applications' broadly. The taxonomy is presented as general but is derived from a selective literature review without systematic coverage. No boundaries are placed on what types of LLM systems, deployment contexts, or scales the taxonomy applies to.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper presents its taxonomy as the framing for LLM failures without considering alternative organizational schemes or acknowledging that other taxonomies might capture the same phenomena differently. No discussion of whether the 15 modes are exhaustive or whether the three-dimension grouping is the most useful.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": false,
     42         "answer": false,
     43         "justification": "The paper makes no measurements; it is a theoretical taxonomy. No proxy-outcome gap exists.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "There is no limitations section. The paper goes directly from 'Design Principles' (Section VI) to 'Future Work' (Section VII) to 'Conclusion' (Section VIII) without discussing limitations.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No threats to validity are discussed. No acknowledgment that the taxonomy might be incomplete, that the literature selection might be biased, or that the failure modes might overlap.",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper does not explicitly state what it does NOT cover. It presents the taxonomy as comprehensive ('fifteen hidden failure modes') without bounding the scope of applicability.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding disclosure. The author is affiliated with Microsoft Security Research but no funding statement is provided.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The author's affiliation with Microsoft Security Research is clearly stated in the header, along with the disclaimer that views do not reflect Microsoft's positions.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The author works at Microsoft, which sells LLM-based products (Azure OpenAI, Copilot). A paper framing LLM failures as solvable engineering problems (rather than fundamental limitations) could serve Microsoft's commercial interests. No independence is established.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement. The author's Microsoft employment is disclosed as affiliation but not as a potential conflict of interest.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Key terms like 'hidden failure modes,' 'system-level,' 'reliability,' and 'drift' are used throughout without formal definition. What makes a failure 'hidden' versus observable is never specified.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The abstract and introduction clearly state the contribution: a system-level taxonomy of 15 failure modes providing 'an analytical foundation for future research on evaluation methodology, AI system robustness, and dependable LLM deployment.'",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper cites 62 references and explicitly positions its taxonomy against existing work on hallucinations, bias, and safety (Cemri et al. [6], Cui et al. [12], Winston and Just [24]), explaining what prior work misses.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "position": {
    116       "argument_quality": {
    117         "argument_internally_consistent": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "The central argument — that LLM failures are system-level rather than model-level — is consistently maintained across all sections, from the taxonomy through design principles.",
    121           "source": "haiku"
    122         },
    123         "counterarguments_addressed": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No counterarguments are addressed. The paper does not consider whether model improvements will render the taxonomy obsolete, whether the system/model distinction holds, or whether 15 categories is the right granularity.",
    127           "source": "haiku"
    128         },
    129         "analogies_appropriate": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Analogies to traditional software engineering (deterministic inference, regression testing) are appropriate and the paper correctly identifies where LLM properties break those assumptions.",
    133           "source": "haiku"
    134         },
    135         "prescriptions_proportional": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "Design principles (canonicalization, verification layers, behavioral monitoring) are high-level and proportional; they are presented as principles for future work rather than definitive mandates.",
    139           "source": "haiku"
    140         },
    141         "evidence_for_claims_cited": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Most factual claims carry citations — e.g., '20-30% output divergence' cites [5], '48.4% LLM-as-judge reversal rate' cites [51], and the discussion of non-determinism in evaluation cites [52].",
    145           "source": "haiku"
    146         },
    147         "alternatives_discussed": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper does not systematically compare its 15-mode framework against the competing taxonomies by Winston and Just [24] or Cui et al. [12], making the novel contribution unclear.",
    151           "source": "haiku"
    152         },
    153         "historical_context_accurate": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Historical framing of LLM development, benchmark limitations, and classical ML evaluation assumptions appears accurate and consistent with the cited literature.",
    157           "source": "haiku"
    158         }
    159       },
    160       "clarity_and_scope": {
    161         "key_terms_defined_precisely": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "'Hidden failure modes' is central to the paper but 'hidden' is never defined. The boundary between 'system-level' and 'model-level' failures is described by example but never formally demarcated.",
    165           "source": "haiku"
    166         },
    167         "engages_with_existing_literature": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "The paper engages substantively with LLM evaluation literature (Chang et al. [9], Mohammadi et al. [53]) and reliability work (Majeed [15]), explaining what gaps remain rather than just listing citations.",
    171           "source": "haiku"
    172         },
    173         "intended_audience_clear": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The intended audience (researchers, practitioners, system architects) is not explicitly stated; the paper spans conceptual taxonomy work and engineering design principles without specifying who should act on which.",
    177           "source": "haiku"
    178         },
    179         "assumptions_stated": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Core assumptions — that system-level framing is the right lens, that 15 categories are sufficient and non-overlapping, that production behavior is fundamentally different from benchmark behavior — are implicit, never stated as assumptions.",
    183           "source": "haiku"
    184         },
    185         "scope_of_applicability_discussed": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The taxonomy is presented as applying broadly to all LLM-based applications with no discussion of where it applies less well, e.g., simple chatbots vs. multi-agent pipelines, or open-source vs. API-accessed models.",
    189           "source": "haiku"
    190         }
    191       }
    192     }
    193   },
    194   "claims": [
    195     {
    196       "claim": "Output divergence in multi-step reasoning tasks has been reported to be greater than 20-30%",
    197       "evidence": "Cites Chen et al. [5] 'Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs'",
    198       "supported": "moderate"
    199     },
    200     {
    201       "claim": "48.4% of LLM-as-judge evaluation pipelines reversed verdicts when response order was mirrored",
    202       "evidence": "Cites Anghel et al. [51] 'Diagnosing Bias and Instability in LLM Evaluation'",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Existing benchmarks measure knowledge and reasoning but fail to capture stability, reproducibility, drift, or workflow integration",
    207       "evidence": "Argued throughout Section IV with citations to benchmark literature; no original measurement conducted by the author",
    208       "supported": "moderate"
    209     },
    210     {
    211       "claim": "Benchmark-aligned improvements often fail to translate into downstream operational stability",
    212       "evidence": "Cites LIFBench [21] and LLM evaluation work [22], but without systematic review of the claim's generality",
    213       "supported": "moderate"
    214     },
    215     {
    216       "claim": "Model version updates can introduce regressions in previously stable workflows without visible benchmark signals",
    217       "evidence": "Cites Herrera-Poyatos et al. [56] on model uncertainty and Hajmohammed et al. [55] on concept drift",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Standardized prompt formats contribute to response stability across multiple task types",
    222       "evidence": "Cites Chen et al. [58] and Sahoo et al. [59] on prompt engineering; these support the claim but are themselves surveys",
    223       "supported": "moderate"
    224     }
    225   ],
    226   "methodology_tags": [
    227     "theoretical"
    228   ],
    229   "key_findings": "The paper proposes a taxonomy of 15 system-level failure modes in LLM applications grouped into: reasoning failures (hallucinations, logical inconsistency, planning collapse, overconfidence, task constraint violations), input/context failures (ambiguous prompts, prompt injection, context truncation, domain mismatch, conflicting instructions), and system/operational failures (tool invocation errors, external tool breakdowns, multi-agent communication failures, business logic misalignment, cost-driven degradation). The central argument is that LLM reliability is a systems engineering problem, not a model quality problem, and that current benchmarks systematically miss production reliability concerns including drift, reproducibility, and behavioral stability. Design principles proposed include input canonicalization, intermediate validation layers, and behavioral drift monitoring distinct from infrastructure telemetry.",
    230   "red_flags": [
    231     {
    232       "flag": "Taxonomy not empirically grounded",
    233       "detail": "The 15 failure modes are asserted without explanation of how they were derived — no systematic literature review protocol, no incident analysis, no grounded theory methodology. The number and three-category grouping appear stipulated rather than discovered."
    234     },
    235     {
    236       "flag": "No comparison to existing taxonomies",
    237       "detail": "Winston and Just [24] and Cui et al. [12] already published competing failure taxonomies for LLM systems. The paper cites them but does not systematically compare, making the novelty of the 15-mode framework unclear."
    238     },
    239     {
    240       "flag": "Secondary citation of key statistics",
    241       "detail": "Key quantitative claims (20-30% divergence, 48.4% reversal rate) are borrowed from other papers without verifying whether those specific studies support the exact claims being made in this paper's context."
    242     },
    243     {
    244       "flag": "No limitations section",
    245       "detail": "The taxonomy's completeness, coverage, and categorization choices are never questioned. There is no acknowledgment that the taxonomy could be incomplete, overlapping, or domain-specific."
    246     },
    247     {
    248       "flag": "Microsoft affiliation without COI disclosure",
    249       "detail": "The author works at Microsoft Security Research, and Microsoft has a significant commercial interest in LLM reliability tooling (Azure OpenAI), but no competing interests statement is included."
    250     }
    251   ],
    252   "cited_papers": [
    253     {
    254       "title": "Why Do Multi-Agent LLM Systems Fail?",
    255       "relevance": "Primary empirical source for multi-agent LLM failure rates; heavily cited as evidence throughout the taxonomy sections"
    256     },
    257     {
    258       "title": "Risk Taxonomy, Mitigation, and Assessment Benchmarks of Large Language Model Systems",
    259       "relevance": "Competing LLM risk taxonomy that the paper positions against"
    260     },
    261     {
    262       "title": "A Taxonomy of Failures in Tool-Augmented LLMs",
    263       "relevance": "Most directly related prior work on tool-use failure classification; cited for tool invocation failure modes"
    264     },
    265     {
    266       "title": "A Survey on Evaluation of Large Language Models",
    267       "relevance": "Used to characterize limitations of existing evaluation approaches and motivate the evaluation gap argument"
    268     },
    269     {
    270       "title": "Evaluation and Benchmarking of LLM Agents: A Survey",
    271       "relevance": "Used to support the argument that reliability dimensions are underrepresented in current agent benchmarks"
    272     },
    273     {
    274       "title": "The Good, The Bad, and The Greedy: Evaluation of LLMs Should Not Ignore Non-Determinism",
    275       "relevance": "Empirical evidence cited for LLM non-determinism as an evaluation validity problem"
    276     },
    277     {
    278       "title": "AIRepr: An Analyst-Inspector Framework for Evaluating Reproducibility of LLMs in Data Science",
    279       "relevance": "Used to support claims about LLM reproducibility gaps in production deployments"
    280     },
    281     {
    282       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    283       "relevance": "Cited as evidence that cost-performance trade-offs exist and can be managed intelligently"
    284     }
    285   ],
    286   "engagement_factors": {
    287     "practical_relevance": {
    288       "score": 2,
    289       "justification": "Practitioners building LLM production systems will recognize the failure modes and may find the taxonomy useful for risk assessment checklists."
    290     },
    291     "surprise_contrarian": {
    292       "score": 1,
    293       "justification": "The system-vs-model reframe is useful but not surprising to anyone who has deployed LLMs in production; the specific failure modes are largely already-known issues."
    294     },
    295     "fear_safety": {
    296       "score": 2,
    297       "justification": "Discusses failures in healthcare, finance, and legal contexts and raises auditability and compliance risks from non-reproducible LLM outputs."
    298     },
    299     "drama_conflict": {
    300       "score": 1,
    301       "justification": "Implicitly critical of the benchmark-focused AI research culture but not a pointed attack on specific systems or companies."
    302     },
    303     "demo_ability": {
    304       "score": 0,
    305       "justification": "Pure taxonomy paper with no tool, dataset, code, or interactive demonstration."
    306     },
    307     "brand_recognition": {
    308       "score": 2,
    309       "justification": "Microsoft affiliation provides institutional credibility, though the author is not a widely recognized named researcher."
    310     }
    311   },
    312   "hn_data": {
    313     "threads": [
    314       {
    315         "hn_id": "46055177",
    316         "title": "Image Diffusion Models Exhibit Emergent Temporal Propagation in Videos",
    317         "points": 124,
    318         "comments": 22,
    319         "url": "https://news.ycombinator.com/item?id=46055177",
    320         "created_at": "2025-11-26T07:55:49Z"
    321       },
    322       {
    323         "hn_id": "44313278",
    324         "title": "S1: Simple Test-Time Scaling",
    325         "points": 3,
    326         "comments": 0,
    327         "url": "https://news.ycombinator.com/item?id=44313278",
    328         "created_at": "2025-06-18T21:06:55Z"
    329       },
    330       {
    331         "hn_id": "43005221",
    332         "title": "s1: Simple Test-Time Scaling",
    333         "points": 2,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=43005221",
    336         "created_at": "2025-02-10T21:13:00Z"
    337       },
    338       {
    339         "hn_id": "42979455",
    340         "title": "Test-time scaling new approach: extra test-time compute improves LLM reasoning",
    341         "points": 2,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=42979455",
    344         "created_at": "2025-02-08T01:23:41Z"
    345       }
    346     ],
    347     "top_points": 124,
    348     "total_points": 131,
    349     "total_comments": 22
    350   }
    351 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs