ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (17958B)


      1 {
      2   "paper": {
      3     "title": "A comprehensive taxonomy of hallucinations in Large Language Models",
      4     "authors": ["Manuel Cossio"],
      5     "year": 2025,
      6     "arxiv_id": "2508.01781"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No code or analysis scripts are released. No repository URL is provided."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No dataset or corpus of reviewed papers is released. The survey does not provide a structured dataset of its findings."
     19       },
     20       "environment_specified": {
     21         "applies": false,
     22         "answer": false,
     23         "justification": "This is a narrative survey paper with no computational experiments requiring an environment specification."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No instructions for reproducing the literature review are provided. No search queries, databases searched, or systematic review protocol is described."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": false,
     34         "answer": false,
     35         "justification": "This is a narrative survey with no original experiments or statistical analyses."
     36       },
     37       "significance_tests": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "No statistical comparisons are made in this survey."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No original quantitative results are reported."
     46       },
     47       "sample_size_justified": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No experiments with sample sizes are conducted."
     51       },
     52       "variance_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No original experimental results are reported."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "This is a taxonomy/survey paper, not an evaluation of a system. No baselines are applicable."
     63       },
     64       "baselines_contemporary": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No system evaluation is performed."
     68       },
     69       "ablation_study": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No system with components to ablate."
     73       },
     74       "multiple_metrics": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No evaluation of a system is performed."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No system outputs are produced that would require human evaluation."
     83       },
     84       "held_out_test_set": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No experiments are conducted."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper provides detailed per-category breakdowns of hallucination types (Tables 1, 2, 3), organizing findings by type (intrinsic/extrinsic, factuality/faithfulness) and by cause (data, model, prompt factors)."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper extensively discusses failure cases of LLMs including specific examples of hallucinations (e.g., Google Bard's JWST claim, ChatGPT defamation cases) in Sections 4.1-4.9."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 7.3 discusses limitations of existing evaluation metrics and benchmarks, and Section 8 notes that no single mitigation technique fully eliminates hallucinations."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims to provide a comprehensive taxonomy of hallucinations, which the paper delivers through detailed categorization in Sections 2-9. Claims about theoretical inevitability are supported by discussing the formal framework from reference [100]."
    110       },
    111       "causal_claims_justified": {
    112         "applies": false,
    113         "answer": false,
    114         "justification": "The paper does not make original causal claims. It reports causal theories from cited literature (e.g., causes of hallucinations in Section 5) but does not present its own causal evidence."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes broad claims about LLM hallucinations being 'inevitable' and discusses them as universal to all computable LLMs, but does not bound these generalizations to specific model families, sizes, or deployment contexts tested by the cited studies."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": false,
    123         "answer": false,
    124         "justification": "As a pure taxonomy/survey with no original empirical results, alternative explanations for observed results are not applicable."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": false,
    130         "answer": false,
    131         "justification": "No models are used in experiments. The paper is a survey."
    132       },
    133       "prompts_provided": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "No prompting is used. This is a survey paper."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No experiments are conducted."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper does not describe any systematic literature review methodology: it documents no search queries, databases, inclusion/exclusion criteria, or filtering pipeline. It is unclear how the surveyed papers were selected."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "There is no dedicated limitations section. The conclusions (Section 10) do not discuss limitations of the survey itself."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No threats to validity of this survey are discussed. There is no acknowledgment of potential selection bias in the papers reviewed or limitations of the narrative approach."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what is out of scope. No boundaries are set regarding which LLM families, time periods, or domains are excluded from the taxonomy."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw data (e.g., list of all papers reviewed, extraction sheets) is available for verification."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The paper does not describe how the literature was identified or collected. No systematic review protocol is documented."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. Data source is published literature, but the selection methodology is not a recruitment question."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No data pipeline is documented. The paper goes directly from introduction to taxonomy without describing how papers were identified, screened, or selected."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding information is disclosed anywhere in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The author's affiliation with Universitat de Barcelona is clearly stated on the first page."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding is disclosed, so funder independence cannot be assessed. The missing disclosure is therefore scored as not satisfied."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "This is a survey paper that does not evaluate any model on a benchmark."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "No model evaluation is performed."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "No model evaluation is performed."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this survey."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "Survey paper with no method whose cost could be reported."
    274       },
    275       "compute_budget_stated": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "Survey paper with no computational experiments."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "Hallucination is an inherent and inevitable limitation of computable LLMs, irrespective of architecture or training.",
    285       "evidence": "Section 2.2 presents three theorems and a corollary from reference [100] (Xu et al.) using diagonalization arguments from computability theory to argue inevitability.",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "LLM hallucinations can be categorized into intrinsic vs. extrinsic and factuality vs. faithfulness dimensions.",
    290       "evidence": "Section 3 presents these taxonomies with citations to multiple prior works [7, 70, 79, 42, 50, 64, 96, 61] and Table 2 summarizes the categories.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "Hallucination causes span data-related, model-related, and prompt-related factors.",
    295       "evidence": "Section 5 and Table 3 detail 18 specific factors across these three categories with supporting citations.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "A one-size-fits-all solution for hallucination is unlikely to be effective due to diverse manifestation types.",
    300       "evidence": "Section 4.10 argues this based on the enumeration of different hallucination types across domains, but provides no empirical comparison of mitigation strategies.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Increased training compute correlates with higher model accuracy and reduced hallucination propensity.",
    305       "evidence": "Section 9.3.1 discusses Epoch AI dashboard data showing a ~12 percentage-point accuracy increase per 10x increase in compute on GPQA Diamond, but this is reported from external dashboards, not original analysis.",
    306       "supported": "moderate"
    307     }
    308   ],
    309   "methodology_tags": ["meta-analysis", "qualitative"],
    310   "key_findings": "This paper presents a comprehensive taxonomy of LLM hallucinations, categorizing them along intrinsic/extrinsic and factuality/faithfulness dimensions, with detailed sub-categories including factual errors, contextual inconsistencies, logical errors, temporal disorientation, and ethical violations. It synthesizes formal theoretical arguments for the inevitability of hallucination in computable LLMs based on diagonalization proofs, and surveys evaluation benchmarks (TruthfulQA, HalluLens, FActScore) and mitigation strategies (RAG, tool augmentation, guardrails). The paper also covers cognitive/human factors affecting hallucination perception and introduces web-based monitoring resources.",
    311   "red_flags": [
    312     {
    313       "flag": "No systematic review methodology",
    314       "detail": "The paper presents itself as a comprehensive survey but does not describe any systematic literature search methodology: no databases searched, no search queries, no inclusion/exclusion criteria, no PRISMA-style flow diagram. The selection of 109 references appears ad hoc, making it impossible to assess completeness or bias in coverage."
    315     },
    316     {
    317       "flag": "No quality assessment of reviewed studies",
    318       "detail": "The survey summarizes findings from cited papers without any structured quality assessment of those sources. Claims from individual papers are presented as established facts without evaluating the rigor of the underlying evidence. This risks laundering weak results through aggregation."
    319     },
    320     {
    321       "flag": "Unbounded generalizations",
    322       "detail": "The paper makes sweeping claims about hallucination inevitability and its implications for 'all computable LLMs' based on theoretical arguments from a single reference [100], without discussing the practical limitations or assumptions of these formal proofs."
    323     },
    324     {
    325       "flag": "No limitations section",
    326       "detail": "The paper lacks any discussion of its own limitations as a survey, such as potential selection bias in papers reviewed, language restrictions, or temporal coverage boundaries."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "CodeMirage: Hallucinations in Code Generated by Large Language Models",
    332       "authors": ["Vibhor Agarwal", "Yulong Pei", "Salwa Alamir", "Xiaomo Liu"],
    333       "year": 2024,
    334       "arxiv_id": "2408.08333",
    335       "relevance": "Directly addresses hallucination in code generation by LLMs, a key concern for AI-assisted programming."
    336     },
    337     {
    338       "title": "HalluLens: LLM Hallucination Benchmark",
    339       "authors": ["Yejin Bang", "Ziwei Ji", "Alan Schelten"],
    340       "year": 2025,
    341       "arxiv_id": "2504.17550",
    342       "relevance": "Comprehensive hallucination benchmark with taxonomy-aware evaluation relevant to LLM quality assessment."
    343     },
    344     {
    345       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    346       "authors": ["Maksym Andriushchenko"],
    347       "year": 2024,
    348       "arxiv_id": "2410.09024",
    349       "relevance": "Benchmark for measuring harmful outputs from LLM agents, relevant to AI safety evaluation."
    350     },
    351     {
    352       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    353       "authors": ["Timo Schick"],
    354       "year": 2023,
    355       "relevance": "Foundational work on tool-augmented LLMs, relevant to agentic AI capabilities and hallucination mitigation."
    356     },
    357     {
    358       "title": "Hallucination is Inevitable: An Innate Limitation of Large Language Models",
    359       "year": 2024,
    360       "relevance": "Formal theoretical framework proving hallucination inevitability in computable LLMs, fundamental to understanding LLM limitations."
    361     },
    362     {
    363       "title": "CodeHaluEval: A Large Language Models Hallucination Benchmark for Code Generation",
    364       "year": 2024,
    365       "relevance": "Benchmark specifically targeting hallucination evaluation in code-generating LLMs."
    366     },
    367     {
    368       "title": "Sparks of Artificial General Intelligence: Early Experiments with GPT-4",
    369       "authors": ["Sebastien Bubeck"],
    370       "year": 2023,
    371       "relevance": "Early comprehensive evaluation of GPT-4 capabilities and limitations relevant to LLM assessment methodology."
    372     },
    373     {
    374       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    375       "year": 2022,
    376       "relevance": "Key benchmark for evaluating LLM truthfulness, widely used in AI safety and capability assessment."
    377     },
    378     {
    379       "title": "Building Guardrails for Large Language Models",
    380       "authors": ["Yi Dong"],
    381       "year": 2024,
    382       "arxiv_id": "2402.01822",
    383       "relevance": "Directly relevant to LLM safety mechanisms and deployment guardrails."
    384     },
    385     {
    386       "title": "Mind the Confidence Gap: Overconfidence, Calibration, and Distractor Effects in Large Language Models",
    387       "authors": ["Prateek Chhikara"],
    388       "year": 2025,
    389       "arxiv_id": "2502.11028",
    390       "relevance": "Addresses LLM calibration and overconfidence, relevant to understanding reliability of AI-generated outputs."
    391     }
    392   ]
    393 }

Impressum · Datenschutz