ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19983B)


      1 {
      2   "paper": {
      3     "title": "Review of Hallucination Understanding in Large Language and Vision Models",
      4     "authors": ["Ho Zheng Yi", "Liang Siyuan", "Tao Dacheng"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.00034",
      8     "doi": "10.48550/arXiv.2510.00034"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No code repository or analysis scripts are mentioned or linked in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No dataset, corpus of reviewed papers, or supplementary data files are released. A survey could release its paper corpus or extracted taxonomy data."
     23       },
     24       "environment_specified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "This is a pure survey paper with no computational experiments requiring environment specification."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No instructions for reproducing the literature search or taxonomy construction are provided."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a survey paper that does not run experiments or report quantitative results of its own."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No experiments are conducted; the paper synthesizes findings from other works."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No original experiments; the paper only reports effect sizes from cited works."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No original experiments are conducted in this survey."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No original experiments are conducted in this survey."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 2 (Related Works) explicitly compares the survey against six prior surveys (Ji et al., Lin et al., Huang et al., Bai et al., Sahoo et al., Kamali et al.), noting what each covers and how this work differs."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The compared surveys are recent (2022-2025), representing the current landscape of hallucination survey literature."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "A survey paper has no system components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No experiments are conducted that would require evaluation metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No system outputs to evaluate; this is a literature review."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No experiments requiring train/test splits."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper provides detailed breakdowns across five lifecycle stages (training data, architecture, inference, loss/optimization, evaluation), each with 3-4 subcategories, as shown in Table 2 and Figures 3-7."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The entire paper is organized around failure cases (hallucination causes). Each section discusses specific failure modes with examples from cited literature."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports negative findings from cited works, e.g., multi-agent debates generally underperforming simpler strategies (Section 4.3.2), attention regularizers not fully fixing glitches (Section 4.2.1)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims to present a unified framework (MOWI, Section 3.1), link hallucinations to lifecycle mechanisms (Section 4), and reveal predictable patterns — all delivered in the paper body."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper's causal claims (e.g., 'hallucinations stem from predictable patterns in data distributions') are grounded in cited empirical and theoretical work rather than original experiments. The survey synthesizes causal evidence from multiple sources, which is appropriate for a review."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 6 (Future Directions) explicitly acknowledges limitations: 'Current research has largely focused on textual language models... less common modalities, such as audio, demand greater attention' and 'the landscape presented here is not exhaustive.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper systematically presents competing explanations, e.g., for directional asymmetry it discusses both architectural explanations (Section 4.1.4, mechanistic interpretability) and data-driven explanations (Papadopoulos et al., Grosse et al.), noting the issue is 'likely' data-inherited rather than purely architectural."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "This is a survey paper with no original measurements; it synthesizes findings from other works."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No models are used in this survey paper."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No prompting is used in this survey."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No experiments are conducted."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper does not describe how papers were selected for review. There is no search strategy, database selection, inclusion/exclusion criteria, or filtering pipeline documented."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The end of Section 6 contains a limitations paragraph discussing gaps in coverage of visual/multimodal dimensions, audio modalities, and the non-exhaustive nature of the survey."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The limitations discuss specific issues: textual bias in current research ('vision-language models often interpreted through textual anchors'), need for 'deeper theoretical analysis, stronger mechanistic understanding,' and acknowledgment that 'many perspectives and nuances remain unexplored.'"
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 3.2 explicitly scopes the three model types (LLMs, LVLMs, TVMs) with precise definitions. Section 6 states what is NOT covered: audio modalities, exhaustive coverage, and purely visual hallucination analysis."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No list of all reviewed papers, search queries, or extracted data is provided for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not describe how the reviewed papers were collected — no search databases, queries, date ranges, or collection methodology is specified."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants; the data sources are published papers (standard literature)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No documentation of the pipeline from initial paper discovery to final inclusion in the review. The reader cannot assess completeness or selection bias."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding sources are mentioned anywhere in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All three authors are listed with their affiliation at Nanyang Technological University, Singapore."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No model evaluation on benchmarks is conducted."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No model evaluation on benchmarks is conducted."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this survey."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this survey."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this survey."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this survey."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this survey."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this survey."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this survey."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a survey paper with no computational method of its own."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "This is a survey paper with no computational experiments."
    288       }
    289     },
    290     "survey_methodology": {
    291       "prisma_or_structured_protocol": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No PRISMA flow diagram, structured search strategy, or systematic review protocol is described. The paper selection appears ad-hoc with no reproducible search queries or database specification."
    295       },
    296       "quality_assessment_of_sources": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The survey does not assess the methodological quality of its source papers. All cited works are treated as equally reliable evidence regardless of their study design, sample size, or rigor."
    300       },
    301       "publication_bias_discussed": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No discussion of publication bias. The survey does not consider whether the cited research skews toward particular findings or whether negative results are underrepresented in the hallucination literature."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "Hallucinations are tightly linked to the frequency of data patterns seen during pretraining — performance can vary by up to 70% based on term frequency.",
    311       "evidence": "Section 4.1.1 cites Razeghi et al. [139] showing 70% performance difference, Kandpal et al. [91] showing 54% accuracy increase as frequency increases from 10^1 to 10^4, and Udandarao et al. [161] on concept frequency and zero-shot generalization.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Multi-LLM debates generally underperform simpler single-agent prompting strategies.",
    316       "evidence": "Section 4.3.2 cites Zhang et al. [185] finding that debate methods 'generally underperform simpler single-agent prompting strategies' and 'increasing agent diversity, volume, or dialogue rounds rarely improved accuracy.'",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "Hallucinations are not isolated anomalies but systematic and predictable artifacts rooted in how models are trained and used.",
    321       "evidence": "Section 5 (Discussion) synthesizes evidence across all five lifecycle stages, identifying four broad themes: out-of-distribution failure, inherited biases, internal dynamics, and evaluation blind spots.",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "The MOWI framework provides a more general and unified definition of hallucinations than prior task-specific definitions.",
    326       "evidence": "Table 1 shows the MOWI framework covering 9 scenarios across LLMs, LVLMs, and TVMs where existing definitions from [80], [84], [8], [90] each fail to cover multiple cases.",
    327       "supported": "moderate"
    328     }
    329   ],
    330   "methodology_tags": ["meta-analysis"],
    331   "key_findings": "This survey proposes the MOWI (Model-Observer-World-Input) framework as a unified, modality-agnostic definition of hallucinations across LLMs, LVLMs, and text-to-vision models. It traces hallucination root causes across five lifecycle stages: training data factors (frequency, memorization, self-consumption, directional asymmetries), architectural limitations (attention glitches, autoregressive constraints, positional encoding, inductive biases), inference mechanisms (few-shot quality, multi-agent debates, exposure bias), loss/optimization dynamics (pretraining, post-training, shortcut learning, heterogeneous preferences), and misleading evaluations (metric blind spots, biased judges, test contamination). The key insight is that hallucinations are systematic and predictable rather than random errors.",
    332   "red_flags": [
    333     {
    334       "flag": "No systematic review protocol",
    335       "detail": "The paper does not describe any structured literature search methodology — no search databases, queries, inclusion/exclusion criteria, or PRISMA flow. Paper selection appears entirely ad-hoc, making it impossible to assess completeness or selection bias."
    336     },
    337     {
    338       "flag": "No quality assessment of sources",
    339       "detail": "All cited works are treated as equally reliable. A workshop paper and a NeurIPS oral are given the same evidentiary weight. The survey risks laundering weak findings by presenting them alongside rigorous ones without differentiation."
    340     },
    341     {
    342       "flag": "Publication date anomaly",
    343       "detail": "The ACM Reference Format lists 'August 2018' as publication date and 'J. ACM 37, 4, Article 111' which appears to be a template placeholder, not actual publication metadata. The paper is from 2025 based on arXiv submission."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "CodeHalu: Investigating Code Hallucinations in LLMs via Execution-based Verification",
    349       "authors": ["Yuchen Tian", "Weixiang Yan", "Qian Yang"],
    350       "year": 2024,
    351       "arxiv_id": "2405.00253",
    352       "relevance": "Directly studies code hallucination in LLMs, relevant to AI coding quality assessment."
    353     },
    354     {
    355       "title": "Importing Phantoms: Measuring LLM Package Hallucination Vulnerabilities",
    356       "authors": ["Arjun Krishna", "Erick Galinkin", "Leon Derczynski", "Jeffrey Martin"],
    357       "year": 2025,
    358       "arxiv_id": "2501.19012",
    359       "relevance": "Measures package hallucination vulnerabilities in LLM-generated code, directly relevant to software reliability."
    360     },
    361     {
    362       "title": "Secret Collusion among AI Agents: Multi-Agent Deception via Steganography",
    363       "authors": ["Sumeet Ramesh Motwani"],
    364       "year": 2024,
    365       "relevance": "Studies covert multi-agent deception including GPT-4 steganographic communication, relevant to AI safety and agentic systems."
    366     },
    367     {
    368       "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors",
    369       "authors": ["Weize Chen"],
    370       "year": 2024,
    371       "relevance": "Studies multi-agent LLM collaboration and emergent behaviors including destructive tendencies."
    372     },
    373     {
    374       "title": "Are emergent abilities of large language models a mirage?",
    375       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    376       "year": 2024,
    377       "relevance": "Challenges claims of emergent LLM abilities as evaluation metric artifacts, relevant to methodology assessment."
    378     },
    379     {
    380       "title": "Cooperate or Collapse: Emergence of Sustainable Cooperation in a Society of LLM Agents",
    381       "authors": ["Giorgio Piatti"],
    382       "year": 2024,
    383       "relevance": "Studies cooperation dynamics in multi-LLM societies, relevant to agentic AI systems."
    384     },
    385     {
    386       "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions",
    387       "authors": ["Lei Huang"],
    388       "year": 2025,
    389       "doi": "10.1145/3703155",
    390       "relevance": "Comprehensive survey on LLM hallucination that this paper positions against, relevant to survey methodology comparison."
    391     },
    392     {
    393       "title": "Red-Teaming LLM Multi-Agent Systems via Communication Attacks",
    394       "authors": ["Pengfei He"],
    395       "year": 2025,
    396       "arxiv_id": "2502.14847",
    397       "relevance": "Studies adversarial attacks on multi-agent LLM systems, relevant to AI safety and agentic workflows."
    398     },
    399     {
    400       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    401       "authors": ["Lianmin Zheng"],
    402       "year": 2023,
    403       "relevance": "Foundational work on LLM-as-judge evaluation, relevant to AI evaluation methodology."
    404     },
    405     {
    406       "title": "Shortcut Learning of Large Language Models in Natural Language Understanding",
    407       "authors": ["Mengnan Du"],
    408       "year": 2023,
    409       "doi": "10.1145/3596490",
    410       "relevance": "Studies shortcut learning in LLMs which affects code and reasoning quality."
    411     }
    412   ]
    413 }

Impressum · Datenschutz