scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19711B)
      1 {
      2   "paper": {
      3     "title": "Consistency Is the Key: Detecting Hallucinations in LLM Generated Text By Checking Inconsistencies About Key Facts",
      4     "authors": ["Raavi Gupta", "Pranav Hari Panicker", "Sumit Bhatia", "Ganesh Ramakrishnan"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2511.12236"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'Our code is available here' with a link. Code availability is claimed."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All datasets used (NQ-Open, HotpotQA, WebQA, WikiBio) are publicly available standard benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions NVIDIA A6000 GPUs and specific model names but does not provide requirements.txt, Dockerfile, or detailed dependency/library version specifications."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README with commands, or scripts for replicating experiments are described in the paper."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., AUC-PR scores like 0.73, 0.80) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CONFACTCHECK outperforms baselines by comparing raw AUC-PR numbers without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Raw AUC-PR differences are shown but no formal effect sizes (Cohen's d, etc.) are reported. Percentage gains are mentioned in the ablation (e.g., '18%'), but without the baseline values they are computed from, so effect-size reporting is only partial."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the sample sizes of the datasets used (e.g., why 3,610 NQ-Open questions, 238 WikiBio articles). No power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported across runs. Results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Four baselines are compared: HaDes, SAC3, SelfCheckGPT (MQAG and Prompt variants), and INSIDE (Table 1)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include INSIDE (ICLR 2024), SelfCheckGPT (EMNLP 2023), SAC3 (EMNLP 2023 Findings), and HaDes (ACL 2022). These are recent and representative methods."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5.4 provides ablation studies on: role of fact alignment vs. uniform distribution check (Table 4), decoding strategies (Table 5), and tagging methods (Table 6)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only AUC-PR is reported as the evaluation metric. No other metrics (AUC-ROC, F1, accuracy) are provided for the main comparison."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the system's hallucination detection outputs is performed. All evaluation is automated against gold labels."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses validation/test splits of established benchmarks (NQ-Open validation split with 3,610 pairs, WikiBio with pre-existing labels)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per dataset (NQ-Open, HotpotQA, WebQA, WikiBio) and per model backbone (LLaMA3.1, Qwen2.5) in Table 1."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 7 (Limitations) discusses specific failure cases including incorrect tagging on correct outputs (the Eisenhower building example) and inefficient question generation."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper acknowledges that SelfCheck-Prompt outperforms CONFACTCHECK in several settings (Table 1), and that INSIDE is faster. The WikiBio results show lower performance than SelfCheck-Prompt."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims higher accuracy and efficiency compared to baselines. Table 1 shows CONFACTCHECK is best or second-best in most settings, and Table 2 shows fewer LLM calls and competitive latency, supporting these claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims are made via ablation studies (Section 5.4.1, Table 4) showing the contribution of the uniform distribution check component. The controlled single-variable manipulation is adequate."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The Limitations section explicitly bounds scope to English language, notes reliance on NER/POS tools limits applicability to low-resource languages, and acknowledges API-based LLM limitations."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the results. For example, no consideration of whether the LLM-as-a-judge (GPT-4.1-mini) introduces bias, or whether the gains are due to the judge rather than the method."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions are given: LLaMA3.1-8B-Instruct, Qwen2.5-7B-Instruct, Phi-3 family (3.8B, 7B, 13B), GPT-4.1-mini. These are sufficiently specific identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompting approaches in natural language (e.g., 'querying GPT4.1-mini using few-shot prompting') but does not provide the actual prompt text used for question generation, fact alignment judging, or other LLM calls."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Key hyperparameters are reported: temperature 0 for probing, temperature 1 for initial generation, beam size 5, KS test significance level 0.05, top-5 tokens for distribution check, 20 samples for SelfCheckGPT, β1=β2=0.95."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The pipeline is described in detail in Section 3 and Figure 2: NER/POS tagging → question generation (T5-based) → LLM probing → fact alignment via LLM-as-judge → KS test distribution check."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 describes dataset usage, Section 4.3 describes how golden labels are assigned using GPT-4.1-mini as judge, and Appendix B describes how CONFACTCHECK is applied at sentence level with coreference resolution."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 is a dedicated Limitations section with substantive discussion of failure cases, language constraints, and API limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Limitations section discusses specific threats: incorrect NER tags on correct outputs (with concrete example), question generation ambiguity, English-only evaluation, and dependency on token probability access."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states it has only been tested for English, that it requires open-source LLMs with token probability access, and that NER/POS tools constrain applicability to low-resource languages."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The generated outputs, extracted key facts, and hallucination scores are not released. Only aggregate AUC-PR scores are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes how datasets were selected and used, including dataset sizes and splits. Section 4.3 describes how golden labels are generated."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; all data comes from standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from input question → LLM generation → NER/POS tagging → question generation → fact regeneration → alignment checking → distribution check is documented in Section 3 and Figure 2."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Columbia University, IIT Bombay, and Adobe (MDSR Lab)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "One author is affiliated with Adobe, which has commercial interest in LLM reliability. No funding disclosure means independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses LLaMA3.1 and Qwen2.5 on QA benchmarks like NQ-Open and HotpotQA without stating training data cutoff dates for these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the LLMs' training data contains the QA benchmark answers. NQ-Open (2019) and HotpotQA (2018) were released before these models' training cutoffs, so overlap is plausible."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "All four benchmarks (NQ-Open 2019, HotpotQA 2018, WebQA 2013, WikiBio) were published well before the models' training cutoffs. No contamination discussion is provided, though the concern is partly mitigated because the paper measures hallucination detection rather than QA accuracy."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 2 reports average inference time per sample (e.g., 9.51s for LLaMA3.1) and average number of LLM calls (3.8) for CONFACTCHECK and all baselines."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (NVIDIA A6000 GPUs) but total compute budget (GPU hours, total API spend for GPT-4.1-mini judge calls) is not quantified."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CONFACTCHECK outperforms most baselines on QA datasets while being either the best or second-best method in all settings.",
    286       "evidence": "Table 1 shows AUC-PR scores across 4 datasets and 2 LLM backbones (8 settings). CONFACTCHECK is best in 4/8 settings and second-best in the remaining, with SelfCheck-Prompt as the main competitor.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "CONFACTCHECK achieves significant speedups over self-checking baselines (up to 1.4x over SelfCheck-Prompt, up to 3x over SAC3).",
    291       "evidence": "Table 2 shows inference times: CONFACTCHECK 9.51s vs SelfCheck-Prompt 13.35s (LLaMA3.1), and 9.03s vs SAC3 29.37s (Qwen2.5). However, INSIDE is faster at 4.89s.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The uniform distribution check step leads to significant performance gains of up to 18%.",
    296       "evidence": "Table 4 shows ablation: fact alignment alone gets 0.56 AUC-PR on WebQA (LLaMA3.1), adding distribution check raises it to 0.66, a relative gain of about 18% (0.10 absolute).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "CONFACTCHECK works consistently across different model scales.",
    301       "evidence": "Table 3 shows AUC-PR scores for Phi-3 family (3.8B, 7B, 13B) ranging 0.62-0.76 across datasets, showing moderate consistency but no clear scaling trend.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "CONFACTCHECK is a hallucination detection method that extracts key facts (via NER/POS tagging) from LLM outputs, generates targeted questions, and checks consistency of regenerated answers. It achieves competitive AUC-PR scores across 4 datasets (NQ-Open, HotpotQA, WebQA, WikiBio) with 2 LLM backbones, while requiring only ~3.8 LLM calls per sample compared to 5-20 for baselines. The method combines fact alignment checking with a KS-test-based uniform distribution check on token probabilities, with the latter contributing up to 18% improvement in ablation studies.",
    307   "red_flags": [
    308     {
    309       "flag": "No variance or significance testing",
    310       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or significance tests. The margins between methods are often small (e.g., 0.73 vs 0.76) making it impossible to assess whether differences are meaningful."
    311     },
    312     {
    313       "flag": "Single evaluation metric",
    314       "detail": "Only AUC-PR is reported. Other standard metrics (AUC-ROC, F1 at various thresholds, accuracy) would provide a more complete picture of detection performance."
    315     },
    316     {
    317       "flag": "GPT-4.1-mini as gold label generator and pipeline component",
    318       "detail": "GPT-4.1-mini is used both to generate gold labels for QA datasets (comparing LLM outputs to golden answers) AND as the fact alignment judge in the pipeline. This dual role could introduce systematic bias favoring the method."
    319     },
    320     {
    321       "flag": "No contamination discussion",
    322       "detail": "All QA benchmarks were published before the evaluated models' training cutoffs. The LLMs may have memorized benchmark answers, which could affect both the hallucination rates and the detection method's behavior."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "SelfCheckGPT: Zero-resource black-box hallucination detection for generative large language models",
    328       "authors": ["Potsawee Manakul", "Adian Liusie", "Mark Gales"],
    329       "year": 2023,
    330       "relevance": "Key baseline for self-consistency-based hallucination detection without external knowledge."
    331     },
    332     {
    333       "title": "SAC3: Reliable hallucination detection in black-box language models via semantic-aware cross-check consistency",
    334       "authors": ["Jiaxin Zhang", "Zhuohang Li", "Kamalika Das", "Bradley Malin", "Sricharan Kumar"],
    335       "year": 2023,
    336       "relevance": "Baseline method using semantic-aware cross-checking for hallucination detection."
    337     },
    338     {
    339       "title": "INSIDE: llms' internal states retain the power of hallucination detection",
    340       "authors": ["Chao Chen", "Kai Liu", "Ze Chen"],
    341       "year": 2024,
    342       "relevance": "Baseline using internal LLM representations (EigenScore) for hallucination detection."
    343     },
    344     {
    345       "title": "FActScore: Fine-grained atomic evaluation of factual precision in long form text generation",
    346       "authors": ["Sewon Min", "Kalpesh Krishna", "Xinxi Lyu"],
    347       "year": 2023,
    348       "relevance": "Related approach decomposing outputs into atomic facts for factuality evaluation."
    349     },
    350     {
    351       "title": "Chain-of-verification reduces hallucination in large language models",
    352       "authors": ["Shehzaad Dhuliawala", "Mojtaba Komeili", "Jing Xu"],
    353       "year": 2024,
    354       "relevance": "Verification-based approach to reducing LLM hallucinations through targeted questioning."
    355     },
    356     {
    357       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    358       "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"],
    359       "year": 2023,
    360       "relevance": "Comprehensive survey on LLM hallucination taxonomy and detection methods."
    361     },
    362     {
    363       "title": "Self-refine: Iterative refinement with self-feedback",
    364       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    365       "year": 2023,
    366       "relevance": "Self-refinement approach for LLM output improvement relevant to hallucination mitigation."
    367     },
    368     {
    369       "title": "Judging llm-as-a-judge with mt-bench and chatbot arena",
    370       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    371       "year": 2023,
    372       "relevance": "Foundational work on LLM-as-a-judge paradigm used in this paper's pipeline."
    373     },
    374     {
    375       "title": "Hallucination is inevitable: An innate limitation of large language models",
    376       "authors": ["Ziwei Xu", "Sanjay Jain", "Mohan Kankanhalli"],
    377       "year": 2024,
    378       "relevance": "Theoretical analysis showing LLMs will always hallucinate, motivating detection work."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs