ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27928B)


      1 {
      2   "paper": {
      3     "title": "Chain-of-Thought Prompting Obscures Hallucination Cues in Large Language Models: An Empirical Evaluation",
      4     "authors": [
      5       "Jiahao Cheng",
      6       "Tiancheng Su",
      7       "Jia Yuan",
      8       "Guoxiu He",
      9       "Jiawei Liu",
     10       "Xinqi Tao",
     11       "Jingwen Xie",
     12       "Huaxia Li"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2506.17088"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract states 'Code is publicly available at: https://github.com/ECNU-Text-Computing/cot-hallu-detect' providing a direct GitHub repository URL."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available benchmark datasets (CommonsenseQA, ARC-Challenge, MMLU, TruthfulQA, TriviaQA, PopQA, HaluEval, CNN/Daily Mail) and specifies exactly which splits are used. All datasets are publicly accessible."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using vLLM for inference (Appendix F) and LLaMA-Factory for fine-tuning, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While the code repository is linked, the paper itself does not contain step-by-step reproduction instructions. The implementation details in Appendix F describe settings but not a reproducible workflow a researcher could follow without examining the repository."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Results in Tables 1-11 report only point estimates (single AUROC values, accuracy percentages). No confidence intervals, error bars, or ± notation are provided for any results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper uses Kolmogorov-Smirnov (K-S) tests to evaluate whether score distributions differ significantly between conditions (Section 5.2, Figure 4, with p < 0.01 stated). The pilot experiment also uses K-S tests (Figure 2, p < 0.01)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper reports raw differences in AUROC and accuracy scores but does not report standardized effect sizes (e.g., Cohen's d, odds ratios). While percentage differences can be computed from the tables, the paper does not explicitly contextualize the magnitude of effects."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper states dataset sizes (e.g., 1,221 samples for CommonsenseQA, 817 for TruthfulQA) but does not justify why these sizes are sufficient or provide any power analysis. The total of 768 experimental configurations is stated but not justified as adequate."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviations, variance measures, or results from multiple runs are reported. All tables present single-run point estimates. The paper does not mention running experiments multiple times with different seeds."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper systematically compares CoT prompting conditions against a 'base' (no CoT) baseline scenario across all experiments (Tables 1-11). Each detection method's performance without CoT serves as the baseline."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The detection methods evaluated include recent work: LLM-Check (NeurIPS 2024), INSIDE (ICLR 2024), In-Context Sharpness (ICML 2024), Verbalized Certainty (ACL 2024), and MIND (ACL 2024 Findings). These are contemporary baselines."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper systematically varies three CoT prompting methods (CoT, LtM, MRPP) across multiple dimensions, effectively ablating the type of reasoning strategy. The comparison between different detection method categories (consistency-based, internal-state-based, self-evaluation-based) also serves as an ablation of detection approach."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses multiple evaluation metrics: AUROC for classification performance, Spearman's rank correlation for monotonic relationships, Expected Calibration Error (ECE) for confidence alignment, plus accuracy and entropy in the pilot experiments (Section 4.1, Appendix E)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 5 states: 'We validate our automated metrics by having two annotators manually annotate 150 samples, achieving a Cohen's Kappa of 0.863.' This validates the automated evaluation pipeline with human judgment."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper uses standard test/validation splits: ARC-Challenge test split, MMLU test split, CommonsenseQA validation split (since test labels unavailable), TruthfulQA validation subset, TriviaQA validation subset, PopQA test subset (Section 3.1, 4.2)."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per dataset (HaluEval, PopQA, TriviaQA, CNN/Daily Mail, TruthfulQA), per LLM (Llama-8B, Mistral-7B, Llama-70B, DeepSeek), per detection method (8 methods), and per annotation protocol (Textual vs. Semantic) across Tables 3, 7-11."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.3 provides a qualitative failure example: the TriviaQA question about Walter Matthau's first movie where LtM prompting causes Verbalized Certainty to increase from 0.2 to 1.0 despite an incorrect answer. The paper also discusses where consistency-based methods are more robust."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The central finding IS a negative result: CoT prompting degrades hallucination detection. The paper also reports that not all CoT variants consistently improve performance (Section 5.1), and that consistency-based methods show greater robustness (a nuanced finding contradicting a simple 'everything degrades' narrative)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims that CoT prompting 'obscures critical signals used for detection, impairing the effectiveness of various detection methods' — supported by Table 3 showing 465/768 configurations with AUROC degradation. The claim about consistency-based methods being more robust is supported by Tables 3-4."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims about CoT prompting causing changes in detection performance. The experimental design is adequate: it uses controlled comparisons where the only variable changed is the prompting method (base vs. CoT/LtM/MRPP), with all other factors held constant. The K-S tests confirm distributional shifts in internal states."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims CoT prompting 'Obscures Hallucination Cues in Large Language Models' broadly, but the study only uses 4 open-source LLMs (all ≤70B parameters) and cannot test closed-source models. The Limitations section acknowledges this ('unable to conduct experiments on larger-scale LLMs' and 'prevented us from evaluating... closed-source LLMs') but the title and abstract do not bound the claims to open-source models."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper discusses several alternative explanations: that the effect might be due to evaluation metric limitations (Section 5.1, validated with human annotation), that different LLMs respond differently due to 'differences in response generation tendencies' (Section 5.3), and that the effect varies by detection paradigm type. The Limitations section discusses metric choice."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Exact model names with versions are provided: Llama-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Llama-3.1-70B-Instruct, DeepSeek-R1-Distill-Llama-8B, with HuggingFace URLs in Appendix F.2. These are specific enough to reproduce."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix C provides the actual prompt templates used: CoT ('Think about it step by step.'), LtM ('What subproblems must be solved before answering the inquiry?'), MRPP ('You need to perform multi-step reasoning, with each step carrying out as many basic operations as possible.'). The system prompt is stated: 'You are a helpful assistant.' (Appendix F.2)."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix F reports: temperature=0.5, max_tokens=512 for reasoning generation; INSIDE sampling n=15, temperature=0.5, top_p=0.99, top_k=5; SelfCheckGPT n=20, temperature=1; Sharpness alpha=1.0; specific layer numbers for internal-state methods; LoRA fine-tuning: lr=5e-05, batch_size=32, epochs=5, lora_rank=8, lora_alpha=16."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The experiments involve direct prompting of LLMs without any agent framework, tool use, or multi-step orchestration."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Sections 3.1 and 4.2 describe which splits and subsets are used for each dataset (e.g., CommonsenseQA validation split with 1,221 samples, TriviaQA rc.wikipedia.nocontext validation, PopQA test in closed-book setting). Appendix F describes the normalization for token probabilities (Eq. 8) and ECE normalization (3%-97% range scaling)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "There is a dedicated 'Limitations' section after the Discussion and Conclusion that discusses computational constraints, inability to test closed-source LLMs, and evaluation metric limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The Limitations section discusses specific threats: 'Due to computational resource constraints, we are unable to conduct experiments on larger-scale LLMs, which limits the generalizability of our conclusions.' It also addresses the specific limitation of Exact Match and ROUGE-L metrics for assessing response correctness in QA tasks."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly states: 'Addressing the impact of CoT prompting on hallucination detection is beyond the scope of this study and is left for future work.' It also notes the limitation to open-source LLMs and the specific detection methods that do not rely on external knowledge bases (Section 2)."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "While the code repository is linked, the paper does not explicitly state that raw experimental outputs (model predictions, hallucination scores, per-sample results) are available for independent verification. Only aggregated metrics are presented in tables."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is well-described: all datasets are standard public benchmarks with specific splits identified (Section 3.1, 4.2). The pipeline for generating responses, computing hallucination scores, and evaluating correctness is documented in Section 4.1 and Appendix F."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants were recruited. The two annotators for validation (150 samples) are mentioned but this is a minor validation component using standard benchmark data, not a human subjects study."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Figure 3 illustrates the full experimental pipeline: LLM response generation → correctness measurement (textual/semantic) → hallucination detection scoring → evaluation across three dimensions. Section 4.1 and Appendix E describe each step."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The Acknowledgment section lists funding sources: National Natural Science Foundation of China (72204087), Shanghai Planning Office of Philosophy and Social Science Youth Project (2022ETQ001), Chenguang Program, Shanghai Pujiang Program (23PJC030), Young Elite Scientists Sponsorship Program by CAST, and Fundamental Research Funds for Central Universities."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: East China Normal University, Wuhan University, and Xiaohongshu Inc. Three authors are from Xiaohongshu (a commercial company), which is disclosed."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The funders are Chinese government research foundations and academic programs (NSFC, Shanghai municipal programs, CAST). None have a financial stake in whether CoT prompting helps or hinders hallucination detection."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper. Three authors are affiliated with Xiaohongshu Inc. (a commercial tech company), but no declaration of competing interests is made."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state the training data cutoff dates for any of the four LLMs evaluated (Llama-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Llama-3.1-70B-Instruct, DeepSeek-R1-Distill-Llama-8B), despite evaluating them on benchmarks that may be in their training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of potential train/test overlap. The benchmarks used (CommonsenseQA from 2019, ARC from 2018, MMLU from 2021, TruthfulQA from 2022, TriviaQA from 2017, HaluEval from 2023) are all publicly available and were likely in the training data of models released in 2024-2025."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of benchmark contamination risk. Most benchmarks (TriviaQA 2017, ARC 2018, CommonsenseQA 2019, MMLU 2021) are old enough to almost certainly be in the training data of models from 2024-2025. This is a notable omission given that contamination could affect the base accuracy numbers."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "This is not a human subjects study. The validation of 150 samples by two annotators is a minor annotation task, not a human study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human subjects study. The paper includes an Ethics Statement but it confirms 'this research does not involve the collection or use of personal data.'"
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference costs, latency, or computational time per experiment are reported despite running 768 experimental configurations across multiple LLMs including a 70B parameter model. Consistency-based methods requiring 15-20 samples per input are noted as having 'high computational overhead' but no quantification is given."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget is stated. The paper does not mention GPU hours, total time, or hardware used for the experiments despite running 768 configurations across 4 LLMs (including a 70B model) with multiple detection methods requiring multiple forward passes."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "CoT prompting improves LLM accuracy but simultaneously degrades hallucination detection performance across multiple detection methods.",
    295       "evidence": "Table 1 shows accuracy improvements with CoT (e.g., Llama accuracy increases from 90.5% to 97.67% on ARC-Challenge). Table 3 shows 465 out of 768 experimental configurations exhibit AUROC degradation for hallucination detection under CoT prompting (Section 5.3).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "CoT prompting significantly alters the probability distribution of final answer tokens, making hallucinated answers appear more confident.",
    300       "evidence": "Table 1 shows entropy consistently decreases after CoT (e.g., Llama entropy drops from 23.96 to 6.63 on ARC-Challenge). Figure 2 shows K-S tests confirming significant distributional shifts (p < 0.01). Section 3.2 discusses how CoT increases confidence even for incorrect predictions.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Self-evaluation-based detection methods are most adversely affected by CoT prompting, while consistency-based methods show greater robustness.",
    305       "evidence": "Table 3 shows SelfCheckGPT-Prompt and Verbalized Certainty have the highest counts of AUROC degradation (59/96 and 67/96 respectively), while EigenScore and SelfCheckGPT-NLI show lower degradation counts (48/96 and 56/96). Section 5.3 provides qualitative explanation for why.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Advanced hallucination detection methods lose their advantage over simple perplexity baselines when CoT prompting is applied.",
    310       "evidence": "Section 5.3 states: 'when CoT prompts are introduced, many detection methods fail to surpass the perplexity baseline, and some perform worse.' Tables 7-11 show specific AUROC values where perplexity-based methods perform comparably to or better than advanced methods under CoT.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "CoT prompting leads to increased Expected Calibration Error (ECE) across most experimental settings.",
    315       "evidence": "Section 5.4 states: 'CoT prompting leads to an increase in ECE across most experimental settings, suggesting that the confidence of detection methods becomes less reliable.' Detailed results are deferred to Appendix G.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "CoT prompting creates a trade-off in LLMs: while it improves task accuracy by encouraging step-by-step reasoning, it simultaneously obscures the internal signals used by hallucination detection methods. Across 768 experimental configurations spanning 4 LLMs, 5 datasets, textual and semantic annotation protocols, 8 detection methods, and 3 CoT variants, more than half (465 of 768) showed degraded detection performance (measured by AUROC). Self-evaluation-based and internal-state-based detection methods are most vulnerable, while consistency-based methods that rely on multi-sample reasoning show greater robustness. Notably, advanced detection methods lose their typical advantage over simple perplexity baselines when CoT prompting is used.",
    323   "red_flags": [
    324     {
    325       "flag": "No variance or uncertainty reporting",
    326       "detail": "All experiments appear to be single-run with no error bars, confidence intervals, or standard deviations reported across any of the 768 configurations. Given that LLM outputs involve stochastic sampling (temperature=0.5), results could vary across runs."
    327     },
    328     {
    329       "flag": "No benchmark contamination discussion",
    330       "detail": "Several benchmarks used (TriviaQA 2017, ARC 2018, CommonsenseQA 2019, MMLU 2021) are old enough to likely be in the training data of models from 2024-2025. Contamination could inflate base accuracy and potentially affect the accuracy improvements attributed to CoT. The paper does not acknowledge this risk."
    331     },
    332     {
    333       "flag": "Computational cost not disclosed",
    334       "detail": "Running 768 experimental configurations across 4 LLMs (including 70B) with methods requiring multiple forward passes (INSIDE: 15 samples, SelfCheckGPT: 20 samples) represents substantial compute. No hardware, GPU hours, or cost information is provided, making it hard to assess reproducibility and practical relevance."
    335     },
    336     {
    337       "flag": "Title overgeneralizes beyond tested scope",
    338       "detail": "The title claims CoT 'Obscures Hallucination Cues in Large Language Models' broadly, but only 4 open-source models (≤70B) were tested. No closed-source models or models >70B were evaluated. The Limitations section acknowledges this but the title and abstract do not scope the claims."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "Chain of thought prompting elicits reasoning in large language models",
    344       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed H. Chi", "Quoc V Le", "Denny Zhou"],
    345       "year": 2022,
    346       "relevance": "Foundational work on chain-of-thought prompting, central to the paper's investigation of CoT effects on LLM behavior."
    347     },
    348     {
    349       "title": "Large language models are zero-shot reasoners",
    350       "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"],
    351       "year": 2022,
    352       "relevance": "Introduces zero-shot CoT prompting, one of the three CoT methods evaluated in this study."
    353     },
    354     {
    355       "title": "SelfcheckGPT: Zero-resource black-box hallucination detection for generative large language models",
    356       "authors": ["Potsawee Manakul", "Adian Liusie", "Mark Gales"],
    357       "year": 2023,
    358       "relevance": "Key hallucination detection baseline evaluated in the study; represents consistency-based detection methods."
    359     },
    360     {
    361       "title": "LLM-check: Investigating detection of hallucinations in large language models",
    362       "authors": ["Gaurang Sriramanan", "Siddhant Bharti", "Vinu Sankar Sadasivan", "Shoumik Saha", "Priyatham Kattakinda", "Soheil Feizi"],
    363       "year": 2024,
    364       "relevance": "Internal-state-based hallucination detection method that is a key evaluation target in this paper."
    365     },
    366     {
    367       "title": "INSIDE: LLMs' internal states retain the power of hallucination detection",
    368       "authors": ["Chao Chen", "Kai Liu", "Ze Chen", "Yi Gu", "Yue Wu", "Mingyuan Tao", "Zhihang Fu", "Jieping Ye"],
    369       "year": 2024,
    370       "relevance": "Hybrid hallucination detection method combining internal-state and consistency approaches, evaluated in this study."
    371     },
    372     {
    373       "title": "Confidence under the hood: An investigation into the confidence-probability alignment in large language models",
    374       "authors": ["Abhishek Kumar", "Robert Morabito", "Sanzhar Umbet", "Jad Kabbara", "Ali Emami"],
    375       "year": 2024,
    376       "relevance": "Introduces Verbalized Certainty detection method, one of the detection approaches most affected by CoT in this study."
    377     },
    378     {
    379       "title": "In-context sharpness as alerts: An inner representation perspective for hallucination mitigation",
    380       "authors": ["Shiqi Chen", "Miao Xiong", "Junteng Liu", "Zhengxuan Wu", "Teng Xiao", "Siyang Gao", "Junxian He"],
    381       "year": 2024,
    382       "relevance": "Internal-state-based detection method (Sharpness) evaluated as one of the eight detection approaches in this study."
    383     },
    384     {
    385       "title": "Language models don't always say what they think: Unfaithful explanations in chain-of-thought prompting",
    386       "authors": ["Miles Turpin", "Julian Michael", "Ethan Perez", "Samuel R. Bowman"],
    387       "year": 2023,
    388       "relevance": "Demonstrates unfaithfulness of CoT explanations, directly relevant to understanding how CoT affects LLM internal states and detection."
    389     },
    390     {
    391       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    392       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    393       "year": 2025,
    394       "arxiv_id": "2501.12948",
    395       "relevance": "Reasoning-oriented LLM architecture whose distilled variant is one of the four models evaluated in this study."
    396     },
    397     {
    398       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    399       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    400       "year": 2022,
    401       "relevance": "Key hallucination evaluation benchmark used in this study with a distinct evaluation protocol for truthfulness and informativeness."
    402     },
    403     {
    404       "title": "Chain of thoughtlessness? An analysis of CoT in planning",
    405       "authors": ["Kaya Stechly", "Karthik Valmeekam", "Subbarao Kambhampati"],
    406       "year": 2024,
    407       "relevance": "Critiques limitations of CoT prompting, providing context for why CoT may not genuinely improve reasoning and may alter LLM internal states."
    408     },
    409     {
    410       "title": "Unsupervised real-time hallucination detection based on the internal states of large language models",
    411       "authors": ["Weihang Su", "Changyue Wang", "Qingyao Ai", "Yiran Hu", "Zhijing Wu", "Yujia Zhou", "Yiqun Liu"],
    412       "year": 2024,
    413       "relevance": "MIND framework for hallucination detection, evaluated as supplementary evidence that model-training-based methods also degrade under CoT."
    414     }
    415   ]
    416 }

Impressum · Datenschutz