ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (17715B)


      1 {
      2   "paper": {
      3     "title": "A Comprehensive Survey on Trustworthiness in Reasoning with Large Language Models",
      4     "authors": ["Yanbo Wang", "Yongcan Yu", "Jian Liang", "Ran He"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2509.03871"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a GitHub repository at https://github.com/ybwang119/Awesome-reasoning-safety for the full list of related papers."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No structured dataset or extracted data tables are released. The GitHub repo appears to be a paper list, not analysis data."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a survey paper with no experiments requiring an environment setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No description of how the survey methodology could be reproduced — no search queries, databases searched, or systematic review protocol."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Survey paper with no original experiments or statistical aggregation."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No statistical tests performed; this is a narrative literature review."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No meta-analysis or quantitative synthesis performed."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper; no experimental sample sizes."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No original experimental runs conducted."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against prior surveys on LLM safety [4,5,6] and reasoning [7,8], noting gaps in trustworthiness coverage that motivate this work."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Prior surveys cited are from 2024-2025 and are contemporary to this work."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Survey paper; no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper; no metrics applied."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper; no system outputs to evaluate."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper; no test set."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The survey provides a detailed taxonomy (Figure 2) with per-category breakdowns across truthfulness, safety, robustness, fairness, and privacy, with subcategories within each."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The survey discusses failures and vulnerabilities extensively — e.g., reasoning models hallucinating more than non-reasoning models, safety vulnerabilities of DeepSeek-R1, contradictory findings across studies."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Multiple negative findings are reported: reasoning models can increase hallucination (§3.1.2), CoT does not necessarily improve safety (§4.1), reasoning models are more vulnerable to backdoor attacks (§4.4)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that reasoning models 'often suffer from comparable or even greater vulnerabilities in safety, robustness, and privacy' — this is supported by extensive evidence reviewed in §4.1, §5, and §7."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The survey does not make its own causal claims; it reports findings from other papers."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper notes that 'pairwise safety ranks between models depend on datasets' (§4.1) and that contradictory findings arise from different evaluation methods, appropriately bounding generalizations."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "As a pure survey/taxonomy paper with no original empirical results, alternative explanations for its own findings are not applicable."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "Survey paper; no models used for experiments."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "Survey paper; no prompting performed."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper; no experiments."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper; no agentic scaffolding used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No systematic review protocol is documented. The paper does not describe search queries, databases, inclusion/exclusion criteria, or filtering pipeline for selecting the surveyed papers. It simply states it 'considers papers published up to June 30, 2025.'"
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations section exists. The paper has a 'Future Research Directions' section (§8) but does not discuss limitations of the survey itself."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed for the survey's own methodology or coverage."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper states it focuses on 'five core dimensions of trustworthy reasoning: truthfulness, safety, robustness, fairness, and privacy' and considers 'papers published up to June 30, 2025.' The scope is narrowed to CoT techniques and reasoning models specifically."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A GitHub repository (https://github.com/ybwang119/Awesome-reasoning-safety) with the full paper list is provided, allowing verification of coverage."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No description of how papers were discovered — no search terms, databases, or systematic collection procedure described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data source is published papers."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No documentation of how papers were selected, filtered, or categorized into the taxonomy. The classification methodology is not described."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: University of Chinese Academy of Sciences and Institute of Automation, CAS."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "Survey paper; no model evaluation on benchmarks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Survey paper; no model evaluation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper; no benchmarks used."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this survey."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey paper; no method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey paper; no computation performed."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Reasoning models often suffer from comparable or even greater vulnerabilities in safety, robustness, and privacy compared to non-reasoning models.",
    286       "evidence": "Supported by extensive review of vulnerability assessments in §4.1 (SafeChain, CNSafe evaluations showing high ASR for DeepSeek-R1), robustness issues in §5.2, and privacy leakage in §7.2.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Reasoning models can produce more significant hallucinations than non-reasoning models in simpler, non-reasoning tasks.",
    291       "evidence": "§3.1.2 cites multiple studies [62, 63, 64, 65, 225, 66] documenting higher hallucination rates in reasoning models on TruthfulQA, HaluEval, SimpleQA, and visual tasks.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The thinking process of LRMs may increase the harmfulness of generated content.",
    296       "evidence": "§4.1 reports SafeChain experiments showing that forcing models to skip or shorten reasoning boosts harmlessness, and that thinking content has consistently lower safety rates than final answers.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Reasoning techniques hold promise for enhancing model trustworthiness through hallucination mitigation, harmful content detection, and robustness improvement.",
    301       "evidence": "§3.1.1 reviews HaluSearch, CLATTER, and other methods using CoT for hallucination detection; §4.2.3 reviews GuardReasoner and similar guardrail models leveraging reasoning.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CoT prompting is helpful but insufficient to fully solve dialect bias and fairness issues.",
    306       "evidence": "§6 cites Lin et al. [206] showing CoT prompting mitigates but does not eliminate AAVE dialect bias, and Cantini et al. [212] finding models with explicit reasoning are more vulnerable to bias.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["meta-analysis"],
    311   "key_findings": "This survey reviews trustworthiness of LLM reasoning across five dimensions: truthfulness, safety, robustness, fairness, and privacy. Key findings include that reasoning models often hallucinate more than non-reasoning models on simple tasks, that the reasoning process itself can increase harmful content generation, and that current open-source reasoning models remain vulnerable to jailbreak attacks. The survey also finds that while CoT techniques can be leveraged for safety defenses (guardrails, alignment), reasoning capability simultaneously expands the attack surface.",
    312   "red_flags": [
    313     {
    314       "flag": "No systematic review methodology",
    315       "detail": "The paper does not describe search queries, databases, inclusion/exclusion criteria, or any systematic procedure for paper selection. It is unclear how comprehensively or consistently papers were identified, which undermines the 'comprehensive survey' claim."
    316     },
    317     {
    318       "flag": "No quality assessment of reviewed papers",
    319       "detail": "The survey summarizes findings from reviewed papers without any structured quality assessment of the primary studies. Claims from weak or preliminary studies are presented alongside well-established findings without distinction."
    320     },
    321     {
    322       "flag": "No limitations discussion for the survey itself",
    323       "detail": "The paper discusses future research directions but does not acknowledge any limitations of its own methodology, coverage, or potential biases in paper selection."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "SafeChain: Safety of Language Models with Long Chain-of-Thought Reasoning Capabilities",
    329       "authors": ["Fengqing Jiang", "Zhangchen Xu", "Yuetai Li"],
    330       "year": 2025,
    331       "arxiv_id": "2502.12025",
    332       "relevance": "Evaluates safety vulnerabilities in reasoning models with long CoT, directly relevant to LLM safety assessment."
    333     },
    334     {
    335       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    336       "authors": ["Daya Guo", "Dejian Yang"],
    337       "year": 2025,
    338       "arxiv_id": "2501.12948",
    339       "relevance": "Major open-source reasoning model whose safety and reliability properties are extensively discussed."
    340     },
    341     {
    342       "title": "Measuring Faithfulness in Chain-of-Thought Reasoning",
    343       "authors": ["Tamera Lanham", "Anna Chen"],
    344       "year": 2023,
    345       "arxiv_id": "2307.13702",
    346       "relevance": "Foundational work on measuring whether CoT reasoning is faithful to the model's actual decision process."
    347     },
    348     {
    349       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    350       "authors": ["Jason Wei", "Xuezhi Wang"],
    351       "year": 2022,
    352       "relevance": "Seminal CoT prompting paper foundational to the entire reasoning paradigm discussed in this survey."
    353     },
    354     {
    355       "title": "Alignment Faking in Large Language Models",
    356       "authors": ["Ryan Greenblatt", "Evan Hubinger"],
    357       "year": 2024,
    358       "relevance": "Related to safety alignment concerns discussed in the survey's treatment of reasoning model alignment."
    359     },
    360     {
    361       "title": "GuardReasoner: Towards Reasoning-based LLM Safeguards",
    362       "authors": [],
    363       "year": 2025,
    364       "relevance": "Guardrail model using reasoning techniques for safety content detection, relevant to AI safety tooling."
    365     },
    366     {
    367       "title": "Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting",
    368       "authors": ["Miles Turpin", "Julian Michael", "Ethan Perez", "Samuel Bowman"],
    369       "year": 2023,
    370       "relevance": "Demonstrates unfaithfulness in CoT reasoning, key concern for trustworthy AI evaluation."
    371     },
    372     {
    373       "title": "AbstentionBench: Reasoning LLMs Fail on Unanswerable Questions",
    374       "authors": ["Polina Kirichenko", "Mark Ibrahim"],
    375       "year": 2025,
    376       "arxiv_id": "2506.09038",
    377       "relevance": "Benchmark evaluating reasoning model behavior on unanswerable questions, relevant to robustness and hallucination."
    378     },
    379     {
    380       "title": "Safety Reasoning with Guidelines",
    381       "authors": ["Haoyu Wang", "Zeyu Qin"],
    382       "year": 2025,
    383       "relevance": "Proposes safety reasoning guidelines and evaluates test-time scaling for safety, directly relevant to LLM safety methodology."
    384     },
    385     {
    386       "title": "Deliberative Alignment: Reasoning Enables Safer Language Models",
    387       "authors": [],
    388       "year": 2025,
    389       "relevance": "First method to align reasoning models with curated CoT data, foundational to reasoning model safety alignment."
    390     }
    391   ]
    392 }

Impressum · Datenschutz