scan.json (26075B)
1 { 2 "paper": { 3 "title": "Beyond Correctness: Rewarding Faithful Reasoning in Retrieval-Augmented Generation", 4 "authors": ["Zhichao Xu", "Zongyu Wu", "Yun Zhou", "Aosong Feng", "Kang Zhou", "Sangmin Woo", "Kiran Ramnath", "Yijun Tian", "Xuan Qi", "Weikang Qiu", "Lin Lee Cheong", "Haibo Ding"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.13272", 8 "doi": "10.48550/arXiv.2510.13272" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No repository URL, GitHub link, or Zenodo archive is provided in the paper. The paper builds on the Search-R1 framework but does not release its own code for VERITAS." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper uses publicly available QA benchmarks (NQ, TriviaQA, PopQA, HotpotQA, 2WikiMultihopQA, MuSiQue, Bamboogle) processed through FlashRAG. Dataset statistics and licenses are provided in Appendix Table 7." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions using 32x NVIDIA A100 GPUs and specifies some hyperparameters (learning rate 1e-6, batch size 256) and model names but does not provide a requirements.txt, Dockerfile, or detailed dependency listing sufficient to recreate the environment." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the methodology but does not give instructions sufficient for a researcher to reproduce the experiments without guessing." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results in Tables 1, 2, 3, and Figure 3 are reported as single point estimates without confidence intervals, error bars, or ± notation." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims VERITAS-R1 outperforms baselines (e.g., 'improves Information-Think faithfulness by around 14%') but no statistical significance tests (p-values, t-tests, etc.) are reported for any comparison." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports improvements with baseline context, e.g., 'Information-Think faithfulness increases from 0.734 for the baseline Search-R1 to 0.853 for VERITAS-R1' and 'improves the average EM score from 0.361 to 0.380'. Tables provide absolute scores for all methods, enabling effect size computation." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification is given for the number of evaluation examples used. The human evaluation uses only 50 samples with no power analysis or justification for this sample size." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No standard deviations, variance, or spread measures across runs are reported. The paper acknowledges non-determinism in LLM inference (citing Chann 2023) but does not report variance across seeds or runs." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "Table 2 includes a comprehensive set of baselines: Direct Inference, CoT, IRCoT, Search-o1, RAG, SFT, R1-base, R1-instruct, Rejection Sampling, DeSA (3B and 7B), ReSearch (7B Base and Instruct), and Search-R1 variants." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The baselines include contemporary RL-based agentic search methods: Search-R1 (2025), ReSearch (2025), DeSA (2025), and Search-o1 (2025), which represent the current state of the art." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table 3 and Figure 3 show ablation over reward components: EM only (Search-R1 baseline), EM+Info-Think, EM+Think-Ans, and EM+Info-Think+Think-Ans. Figure 4 provides hyperparameter sensitivity analysis for reward weights." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper evaluates on four metrics: Exact Match (EM) for task performance, plus three faithfulness metrics (Information-Think, Think-Search, Think-Answer faithfulness). Results are reported across all four dimensions in Table 3." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 4 and Table 10 report human evaluation: 'An author of this paper annotated 50 samples and compare against Claude-3.7-Sonnet, Claude-4.5-Sonnet and our trained reward model.' This evaluates the reward model's alignment with human judgments." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper uses separate in-domain (NQ, HotpotQA) and out-of-domain datasets (TriviaQA, PopQA, 2Wiki, Musique, Bamboogle), explicitly marked with † and * in Table 2. The RM training uses a 24K/3K train/eval split from the 27K labeled instances." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 3 provides per-dataset breakdowns across all seven QA benchmarks, and results are also broken down by 'General QA' vs 'Multi-Hop QA' categories in Figure 3." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 3.4 discusses failure cases of Search-R1-7B-Base-PPO-Format, providing qualitative examples of unfaithful reasoning traces. Section 5.2 discusses where Think-Answer reward is not stable: 'the improvement of think-answer faithfulness is not stable on general QA datasets.'" 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that adding Rthink-answer 'does not reliably improve its corresponding metric and can sometimes be detrimental' (Section 5.2). It also reports that directly applying faithfulness rewards 'slightly reduced task performance' before curriculum learning was introduced (Section 4)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims VERITAS 'significantly improve reasoning faithfulness' (supported by Table 3, ~14% Info-Think improvement) and 'achieves better task performance compared to the baselines' (supported by Table 2, average EM 0.447 vs 0.431). These are borne out in the results." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims via ablation studies (removing/adding reward components in Table 3 and Figure 4). The controlled comparison with Search-R1 (same model, hyperparameters, and data, differing only in reward function) provides adequate support for the causal claim that process rewards improve faithfulness." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title and abstract generalize to 'Retrieval-Augmented Generation' broadly, but experiments are limited to a single base model (Qwen2.5-7B-Base), open-domain QA tasks only, and Wikipedia-based retrieval. The Limitations section acknowledges domain limitations but the title and claims are broader than what was tested." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not substantively discuss alternative explanations for the observed improvements. For example, it does not consider whether the improvement comes from curriculum learning alone rather than the faithfulness rewards, or whether the distilled reward model introduces systematic biases that happen to correlate with task performance." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper specifies exact model names: Qwen2.5-7B-Base as policy model, Qwen2.5-14B-Instruct as reward model, E5-base-v2 as retriever, google/t5_xxl_true_nli_mixture for NLI, Claude-3.7-Sonnet and Claude-4.5-Sonnet (Claude Sonnet-4.5) as judge models." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Appendix Tables 4, 5, and 6 provide the full prompt templates for Search-R1 (Table 4), Think-Search faithfulness evaluation (Table 5), and Information-Think faithfulness evaluation (Table 6)." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 5.1 reports key hyperparameters: learning rate (1e-6), batch size (256), reward weights (wEM=1.0, winfo-think=0.05, wthink-ans=0.02), and curriculum learning schedule (T1 at end of first epoch, warmup for 0.5 epoch). LoRA is used for reward model training." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section 3.1 describes the agentic search trajectory structure in detail: <think>, <search>, <information>, <answer> tags, the multi-turn interaction loop, and Figure 2 provides a pipeline overview of the VERITAS-R1 training framework with PPO." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4 describes the data pipeline: 27,000 samples subsampled from NQ and HotpotQA training splits, labeled by Claude-3.7-Sonnet, split into 24K training and 3K evaluation. The paper uses FlashRAG-processed datasets (Appendix D) with statistics in Table 7." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "A dedicated 'Limitations' section appears on page 12, discussing three specific limitations: reliance on LLM-as-a-Judge, regex-based Think-Answer metric limitations, and domain-specific generalizability concerns." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "The Limitations section discusses specific threats: (1) the reward model is 'inherently subject to the biases and potential errors of the judge model', (2) the Sub-EM metric 'can fail to recognize legitimate paraphrasing', and (3) experiments on open-domain QA may not transfer to 'enterprise search or medical QA, where the nature of evidence and reasoning can be substantially different.'" 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "While the Limitations section mentions domain-specific concerns generically, it does not explicitly state what the results do NOT show. For example, it does not explicitly state that results are limited to a single base model size (7B), a single RL algorithm (PPO), or Wikipedia-only retrieval corpora." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "The raw training trajectories, reward model labels, and evaluation outputs are not made available. Only aggregated metrics are reported." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 4 describes the data collection: 27K samples subsampled from NQ and HotpotQA, labeled by Claude-3.7-Sonnet for Information-Think faithfulness. Section F details preliminary studies with 8K instances and the scaling to 27K. The evaluation datasets and their sources are documented in Table 7." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited for the study. The human evaluation was conducted by an author of the paper (50 samples), and no external participants were involved." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline is documented: Search-R1 generates trajectories → subsample 27K instances → Claude-3.7-Sonnet labels for faithfulness → 24K/3K train/eval split → LoRA fine-tuning of Qwen2.5-14B-Instruct. The evaluation pipeline (NLI + LLMaaJ + Sub-EM) is described in Section 3.3." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding source or acknowledgments section mentioning grants or sponsors is found in the paper." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: AWS AI Fundamental Research (primary), The Pennsylvania State University, and Yale University. The paper header lists all affiliations." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "The primary affiliation is AWS AI Fundamental Research (Amazon). Amazon has a commercial interest in RAG and search agent quality. The paper does not disclose whether Amazon funded the research or whether Amazon has a stake in the outcomes." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement, patent disclosures, or financial interests declaration is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses Qwen2.5-7B-Base as the policy model but does not state its training data cutoff date. This is relevant because the model could have seen the QA benchmarks during pre-training." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of potential overlap between Qwen2.5-7B-Base's pre-training data and the test sets of NQ, TriviaQA, PopQA, HotpotQA, etc. These are well-known public benchmarks that could be in the training data." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "All seven QA benchmarks were published before 2024. Qwen2.5 was trained on data up to 2024. No discussion of potential contamination is provided despite this being a realistic risk." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "The human evaluation involves a single author annotating 50 samples for reward model validation, not a human subjects study requiring pre-registration." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants were recruited. The human evaluation was conducted by an author, not external participants." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants were recruited beyond a single author conducting the evaluation." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No external human participants; the evaluation was done by an author." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "Not a human subjects experimental study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "Not a human subjects experimental study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "Not a human subjects study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper acknowledges that LLMaaJ is costly ('cost and latency make it impractical') but does not report actual inference costs, API costs, tokens consumed, or wall-clock time for any experiments." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "The paper states '32x NVIDIA A100 GPUs' were used but does not report total GPU hours, training time, or total compute budget." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Models trained with pure outcome-based reward (Search-R1, ReSearch, DeSA) have significant room for improvement in reasoning faithfulness, particularly Information-Think faithfulness.", 287 "evidence": "Figure 1 and Table 1 show Information-Think faithfulness scores as low as 0.564 (Search-R1 on HotpotQA) and 0.081 (DeSA-3B on HotpotQA). Section 3.4 provides detailed analysis.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "VERITAS-R1 improves Information-Think faithfulness by around 14% over Search-R1 baseline.", 292 "evidence": "Table 3 shows Info-Think faithfulness increasing from 0.608 (Search-R1 baseline average) to 0.697 (VERITAS Info-Think average), approximately a 14.6% relative improvement.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "VERITAS-R1 achieves better task performance (Exact Match) compared to baselines trained with pure outcome-based reward.", 297 "evidence": "Table 2 shows VERITAS-R1 (EM+Info-Think) average EM of 0.447 vs Search-R1-7B-Base-PPO at 0.431, a modest 3.7% relative improvement. Table 3 confirms this with per-dataset breakdowns.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "The distilled reward model (LoRA-finetuned Qwen2.5-14B-Instruct) achieves high agreement with both Claude-3.7-Sonnet labels and human judgments.", 302 "evidence": "Table 9 reports consistency ratio of 0.899 and Cohen's kappa of 0.797 with Claude-3.7-Sonnet. Table 10 shows 0.920 consistency and 0.839 kappa with human annotations on 50 samples.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "Curriculum learning is needed to prevent reward hacking when introducing faithfulness rewards.", 307 "evidence": "Section 4 states: 'directly applying the faithfulness rewards... substantially improved faithfulness metrics but slightly reduced task performance compared to Search-R1.' No quantitative comparison between curriculum and non-curriculum training is provided.", 308 "supported": "weak" 309 }, 310 { 311 "claim": "The Think-Answer reward has a less stable and sometimes detrimental effect compared to the Information-Think reward.", 312 "evidence": "Section 5.2 and Figure 3 discuss this: 'adding Rthink-answer does not reliably improve its corresponding metric and can sometimes be detrimental.' Table 3 shows VERITAS Think-Ans has lower average faithfulness gains than VERITAS Info-Think.", 313 "supported": "moderate" 314 } 315 ], 316 "methodology_tags": ["benchmark-eval"], 317 "key_findings": "The paper demonstrates that RL-based agentic search models (Search-R1, ReSearch, DeSA) trained with only outcome-based rewards produce unfaithful reasoning chains despite achieving good task accuracy. The proposed VERITAS framework, which incorporates process-based faithfulness rewards (Information-Think and Think-Answer) into the RL training loop, improves Information-Think faithfulness by ~14% while also modestly improving task performance (average EM from 0.431 to 0.447 across seven QA benchmarks). A distilled reward model (Qwen2.5-14B-Instruct) achieves 0.899 consistency with Claude-3.7-Sonnet labels, making scalable process supervision practical.", 318 "red_flags": [ 319 { 320 "flag": "No variance or error bars reported", 321 "detail": "All results are reported as single point estimates across Tables 1, 2, 3 and all figures. The paper even acknowledges non-determinism in LLM inference but does not report multiple runs or standard deviations. This makes it impossible to assess whether differences are meaningful or within noise." 322 }, 323 { 324 "flag": "Small human evaluation sample", 325 "detail": "The human evaluation used only 50 samples annotated by a single author. This is a very small sample for validating a reward model, and single-annotator evaluation cannot assess inter-annotator agreement." 326 }, 327 { 328 "flag": "Company evaluating its own approach without conflict disclosure", 329 "detail": "The research is primarily from AWS AI Fundamental Research (Amazon). Amazon has commercial interest in RAG systems. No conflict of interest statement, funding disclosure, or competing interests declaration is provided." 330 }, 331 { 332 "flag": "No contamination analysis", 333 "detail": "All seven QA benchmarks (NQ, TriviaQA, PopQA, HotpotQA, 2Wiki, MuSiQue, Bamboogle) were published before Qwen2.5's training cutoff. No analysis of potential benchmark contamination is provided." 334 }, 335 { 336 "flag": "Modest task performance improvement", 337 "detail": "While faithfulness improvements are substantial, the task performance (EM) improvement is modest: average EM increases from 0.431 to 0.447 (3.7% relative). Without significance tests or variance, it is unclear whether this difference is reliable." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning", 343 "authors": ["Bowen Jin", "Hansi Zeng", "Zhenrui Yue", "Jinsung Yoon", "Sercan Arik", "Dong Wang", "Hamed Zamani", "Jiawei Han"], 344 "year": 2025, 345 "arxiv_id": "2503.09516", 346 "relevance": "Primary baseline and framework on which VERITAS is built; exemplifies outcome-based RL for agentic search." 347 }, 348 { 349 "title": "Learning to Reason with Search for LLMs via Reinforcement Learning (ReSearch)", 350 "authors": ["Mingyang Chen", "Linzhuang Sun", "Tianpeng Li"], 351 "year": 2025, 352 "arxiv_id": "2503.19470", 353 "relevance": "Key baseline agentic search model evaluated for faithfulness; represents RL-based RAG approach." 354 }, 355 { 356 "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation", 357 "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"], 358 "year": 2025, 359 "arxiv_id": "2503.11926", 360 "relevance": "Directly relevant to AI safety: shows reasoning models can produce unfaithful CoT, motivating this work." 361 }, 362 { 363 "title": "Measuring faithfulness in chain-of-thought reasoning", 364 "authors": ["Tamera Lanham", "Anna Chen", "Ansh Radhakrishnan"], 365 "year": 2023, 366 "arxiv_id": "2307.13702", 367 "relevance": "Foundational work on CoT faithfulness evaluation that motivates the faithfulness framework in this paper." 368 }, 369 { 370 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 371 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 372 "year": 2025, 373 "arxiv_id": "2501.12948", 374 "relevance": "Key paper on RL-based reasoning in LLMs using GRPO; inspiration for agentic search approaches." 375 }, 376 { 377 "title": "Language models don't always say what they think: Unfaithful explanations in chain-of-thought prompting", 378 "authors": ["Miles Turpin", "Julian Michael", "Ethan Perez", "Samuel Bowman"], 379 "year": 2023, 380 "relevance": "Demonstrates that LLM chain-of-thought reasoning can be unfaithful, core motivation for VERITAS." 381 }, 382 { 383 "title": "Let's verify step by step", 384 "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yuri Burda"], 385 "year": 2023, 386 "relevance": "Foundational work on process reward models for step-by-step verification, directly related to VERITAS's process supervision approach." 387 }, 388 { 389 "title": "Chain-of-thought unfaithfulness as disguised accuracy", 390 "authors": ["Oliver Bentham", "Nathan Stringham", "Ana Marasovic"], 391 "year": 2024, 392 "relevance": "Shows outcome-based training produces unfaithful reasoning traces; motivates process-based rewards." 393 }, 394 { 395 "title": "Search and Refine during Think: Autonomous Retrieval-Augmented Reasoning of LLMs (AutoRefine)", 396 "authors": ["Yaorui Shi", "Sihang Li", "Chang Wu"], 397 "year": 2025, 398 "arxiv_id": "2505.11277", 399 "relevance": "Introduces retrieval-specific reward for evidence utilization, conceptually related to faithfulness rewards." 400 }, 401 { 402 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 403 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 404 "year": 2020, 405 "relevance": "Foundational RAG paper establishing the retrieve-then-generate paradigm that VERITAS builds upon." 406 }, 407 { 408 "title": "Are DeepSeek R1 and other reasoning models more faithful?", 409 "authors": ["James Chua", "Owain Evans"], 410 "year": 2025, 411 "arxiv_id": "2501.08156", 412 "relevance": "Evaluates faithfulness of reasoning models, directly relevant to the survey's assessment of reasoning quality." 413 }, 414 { 415 "title": "A comprehensive survey on reinforcement learning-based agentic search", 416 "authors": ["Minhua Lin", "Zongyu Wu", "Zhichao Xu"], 417 "year": 2025, 418 "arxiv_id": "2510.16724", 419 "relevance": "Survey of RL-based agentic search covering foundations, roles, optimizations and evaluations in the agentic AI space." 420 } 421 ] 422 }