scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21645B)
      1 {
      2   "paper": {
      3     "title": "CoT-RAG: Integrating Chain of Thought and Retrieval-Augmented Generation to Enhance Reasoning in Large Language Models",
      4     "authors": ["Feiyang Li", "Peng Fang", "Zhan Shi", "Arijit Khan", "Fang Wang", "Weihao Wang", "Xin Zhang", "Yongjian Cui"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2504.13534"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository URL provided: https://github.com/hustlfy123/CoT-RAG. The abstract and Appendix C state 'we have released the codebase, datasets, and manually designed decision trees.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses nine public benchmark datasets (AQuA, GSM8K, MultiArith, SingleEq, HotpotQA, CSQA, SIQA, Last Letter, Coin Flip) and four vertical domain datasets. The GitHub repo claims to include datasets and decision trees."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found in the paper. Only mentions using LLMs via API with temperature 0 and max tokens 1000."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions in the paper. The GitHub link is provided but the paper itself does not contain a reproducing results section or specific commands to run."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as single point estimates (accuracy percentages) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CoT-RAG outperforms baselines across all datasets but provides no statistical significance tests. Comparisons are made solely by comparing accuracy numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context, e.g., 'accuracy improvements from 4.0% to 44.3%' and per-baseline breakdowns like 'compared to Manual-CoT... the average accuracy across datasets increases by 4.0%-15.2%'."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for dataset sizes or number of examples used. The paper mentions adapting datasets using an LLM (Appendix H) but does not justify the sample sizes chosen."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures reported. The paper states temperature is set to 0 for deterministic outputs, but does not discuss variance across any dimension."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Extensive baselines included: 13 methods for general domains (Zero-shot, Zero-shot-CoT, Manual-CoT, Auto-CoT, Complex-CoT, PS, QDMRPS, KD-CoT, IRCoT, KG-CoT, Iter-CoT, ZEUS, Pattern-CoT) and 8 graph-form RAG methods for vertical domains."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent methods such as ZEUS (Kumar et al., 2025), Pattern-CoT (Zhang et al., 2025), ToG-2 (Ma et al., 2025), and PoG (Tan et al., 2025)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Appendix C.1 presents ablation study removing four components: node decomposition, RAG, PsePrompting, and expert inspection, tested on four datasets (Figure 4)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only accuracy is used as the evaluation metric across all experiments. No other metrics (F1, precision, recall, etc.) are reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of system outputs is conducted. All evaluation is via automated accuracy on benchmark datasets."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses standard public benchmark test sets (AQuA, GSM8K, etc.) for evaluation. These are established benchmarks with defined test splits."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per dataset across three reasoning categories (arithmetic, commonsense, symbolic) in Table 1, and per vertical domain dataset in Table 2."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No failure case analysis or error examples are discussed. The paper only shows where accuracy is lower but does not examine specific failure modes."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The ablation study shows accuracy drops when components are removed. The paper also notes that CoT-RAG has slightly higher runtime than simpler baselines (Manual-CoT, Zero-shot) and that CoT-RAG (Zero-expert) scores 7.8 percentage points lower than the expert version (87.4% vs 95.2% average accuracy)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'accuracy gains ranging from 4.0% to 44.3% over state-of-the-art methods' which is supported by Table 1 results. Claims about domain-specific datasets are supported by Table 2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies (Appendix C.1) where individual components are removed while others are held constant."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims to 'Enhance Reasoning in Large Language Models' broadly, but results are primarily on proprietary API-based LLMs (ERNIE, GPT-4o mini, GPT-4o, GLM-4-flash). The limitations section acknowledges this only partially, noting smaller LLMs are excluded. The paper does not bound claims to the specific models and datasets tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the observed improvements. The paper does not consider confounds such as whether the expert-provided decision trees simply encode the answer structure, or whether improvements come from additional context length rather than the framework design."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are referred to by marketing names only: 'GPT-4o mini', 'GPT-4o', 'ERNIE-Speed-128K', 'GLM-4-flash'. No API snapshot dates or version identifiers are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper provides pseudo-program prompt templates with actual fill values in the appendices (Tables 11-13 in Appendix E, and detailed prompts in Appendix F). The full prompt structures including sub-questions, sub-cases, and sub-descriptions are shown."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.2 states: 'we invoke LLMs that are not fine-tuned via API and set the temperature to 0 to ensure deterministic outputs' and 'we set the max tokens to 1000'. Number of demonstration examples is also specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The three-stage framework (KG-driven CoT Generation, Learnable Knowledge Case-aware RAG, Pseudo-Program Prompting Execution) is described in detail in Section 3, with algorithms in Appendix A."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 4.1 mentions 'following GraphRAG and Graph-CoT, we employ an LLM to adapt the datasets to satisfy our testing needs' but details are deferred to Appendix H. The adaptation process using an LLM to transform datasets raises questions about what transformations were applied."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 8 'Limitations' discusses two key limitations: reliance on proprietary LLMs with advanced capabilities and the influence of expert knowledge on decision tree construction."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section identifies specific threats: 'publicly available LLMs, especially those with a smaller scale (e.g., 7B or 13B parameters), fall short on these requirements' and 'the construction of decision trees is influenced by the expert's domain-specific knowledge and background.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. Limitations mention constraints but do not clearly bound what generalizations readers should not make from the results."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper uses publicly available benchmark datasets and claims to release code, datasets, and decision trees at the GitHub repository."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes the datasets used, their sources, and categories. Standard public benchmarks are referenced with citations."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; this is a benchmark evaluation study using standard datasets."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper mentions using an LLM to adapt datasets (Section 4.1: 'we employ an LLM to adapt the datasets to satisfy our testing needs') but does not clearly document the full pipeline from original datasets to final evaluation data, including what transformations were applied and how many examples were affected."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 9 'Acknowledgements' lists multiple funding sources: National Key Research and Development Program of China, NSFC grants, China Postdoctoral Science Foundation, and Novo Nordisk Foundation Grant."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed, including Huawei Technologies Co., Ltd for three authors (Weihao Wang, Xin Zhang, Yongjian Cui). The paper does not evaluate Huawei products specifically."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is from government research foundations (NSFC, China Postdoctoral Science Foundation) and Novo Nordisk Foundation, none of which have a direct financial stake in the outcomes of this reasoning framework evaluation."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement found in the paper. Three authors are affiliated with Huawei, but no conflicts of interest declaration is present."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses GPT-4o, GPT-4o mini, ERNIE, and GLM models on public benchmarks but does not state training data cutoff dates for any model."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether test examples (from benchmarks like GSM8K, AQuA, etc.) could have appeared in the training data of the LLMs used."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Benchmarks like GSM8K (2021), AQuA (2017), and MultiArith (2015) were published well before the training cutoffs of GPT-4o and other models used. No contamination discussion is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Appendix C.4 reports average runtime per question and token consumption across methods and datasets (Tables 8 and 9), including comparisons showing 29.2% runtime reduction and 33.4% token decrease vs GraphRAG."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, total API spend, or total GPU hours stated. Only per-question averages are reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoT-RAG achieves accuracy improvements ranging from 4.0% to 44.3% over state-of-the-art CoT methods on nine public datasets.",
    286       "evidence": "Table 1 shows results on ERNIE-Speed-128K across nine datasets. CoT-RAG achieves 89.1% average accuracy vs next best ZEUS at 77.7% (Section 5).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CoT-RAG increases accuracy by 8.9% to 80.6% on four vertical domain datasets compared to graph-form LLM-based RAG methods.",
    291       "evidence": "Table 2 shows CoT-RAG at 95.2% average accuracy on GPT-4o mini vs the next-best results (CoT-RAG IndexIVFPQ at 87.7% and RoG at 83.7%).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Node decomposition is the most critical component of CoT-RAG.",
    296       "evidence": "Ablation study (Appendix C.1, Figure 4) shows removing node decomposition causes the largest accuracy drop across all four tested datasets.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Expert involvement is indispensable for vertical domain applications, with CoT-RAG (Zero-expert) achieving 7.8% lower accuracy.",
    301       "evidence": "Table 2 shows CoT-RAG (Zero-expert) at 87.4% average vs CoT-RAG at 95.2%.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CoT-RAG achieves competitive runtime with 29.2% reduction in average runtime and 33.4% decrease in average token consumption compared to GraphRAG.",
    306       "evidence": "Appendix C.4 (Tables 8 and 9) reports average runtime per question and token consumption across methods and datasets.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CoT-RAG integrates knowledge graph-driven CoT generation, case-aware RAG, and pseudo-program prompting to improve LLM reasoning. On nine public benchmarks across arithmetic, commonsense, and symbolic reasoning, it achieves 89.1% average accuracy (ERNIE-Speed-128K), outperforming 13 baselines by 4.0-44.3%. On four vertical domain datasets (law, finance, logic), it achieves 95.2% average accuracy with GPT-4o mini. Ablation shows node decomposition is the most impactful component, and expert-crafted decision trees provide a 7.8-percentage-point average-accuracy advantage (95.2% vs 87.4%) over LLM-generated ones.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical significance testing",
    315       "detail": "All comparisons are based on raw accuracy numbers with no significance tests, confidence intervals, or variance measures. With temperature set to 0, determinism is claimed, but this does not address sampling variability from dataset composition."
    316     },
    317     {
    318       "flag": "Single metric evaluation",
    319       "detail": "Only accuracy is used across all experiments. No secondary metrics (F1, precision, recall, calibration) are reported, making it impossible to understand performance characteristics beyond aggregate correctness."
    320     },
    321     {
    322       "flag": "Benchmark contamination risk unaddressed",
    323       "detail": "Several benchmarks (AQuA 2017, MultiArith 2015, GSM8K 2021) were published well before the training cutoffs of GPT-4o and the other models used, so their test examples may appear in the models' training data. No contamination analysis is provided, yet the framework's improvements are measured on these potentially contaminated benchmarks."
    324     },
    325     {
    326       "flag": "Dataset adaptation via LLM inadequately documented",
    327       "detail": "The paper states datasets were adapted using an LLM to 'satisfy testing needs' but the transformation process is not transparently documented in the main text. This could introduce systematic bias favoring the proposed method."
    328     },
    329     {
    330       "flag": "Suspiciously large improvements on some benchmarks",
    331       "detail": "CoT-RAG achieves 98.4% on HotpotQA and 98.7% on SIQA with ERNIE-Speed-128K, while baselines top out around 87-90%. These near-perfect scores on challenging benchmarks warrant scrutiny, especially given the LLM-adapted datasets."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    337       "authors": ["Jason Wei"],
    338       "year": 2022,
    339       "relevance": "Foundational CoT prompting paper that this work builds upon and compares against."
    340     },
    341     {
    342       "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization",
    343       "authors": ["Darren Edge"],
    344       "year": 2024,
    345       "arxiv_id": "2404.16130",
    346       "relevance": "GraphRAG baseline combining RAG with knowledge graphs for question answering."
    347     },
    348     {
    349       "title": "Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks",
    350       "authors": ["Wenhu Chen"],
    351       "year": 2023,
    352       "relevance": "Key baseline using code prompts for reasoning, motivating CoT-RAG's pseudo-program approach."
    353     },
    354     {
    355       "title": "Navigate through Enigmatic Labyrinth: A Survey of Chain of Thought Reasoning",
    356       "authors": ["Zheng Chu"],
    357       "year": 2024,
    358       "relevance": "Comprehensive survey of CoT reasoning advances and limitations in LLMs."
    359     },
    360     {
    361       "title": "Reasoning on Graphs: Faithful and Interpretable Large Language Model Reasoning",
    362       "authors": ["Linhao Luo"],
    363       "year": 2024,
    364       "relevance": "RoG baseline combining KG-grounded reasoning paths with LLMs."
    365     },
    366     {
    367       "title": "Think-on-Graph 2.0: Deep and Faithful Large Language Model Reasoning with Knowledge-Guided Retrieval Augmented Generation",
    368       "authors": ["Shengjie Ma"],
    369       "year": 2025,
    370       "relevance": "ToG-2 baseline using KGs to connect documents for enhanced LLM reasoning."
    371     },
    372     {
    373       "title": "Large Language Models are Zero-Shot Reasoners",
    374       "authors": ["Takeshi Kojima"],
    375       "year": 2022,
    376       "relevance": "Zero-shot-CoT baseline demonstrating LLM reasoning with minimal prompting."
    377     },
    378     {
    379       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    380       "authors": ["Patrick Lewis"],
    381       "year": 2020,
    382       "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm."
    383     },
    384     {
    385       "title": "Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs",
    386       "authors": ["Bowen Jin"],
    387       "year": 2024,
    388       "relevance": "Graph-CoT baseline for iterative graph-based LLM reasoning."
    389     },
    390     {
    391       "title": "Enhancing Zero-Shot Chain of Thought Prompting via Uncertainty-Guided Strategy Selection",
    392       "authors": ["Shanu Kumar"],
    393       "year": 2025,
    394       "relevance": "ZEUS baseline improving CoT with uncertainty-guided demonstration selection."
    395     }
    396   ]
    397 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs