scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29577B)
      1 {
      2   "paper": {
      3     "title": "SEAKR: Self-aware Knowledge Retrieval for Adaptive Retrieval Augmented Generation",
      4     "authors": [
      5       "Zijun Yao",
      6       "Weijian Qi",
      7       "Liangming Pan",
      8       "Shulin Cao",
      9       "Linmei Hu",
     10       "Weichuan Liu",
     11       "Lei Hou",
     12       "Juanzi Li"
     13     ],
     14     "year": 2024,
     15     "venue": "Annual Meeting of the Association for Computational Linguistics",
     16     "arxiv_id": "2406.19215",
     17     "doi": "10.48550/arXiv.2406.19215"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "SEAKR achieves 36.0%, 39.7%, and 23.5% F1 on 2WikiMultiHop, HotpotQA, and IIRC complex QA benchmarks using LLaMA-2-chat 7B, outperforming the best baseline (DRAGIN) by up to 6.0 percentage points. Ablation study reveals that self-aware re-ranking (knowledge integration) contributes more performance gain than self-aware retrieval (deciding when to retrieve), highlighting the importance of adaptive knowledge integration in RAG. The method leverages Gram determinant of internal hidden representations as an uncertainty measure, outperforming output-level alternatives (perplexity, prompting, energy scores), and scales positively from LLaMA-2 (7B) to LLaMA-3 (8B).",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The abstract states 'We release our code in our Github repository' but no URL or repository link is provided anywhere in the paper text or appendices."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "All six datasets (2WikiMultiHopQA, HotpotQA, IIRC, NaturalQuestions, TriviaQA, SQuAD) are publicly available standard benchmarks. The Wikipedia dump (December 20, 2018) is also public."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Appendix C states vLLM 0.4.2, PyTorch 2.3.0, Elasticsearch 7.17.9. Limitations section specifies NVidia 3090 GPU with 24GiB GRAM. LLaMA-2-chat 7B is the backbone."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided in the paper. The paper describes the method but not how to execute the experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 1-4 report only point estimates (e.g., '36.0% F1') with no confidence intervals, error bars, or ± notation."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims 'SEAKR outperforms existing adaptive RAG methods' based solely on comparing point estimates across tables. No statistical significance tests (p-values, t-tests, bootstrap, etc.) are reported."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 4.2.1 reports improvements with baseline context: 'outperforms the best baselines by 6.0%, 5.5%, and 0.6%' on the three complex QA datasets, with all baseline numbers shown in Table 1."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Standard benchmark test/dev splits are used without justification. For analysis experiments, Section 5 states 'sample 500 questions from each dataset to reduce the cost' but no power analysis or formal sample size justification is provided."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be single-run point estimates."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 4.1.2 lists five baselines: CoT, IRCoT, Self-RAG, FLARE, and DRAGIN, covering both non-adaptive and adaptive RAG methods."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include DRAGIN (2024), Self-RAG (2023), FLARE (2023), and IRCoT (2022), which are the most recent adaptive RAG methods at the time of submission."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 5.1 provides extensive ablation: removing self-aware retrieval, self-aware re-ranking, self-aware reasoning, and substituting alternative uncertainty estimators (prompting, perplexity, multi-perplexity, LN-entropy, energy). Results in Table 3."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "All experiments report both Exact Match (EM) and F1 score across all six datasets (Tables 1 and 2)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation is conducted. All evaluation is automated via EM and F1 metrics. Human evaluation of reasoning chain quality could have added value."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4.1.1 states hyperparameters are searched on NQ's training split, while results are reported on separate official development sets (complex QA following IRCoT, simple QA following DPR). The evaluation sets are not used for tuning."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down across all six datasets individually (3 complex QA in Table 1, 3 simple QA in Table 2), and ablation results are shown per-dataset in Table 3."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 5.4 and Table 5 show a case study where the LLM generates an incorrect pseudo-generation (wrong birthday for Jodorowsky) but SEAKR correctly identifies high uncertainty and retrieves corrective knowledge. Additional cases in Appendix A show the system's behavior."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 4.2.1 acknowledges limited improvement on IIRC due to numerical reasoning difficulty. The ablation study (Table 3) shows degradation when components are removed. Section 4.2.2 notes SEAKR lags behind Self-RAG on NQ."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract claims 'SEAKR outperforms existing adaptive RAG methods' on 'both complex and simple Question Answering datasets.' However, on NaturalQuestions (simple QA), Self-RAG achieves 40.2 F1 vs SEAKR's 35.5 F1 (Table 2), contradicting the claim."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims are made through controlled ablation studies (Table 3) that remove individual components (self-aware retrieval, re-ranking, reasoning) while holding other components fixed. This controlled single-variable manipulation adequately supports causal attribution."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The Limitations section explicitly bounds scope: only open-source LLMs (requires internal states), only short-form QA (not long-form or creative writing), only models up to 8B parameters. These specific boundaries are stated."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper does not discuss whether the ~20x additional compute (20 pseudo-generations per step) could explain performance gains vs baselines. No confounds are discussed. The limitations section discusses computational cost but not as an alternative explanation for results."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures EM and F1 on QA benchmarks and claims QA performance improvements. The measurements match the granularity of the claims without broader framing (no claims about 'intelligence' or 'understanding')."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.1.3 specifies 'LLaMA-2-chat with 7 billion parameters' and Table 4 lists 'LLaMA-3 with 8B Parameters' with base vs chat/instruct versions. These are specific, fixed-weight open-source model releases."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix B provides full prompt templates for self-aware retrieval (B.1), re-ranking (B.2), and reasoning (B.3). Figures 4-7 provide all in-context learning examples for all six datasets. A reader can reconstruct every prompt."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section 4.1.3 reports N=3 (knowledge recall), k=20 (Gram determinant samples), l=L/2 (layer), 10 ICL examples. However, LLM sampling parameters (temperature, top-p) for the 20 pseudo-generations are not stated, and these significantly affect the uncertainty estimation."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Figure 2 provides a detailed system diagram. Sections 3.1-3.4 describe the full pipeline: self-aware retrieval (uncertainty threshold → retrieval decision), query generation, re-ranking (pairwise uncertainty comparison), and reasoning strategy selection."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.1.1 describes dataset selection and open-domain QA setting. Section 4.1.3 specifies Wikipedia dump date (December 20, 2018), BM25 search engine, and dataset splitting conventions (following IRCoT and DPR). Table 6 provides dataset statistics."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "A dedicated 'Limitations' section with five numbered points covers scope of usage, task coverage, computation issues, model scaling, and information retrieval."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Limitations are specific to this study: (1) requires access to internal states, limiting to open-source LLMs; (2) only tested on short-form QA; (3) 20 pseudo-generations are computationally costly; (4) maximum 8B parameter models tested; (5) self-aware re-ranking could be surpassed by advanced IR methods."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The Limitations section explicitly states what was NOT tested: closed-source LLMs (GPT series), long-form QA, creative writing, models larger than 8B. Task coverage limitation (point 2) directly states 'neglecting a broad spectrum of natural language processing tasks.'"
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "While the benchmark datasets are publicly available, the paper's experimental outputs (model predictions, uncertainty scores, intermediate reasoning chains) are not released for independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.1.1 describes each dataset, Section 4.1.3 specifies the Wikipedia dump (December 20, 2018), search engine (BM25/ElasticSearch), and dataset splitting conventions. Table 6 provides dataset statistics."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data sources are standard public benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The overall experimental setup is described but intermediate processing steps are not documented. No details on how Wikipedia was indexed, how benchmark questions were preprocessed, or how final metrics were computed from raw model outputs."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding acknowledgment or grant information appears anywhere in the paper. One author is from Siemens Technology but no funding disclosure is made."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All author affiliations are listed in the header: Tsinghua University, UC Santa Barbara, Beijing Institute of Technology, and Siemens Technology. The paper does not evaluate products from these institutions."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, making it impossible to assess funder independence. The Siemens affiliation is noted but no funding relationship is described."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper does not state when LLaMA-2's training data was collected. LLaMA-2 was trained on data with a cutoff around September 2022, but this is not mentioned in the paper."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No analysis of whether benchmark questions (HotpotQA 2018, NQ 2019, SQuAD 2016, etc.) appeared in LLaMA-2's training data. These benchmarks predate the model and overlap risk is not discussed."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "All six benchmarks (released between 2016 and 2020) were publicly available well before LLaMA-2's training. No contamination analysis or discussion is provided despite this clear risk."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. All evaluation is automated on QA benchmarks."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. The paper includes an Ethical Considerations section but it discusses potential misuse, not IRB approval."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The Limitations section mentions 20 pseudo-generations are 'computationally costly' and vLLM parallelization reduces latency, but no actual inference time, cost per query, or tokens consumed is reported."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The paper states experiments run on 'a single NVidia 3090 GPU with 24GiB GRAM' but does not report total GPU hours, wall-clock time, or computational budget for the experiments."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper does not state how many experimental runs produced the reported results. No mention of 'averaged over K runs' or similar."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Section 5.3 and Figure 3 show hyperparameter search results for N, k, δ, and layer selection on NQ training data. However, the total number of configurations tried and total compute spent on search are not stated."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Section 5.3 describes hyperparameter selection on NQ's training split (separate from evaluation data). The search process and selection criteria are shown in Figure 3 with clear rationale for each choice."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical significance tests are performed, so multiple comparison correction is inapplicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors re-implement FLARE with IRCoT strategy for complex QA evaluation. No acknowledgment that their re-implementation of baselines could systematically underperform the original authors' implementations."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "SEAKR requires 20 pseudo-generations per reasoning step for uncertainty estimation, roughly 20x the compute of baselines like FLARE/DRAGIN. This compute difference is mentioned in limitations but performance is never compared at matched compute budgets."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper uses six QA benchmarks without discussing whether EM/F1 on these benchmarks actually measures the adaptive RAG capabilities claimed. No construct validity analysis is provided."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "Section 4.1.3 states 'These choices and constraints are also applied to all our baseline methods for fair comparison,' using the same backbone LLM (LLaMA-2-chat 7B), search engine (BM25/ElasticSearch), and Wikipedia dump across all methods."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "All six benchmarks (SQuAD 2016, TriviaQA 2017, HotpotQA 2018, NQ 2019, 2WikiMultiHopQA 2020, IIRC 2020) predate LLaMA-2's training. The model may have seen benchmark solutions during training, but this temporal leakage is not discussed."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup leaks answer information through context or whether in-context learning examples provide inappropriate hints."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether train and test examples share structural similarities or whether the NQ training split used for hyperparameter search shares properties with other evaluation datasets."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "SEAKR outperforms existing adaptive RAG methods on complex QA benchmarks, achieving 36.0%, 39.7%, and 23.5% F1 on 2WikiMultiHop, HotpotQA, and IIRC respectively.",
    374       "evidence": "Table 1 (Section 4.2.1) shows SEAKR's F1 scores exceed the best baseline (DRAGIN at 30.0%, 34.2%, 22.9%) by 6.0, 5.5, and 0.6 percentage points.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Self-aware re-ranking contributes more to performance than self-aware retrieval in adaptive RAG.",
    379       "evidence": "Table 3 ablation (Section 5.1): removing re-ranking causes larger drops (−2.8 F1 on 2Wiki, −1.5 on HPQA) than removing retrieval (−2.1 on 2Wiki, −0.5 on HPQA). Authors state 'ablating self-aware re-ranking reduces the performance of SEAKR more than removing self-aware retrieval.'",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Internal-state uncertainty estimation outperforms output-level alternatives for adaptive RAG.",
    384       "evidence": "Table 3 compares SEAKR's Gram determinant approach against prompting-based, perplexity, multi-perplexity, LN-entropy, and energy score alternatives. SEAKR achieves 37.8 F1 on 2Wiki vs best alternative (LN-Entropy) at 36.0.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "SEAKR scales positively with more powerful backbone LLMs.",
    389       "evidence": "Table 4 (Section 5.2) shows improvement from LLaMA-2 (7B) to LLaMA-3 (8B): 37.8→48.1 F1 on 2Wiki, 38.1→47.7 on HPQA, 36.1→43.0 on NQ.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "SEAKR is effective on both complex and simple QA tasks.",
    394       "evidence": "Tables 1-2 show strong results on complex QA but mixed results on simple QA: SEAKR leads on TriviaQA (63.1 F1) and SQuAD (36.5 F1) but trails Self-RAG on NQ (35.5 vs 40.2 F1).",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars or statistical tests",
    401       "detail": "All results across Tables 1-4 are single-run point estimates with no variance, confidence intervals, or significance tests. Performance differences (e.g., the 0.6-percentage-point F1 improvement on IIRC) could be within random variation."
    402     },
    403     {
    404       "flag": "Abstract overclaims",
    405       "detail": "The abstract states SEAKR 'outperforms existing adaptive RAG methods' on 'both complex and simple QA datasets,' but Self-RAG outperforms SEAKR on NaturalQuestions (40.2 vs 35.5 F1, Table 2)."
    406     },
    407     {
    408       "flag": "No contamination analysis",
    409       "detail": "All six benchmarks (2016-2020) predate LLaMA-2's training data. The model may have memorized benchmark answers, inflating all results. No contamination check or temporal analysis is performed."
    410     },
    411     {
    412       "flag": "Uncontrolled compute overhead as confound",
    413       "detail": "SEAKR requires 20 pseudo-generations per reasoning step for uncertainty estimation, roughly 20x the compute of single-generation baselines like FLARE/DRAGIN. Performance improvement could partly stem from this additional compute budget rather than the self-aware mechanism itself."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
    419       "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang", "Avirup Sil", "Hannaneh Hajishirzi"],
    420       "year": 2023,
    421       "relevance": "Adaptive RAG method that fine-tunes LLMs with special retrieval/critique tokens; direct baseline comparison for SEAKR."
    422     },
    423     {
    424       "title": "Active Retrieval Augmented Generation",
    425       "authors": ["Zhengbao Jiang", "Frank F Xu", "Luyu Gao", "Zhiqing Sun", "Qian Liu", "Jane Dwivedi-Yu", "Yiming Yang", "Jamie Callan", "Graham Neubig"],
    426       "year": 2023,
    427       "relevance": "FLARE triggers retrieval on low-probability tokens; key adaptive RAG baseline that SEAKR aims to improve upon."
    428     },
    429     {
    430       "title": "DRAGIN: Dynamic Retrieval Augmented Generation Based on the Real-time Information Needs of Large Language Models",
    431       "authors": ["Weihang Su", "Yichen Tang", "Qingyao Ai", "Zhijing Wu", "Yiqun Liu"],
    432       "year": 2024,
    433       "relevance": "Dynamic RAG using attention weights for retrieval decisions; strongest baseline that SEAKR outperforms on complex QA."
    434     },
    435     {
    436       "title": "Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-intensive Multi-step Questions",
    437       "authors": ["Harsh Trivedi", "Niranjan Balasubramanian", "Tushar Khot", "Ashish Sabharwal"],
    438       "year": 2022,
    439       "relevance": "IRCoT combines CoT reasoning with retrieval for multi-hop QA; foundational retrieval-augmented reasoning method and baseline."
    440     },
    441     {
    442       "title": "INSIDE: LLMs' Internal States Retain the Power of Hallucination Detection",
    443       "authors": ["Chao Chen", "Kai Liu", "Ze Chen", "Yi Gu", "Yue Wu", "Mingyuan Tao", "Zhihang Fu", "Jieping Ye"],
    444       "year": 2023,
    445       "relevance": "Key methodological foundation for SEAKR's uncertainty estimation using Gram determinant of internal hidden representations."
    446     },
    447     {
    448       "title": "Retrieval-Augmented Generation for Knowledge-intensive NLP Tasks",
    449       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal"],
    450       "year": 2020,
    451       "relevance": "Foundational RAG paper combining retrieval with generation for knowledge-intensive tasks."
    452     },
    453     {
    454       "title": "Self-knowledge Guided Retrieval Augmentation for Large Language Models",
    455       "authors": ["Yile Wang", "Peng Li", "Maosong Sun", "Yang Liu"],
    456       "year": 2023,
    457       "relevance": "Trains a classifier to judge factuality of LLM generation for adaptive retrieval; alternative approach to SEAKR's internal-state method."
    458     },
    459     {
    460       "title": "Language Models (Mostly) Know What They Know",
    461       "authors": ["Saurav Kadavath", "Tom Conerly", "Amanda Askell"],
    462       "year": 2022,
    463       "arxiv_id": "2207.05221",
    464       "relevance": "Demonstrates LLMs are aware of their uncertainty, foundational motivation for SEAKR's self-aware uncertainty approach."
    465     },
    466     {
    467       "title": "The Internal State of an LLM Knows When It's Lying",
    468       "authors": ["Amos Azaria", "Tom Mitchell"],
    469       "year": 2023,
    470       "relevance": "Shows internal states can detect factuality, supporting the premise that LLM internals can guide retrieval decisions."
    471     },
    472     {
    473       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    474       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Fei Xia", "Ed Chi", "Quoc V Le", "Denny Zhou"],
    475       "year": 2022,
    476       "relevance": "CoT reasoning is the backbone reasoning strategy that SEAKR augments with self-aware retrieval."
    477     },
    478     {
    479       "title": "Probabilistic Tree-of-Thought Reasoning for Answering Knowledge-intensive Complex Questions",
    480       "authors": ["Shulin Cao", "Jiajie Zhang", "Jiaxin Shi", "Xin Lv", "Zijun Yao"],
    481       "year": 2023,
    482       "relevance": "ProbTree decomposes complex questions into sub-questions solved with RAG; retrieval-augmented reasoning approach."
    483     },
    484     {
    485       "title": "Adaptive-RAG: Learning to Adapt Retrieval-Augmented Large Language Models through Question Complexity",
    486       "authors": ["Soyeong Jeong", "Jinheon Baek", "Sukmin Cho", "Sung Ju Hwang", "Jong C Park"],
    487       "year": 2024,
    488       "arxiv_id": "2403.14403",
    489       "relevance": "Adapts RAG strategy based on question complexity; closely related adaptive RAG approach."
    490     }
    491   ],
    492   "engagement_factors": {
    493     "practical_relevance": {
    494       "score": 2,
    495       "justification": "Method is applicable to QA systems using open-source LLMs, but requires access to model internal states which limits deployment to local model serving."
    496     },
    497     "surprise_contrarian": {
    498       "score": 1,
    499       "justification": "Novel application of internal-state uncertainty for RAG decisions, but the general idea that LLM internals contain useful signals is well-established."
    500     },
    501     "fear_safety": {
    502       "score": 0,
    503       "justification": "No safety or risk implications; the paper addresses factual accuracy improvement in QA."
    504     },
    505     "drama_conflict": {
    506       "score": 0,
    507       "justification": "No controversial claims or challenges to established institutions or practices."
    508     },
    509     "demo_ability": {
    510       "score": 1,
    511       "justification": "Code claimed to be released but no URL provided; requires a GPU and local LLM deployment to use."
    512     },
    513     "brand_recognition": {
    514       "score": 1,
    515       "justification": "Tsinghua University is well-known in AI research but not a household name like OpenAI or Google DeepMind."
    516     }
    517   }
    518 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs