scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31147B)
      1 {
      2   "paper": {
      3     "title": "Understand What LLM Needs: Dual Preference Alignment for Retrieval-Augmented Generation",
      4     "authors": [
      5       "Guanting Dong",
      6       "Yutao Zhu",
      7       "Chenghao Zhang",
      8       "Zechen Wang",
      9       "Zhicheng Dou",
     10       "Ji-Rong Wen"
     11     ],
     12     "year": 2024,
     13     "venue": "The Web Conference",
     14     "arxiv_id": "2406.18676",
     15     "doi": "10.1145/3696410.3714717"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "DPA-RAG introduces a dual preference alignment framework for RAG systems, combining an externally preference-aligned reranker (via point-wise, pair-wise, and contrastive learning) with an LLM internal self-alignment stage. The framework outperforms baselines across four QA datasets (NQ, TriviaQA, HotpotQA, WebQSP) and generalizes across multiple LLMs from 500M to 14B parameters as well as GPT-3.5 and GPT-4. Ablations show the preference-aligned reranker contributes the largest gains, and sequential pre-alignment followed by SFT outperforms mixed multi-task training.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract states 'Our code is publicly available at https://github.com/dongguanting/DPA-RAG.' This is a definitive release, not a future promise."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All four evaluation datasets (NQ, TriviaQA, HotpotQA, WebQSP) are publicly available standard benchmarks. The paper states in Section B.1 that 'all datasets and evaluation benchmarks used in our experiments have been open-source.'"
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Section B.3 mentions 'eight A100 80g GPUs', LLaMA Factory versions 0.6.3 and 0.8.1, and AdamW optimizer, but there is no requirements.txt, Dockerfile, or comprehensive dependency listing sufficient to recreate the environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "While implementation details are provided in Section B.3 and algorithms in Appendix A, the paper lacks step-by-step reproduction instructions. No README-style commands or scripts for replicating experiments are given in the paper itself."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Despite reporting 'the average performance from five experiments, each with a different random seed' (Section B.3), Tables 2-5 present only point estimates with no confidence intervals, error bars, or ± notation."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims DPA-RAG 'significantly outperforms all baselines' (Section 4.2) but provides no p-values, t-tests, or any statistical significance tests. Differences are assessed by comparing raw numbers only."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Table 2 reports absolute improvements with baseline context (e.g., '+5.09' for NQ Hit@1 from 50.94 to 56.03 with LLaMA2-7B). The improvement magnitudes are shown relative to baselines throughout."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for the choice of five random seeds, the number of test examples, or a power analysis. Standard benchmark test set sizes are used without discussion of whether they are sufficient for the claimed comparisons."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Section B.3 states results are 'the average performance from five experiments, each with a different random seed,' but no standard deviation, IQR, or any spread measure is reported in any table or figure. The reader cannot assess result stability."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Table 2 includes extensive baselines: traditional RAG with DPR across 6 LLMs, 7 reranker-based methods (RankGPT, LRL, PRP, RankLLaMA, BGE, BCEmbedding, ColBERTv2), and 6 preference-aligned methods (KnowPAT, REPLUG, RA-Judgement, RRHF, RAFT, FILCO)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include methods from 2023-2024 such as RAFT (2024), FILCO (2023), RankGPT (2023), BGE (2023), LLaMA3 (2024), and Qwen2 (2024). These are contemporary for a 2024 paper."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Tables 3 and 5 present detailed ablation studies removing individual components: PA-Reranker, Pre-Align, Query Augmentation, Filtering, point-wise/pair-wise/CPA alignment, and MGDA-UB optimization. Each component's contribution is individually measured."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Two evaluation metrics are used throughout: Hit@1 for top-ranked accuracy and F1 score for quality/similarity to ground truth (Section 4.1)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation of model outputs is conducted. Evaluation is entirely automated using Hit@1 and F1 metrics against ground-truth answers."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table 1 shows explicit train/dev/test splits for all four datasets. Results in Table 2 are reported on the standard test sets for each benchmark."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down per dataset (Table 2), per model size (Figure 3), per preference category (Figure 4: Both Correct, Aligned Knowledge, Unaligned Knowledge, Both Incorrect), per augmentation strategy (Table 4), and per ablation component (Table 5)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 1 and Figure 1 discuss the 'Unaligned Knowledge' failure mode where retrieval misleads the LLM. Appendix C.3 provides detailed case studies showing baseline failures (e.g., NQ-Case1 baseline outputs 'New Westminster' incorrectly). Figure 5 discusses mixed training failure."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Figure 5 (right) explicitly reports that mixed training 'leads to a noticeable performance decline and fluctuations' compared to standard QA training, identifying an approach that doesn't work. The paper also notes that some augmentation strategies are less effective than others (Table 4)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims DPA-RAG 'outperforms all baselines' on four QA datasets, supported by Table 2. The claim of seamless integration with 'black-box and open-sourced LLM readers' is supported by results with GPT-3.5, GPT-4, LLaMA2, LLaMA3, and Qwen2."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims ('DPA-RAG improves performance') are supported by controlled ablation studies (Tables 3, 5) that remove individual components and measure the effect. The ablation design with single-variable manipulation is adequate for these claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper calls DPA-RAG 'a universal framework designed to align diverse knowledge preferences within RAG systems' (abstract), but only tests on four English QA datasets. No testing on other languages, other knowledge-intensive tasks (e.g., summarization, dialogue), or other retrieval corpora. The 'universal' claim significantly overstates the tested scope."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not discuss alternative explanations for why DPA-RAG works. For example, improvements could stem from additional training data rather than the alignment mechanism, or from the augmented data providing a regularization effect. No robustness checks or confound analysis is presented."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures Hit@1 and F1 on QA benchmarks and claims improvements in QA performance within RAG systems. The measurements match the claimed outcomes — no proxy gap exists between what is measured and what is claimed."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "While 'gpt-3.5-turbo-0613' is specified for data augmentation (Section C.2), the main evaluation uses only 'GPT-3.5' and 'GPT-4' without snapshot dates or API versions (Table 2). LLaMA2-7B/13B and LLaMA3-8B are specified by size but not by exact checkpoint or release version."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt templates are provided in Section B.2: the SFT stage prompt, the pre-aligned stage prompt with judgment instruction, and the query augmentation prompt. These include the actual text with placeholders whose fill values are defined."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section B.3 reports: batch size 16 (reranker) and 128 (QA), learning rates 1e-5 and 7e-5, 10 and 3 epochs, AdamW optimizer, 3% warmup, temperature 1.0 for augmentation, a=0.8 for scoring weight, top-k=3 documents."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. DPA-RAG is a retriever-reranker-reader pipeline with fine-tuning, not an agent-based system."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3.2 documents the preference knowledge construction pipeline in detail: hierarchical document sampling (positions 1, 25, 50, 100), four-category classification, five augmentation strategies, NLI quality filtering removing ~20% of augmented data. Algorithm 1 formalizes the full process."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "There is no dedicated limitations section in the paper. The conclusion (Section 5) is a brief summary of contributions with no discussion of limitations or threats to validity."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of specific methodological weaknesses or potential confounds."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not state what it does NOT show. It claims 'universal' applicability without stating boundaries — no mention of untested languages, task types, retrieval corpora, or settings where the approach might not work."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "While the benchmark datasets are public, the constructed preference dataset, per-run experimental outputs, and individual seed results are not made available. The reader cannot verify the reported averaged numbers."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.2 describes in detail how preference data is collected: direct vs. referencing document comparisons on GPT-3.5, four-category classification, hierarchical sampling of documents at positions 1, 25, 50, 100. Section B.1 describes all four benchmark datasets."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from standard public QA benchmarks (NQ, TriviaQA, HotpotQA, WebQSP)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Algorithm 1 provides the complete pipeline from training data to preference dataset construction including extraction, augmentation, and NLI filtering. Algorithm 2 documents the reader training pipeline. Section 3.2 states the preference dataset is ~20% of the training set."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding source or acknowledgments section is present in the paper text. The paper does not disclose whether the work was funded."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Gaoling School of Artificial Intelligence at Renmin University of China, and Beijing University of Posts and Telecommunications."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not the same as absence of conflict."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial disclosure is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the models used (GPT-3.5, GPT-4, LLaMA2, LLaMA3, Qwen2). This is needed to assess whether benchmark test data could be in the training corpus."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of potential overlap between benchmark test sets (NQ, TriviaQA, HotpotQA, WebQSP) and the pre-training data of the LLMs used. These benchmarks all predate the models."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "NQ (2019), TriviaQA (2017), HotpotQA (2018), and WebQSP (2016) were all publicly available well before the training cutoffs of GPT-3.5, GPT-4, LLaMA2/3, and Qwen2. No contamination analysis or mitigation is discussed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. All experiments use automated evaluation on benchmark datasets."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, latency, or per-example API cost is reported. The system involves retrieval, reranking, and LLM generation but no cost figures are given for any stage."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Section B.3 mentions 'eight A100 80g GPUs' for training but does not quantify total GPU hours, wall-clock training time, or API costs for GPT-3.5/GPT-4 experiments."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Section B.3 states 'the average performance from five experiments, each with a different random seed,' but no per-seed results, standard deviations, or sensitivity analysis is reported. The reader cannot assess how much results vary across seeds."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Section B.3 explicitly states: 'We report the average performance from five experiments, each with a different random seed.'"
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Specific hyperparameters are reported (Section B.3) but the paper does not describe how they were selected — no search budget, search method, or number of configurations tried. The scoring weight a=0.8 is noted as following prior work without grid search."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No explanation of how the final hyperparameter configuration was selected. Learning rates, batch sizes, and epochs appear chosen but the selection process is not described."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparisons across 4 datasets, 6+ reader models, and 13+ baselines without any correction for multiple comparisons (Bonferroni, Holm, etc.)."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement and evaluate their own baselines (Table 2) without acknowledging the well-documented bias that authors' re-implementations of baselines systematically underperform."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "DPA-RAG adds a fine-tuned reranker and a pre-alignment training stage on top of standard RAG. The additional compute cost is never compared against the performance gains, nor are baselines compared at matched compute budgets."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper uses NQ, TriviaQA, HotpotQA, and WebQSP as QA benchmarks without discussing whether these benchmarks adequately measure the claimed capability of 'preference alignment' in RAG systems."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No agentic scaffolding is involved. DPA-RAG is a retriever-reranker-reader pipeline, not an agent scaffold."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "All four benchmarks (NQ 2019, TriviaQA 2017, HotpotQA 2018, WebQSP 2016) predate the models used (GPT-3.5/4, LLaMA2/3, Qwen2). The models may have seen solutions in training. This is not discussed."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the retrieval setup or the preference knowledge construction pipeline introduces information leakage between train and test."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether train and test examples share structural similarities or come from overlapping sources (e.g., Wikipedia articles appearing in both retrieval corpus and test answers)."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "DPA-RAG significantly outperforms all baselines across four QA datasets with different LLM readers.",
    372       "evidence": "Table 2 shows DPA-RAG with LLaMA2-7B achieving +5.09 Hit@1 on NQ, +6.26 on TriviaQA, +3.83 on HotpotQA, +3.88 on WebQSP over RAG baseline. Improvements are consistent across GPT-3.5, GPT-4, LLaMA2-13B, LLaMA3-8B, and Qwen2-7B.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "The preference-aligned reranker provides the largest performance contribution among DPA-RAG's components.",
    377       "evidence": "Table 3 ablation shows removing PA-Reranker causes the largest drop (-3.23 Hit@1 on NQ, -3.64 on TQA), larger than removing Pre-Align (-1.72, -2.21) or Query Augmentation (-2.13, -2.62).",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Integrating both external and internal alignment yields a mutually reinforcing effect beyond additive gains.",
    382       "evidence": "Table 3 shows combined removal (w/o Pre-Align + PA-Rerank) causes -4.12 on NQ, while individual removals sum to -4.95, suggesting some synergy. However this is only shown on 2 datasets.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "Sequential training (pre-alignment then SFT) outperforms mixed multi-task training.",
    387       "evidence": "Figure 5 (right) shows mixed training leads to 'a noticeable performance decline and fluctuations' while sequential training yields stable gains on NQ. Only demonstrated on one dataset.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Query augmentation effectiveness correlates with data complexity and diversity scores.",
    392       "evidence": "Table 4 and Figure 5 (left) show that augmentation strategies with higher complexity+diversity scores (Complexity: 2.33, NQ 54.81) outperform those with lower scores (Rephrasing: 2.03, NQ 52.27). A monotonic trend is observed.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "DPA-RAG delivers stable performance improvements as model parameter size increases from 500M to 14B.",
    397       "evidence": "Figure 3 shows DPA-RAG improvements remain relatively consistent across Qwen1.5-0.5B through Qwen1.5-14B on both HQA and TQA datasets.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Retrieved documents often fail to match the knowledge preferences of LLMs, with high-similarity documents sometimes misleading the model.",
    402       "evidence": "Figure 1 preliminary analysis on GPT-3.5 across NQ, TriviaQA, and HotpotQA shows 'Unaligned Knowledge' cases where high-similarity documents mislead, and 'Aligned Knowledge' cases where low-similarity documents (100th) still help.",
    403       "supported": "moderate"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "No variance reported despite multiple runs",
    409       "detail": "The paper averages over 5 random seeds but never reports standard deviation, confidence intervals, or any spread measure. Without variance, the reader cannot assess whether reported improvements are within noise margins. Given the small absolute differences in some comparisons (e.g., +1.44 on WebQSP Hit@1 for LLaMA3), this is a significant omission."
    410     },
    411     {
    412       "flag": "No statistical significance tests",
    413       "detail": "Claims of 'significantly outperforms' are made based on comparing raw numbers without any statistical tests. With many comparisons across 4 datasets, 6+ readers, and 13+ baselines, some apparent improvements could be due to chance."
    414     },
    415     {
    416       "flag": "Overclaiming with 'universal' label",
    417       "detail": "The paper calls DPA-RAG 'a universal framework' but only evaluates on English-language QA tasks with Wikipedia-based retrieval. No testing on other languages, other task types (summarization, dialogue, code), or other knowledge sources."
    418     },
    419     {
    420       "flag": "No limitations section",
    421       "detail": "The paper has no dedicated limitations discussion. This is unusual for a venue like The Web Conference and means known weaknesses (e.g., dependence on GPT-3.5 for preference scoring, generalization beyond QA) are never acknowledged."
    422     },
    423     {
    424       "flag": "Benchmark contamination risk unaddressed",
    425       "detail": "All four benchmarks (NQ, TriviaQA, HotpotQA, WebQSP) were published years before the training cutoffs of GPT-3.5/4 and LLaMA2/3. The baseline LLMs may have memorized test answers, confounding the preference alignment evaluation."
    426     },
    427     {
    428       "flag": "Self-comparison bias",
    429       "detail": "Authors implement all baselines themselves (Table 2) without acknowledging the well-documented bias that authors' re-implementations of competing methods tend to underperform. No independent evaluation or use of official baseline implementations is confirmed."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Training language models to follow instructions with human feedback",
    435       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    436       "year": 2022,
    437       "relevance": "Foundational RLHF paper for aligning LLMs with human preferences, which DPA-RAG extends to retrieval alignment."
    438     },
    439     {
    440       "title": "Direct preference optimization: Your language model is secretly a reward model",
    441       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    442       "year": 2023,
    443       "relevance": "DPO is the dominant alternative to RLHF for preference alignment; DPA-RAG's pair-wise alignment draws on similar principles."
    444     },
    445     {
    446       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    447       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    448       "year": 2020,
    449       "relevance": "Foundational RAG paper that established the retrieve-then-read paradigm that DPA-RAG extends."
    450     },
    451     {
    452       "title": "REPLUG: Retrieval-augmented black-box language models",
    453       "authors": ["Weijia Shi", "Sewon Min", "Michihiro Yasunaga"],
    454       "year": 2023,
    455       "arxiv_id": "2301.12652",
    456       "relevance": "Baseline method for aligning retrievers with black-box LLMs using output probability, directly compared with DPA-RAG."
    457     },
    458     {
    459       "title": "RAFT: adapting language model to domain specific RAG",
    460       "authors": ["Tianjun Zhang", "Shishir G. Patil", "Naman Jain"],
    461       "year": 2024,
    462       "arxiv_id": "2403.10131",
    463       "relevance": "Baseline that trains LLMs to ignore irrelevant retrieved documents, directly compared with DPA-RAG."
    464     },
    465     {
    466       "title": "Learning to filter context for retrieval-augmented generation",
    467       "authors": ["Zhiruo Wang", "Jun Araki", "Zhengbao Jiang"],
    468       "year": 2023,
    469       "arxiv_id": "2311.08377",
    470       "relevance": "FILCO baseline that filters training context using data selection methods, a strong competing approach to DPA-RAG."
    471     },
    472     {
    473       "title": "Llama 2: Open foundation and fine-tuned chat models",
    474       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    475       "year": 2023,
    476       "arxiv_id": "2307.09288",
    477       "relevance": "Primary open-source LLM used as reader in DPA-RAG experiments across multiple model sizes."
    478     },
    479     {
    480       "title": "GPT-4 technical report",
    481       "authors": ["OpenAI"],
    482       "year": 2023,
    483       "arxiv_id": "2303.08774",
    484       "relevance": "Black-box LLM reader used to demonstrate DPA-RAG's cross-model generalization capability."
    485     },
    486     {
    487       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    488       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    489       "year": 2022,
    490       "relevance": "Foundational prompting technique for LLM reasoning, relevant to how LLMs process retrieved knowledge."
    491     },
    492     {
    493       "title": "ReAct: Synergizing reasoning and acting in language models",
    494       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    495       "year": 2023,
    496       "relevance": "Agent framework combining reasoning and retrieval, relevant to multi-round retrieval paradigm in RAG alignment."
    497     },
    498     {
    499       "title": "Knowledgeable preference alignment for LLMs in domain-specific question answering",
    500       "authors": ["Yichi Zhang", "Zhuo Chen", "Yin Fang"],
    501       "year": 2024,
    502       "relevance": "KnowPAT baseline for aligning LLM preferences with knowledge, directly compared with DPA-RAG."
    503     },
    504     {
    505       "title": "Investigating the factual knowledge boundary of large language models with retrieval augmentation",
    506       "authors": ["Ruiyang Ren", "Yuhao Wang", "Yingqi Qu"],
    507       "year": 2023,
    508       "arxiv_id": "2307.11019",
    509       "relevance": "RA-Judgement baseline that explores knowledge boundaries of RAG, directly compared with DPA-RAG."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 2,
    515       "justification": "RAG is widely deployed in production systems; the reranker alignment approach is implementable with released code, though it requires training infrastructure."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "The finding that retrieved documents don't match LLM preferences is somewhat expected in the RAG community; the dual alignment approach is incremental rather than surprising."
    520     },
    521     "fear_safety": {
    522       "score": 0,
    523       "justification": "No safety or security concerns raised; the paper focuses on improving QA accuracy in RAG systems."
    524     },
    525     "drama_conflict": {
    526       "score": 0,
    527       "justification": "No controversy or conflict; this is a straightforward method paper with incremental improvements."
    528     },
    529     "demo_ability": {
    530       "score": 1,
    531       "justification": "Code is released on GitHub but requires significant setup (training reranker, fine-tuning LLMs on 8 A100 GPUs) — not easily demoed."
    532     },
    533     "brand_recognition": {
    534       "score": 1,
    535       "justification": "Renmin University is well-known in NLP research but not a household name; paper uses GPT-3.5/4 as baselines but is not from OpenAI."
    536     }
    537   }
    538 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs