scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29065B)
      1 {
      2   "paper": {
      3     "title": "Shifting from Ranking to Set Selection for Retrieval Augmented Generation",
      4     "authors": [
      5       "Dahyun Lee",
      6       "Yongrae Jo",
      7       "Haeju Park",
      8       "Moontae Lee"
      9     ],
     10     "year": 2025,
     11     "venue": "Annual Meeting of the Association for Computational Linguistics",
     12     "arxiv_id": "2507.06838",
     13     "doi": "10.18653/v1/2025.acl-long.861"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "SETR, a set-wise passage selection approach for RAG, outperforms traditional reranking methods (including GPT-4o-based RankGPT) on four multi-hop QA benchmarks while using 40-50% fewer passages on average. Chain-of-Thought reasoning with explicit information requirement identification (IRI) contributes distinct gains to retrieval precision and information coverage. Controlled experiments isolating method-level effects (same base model, same teacher supervision) confirm the gains stem from the set-wise formulation itself rather than model capacity differences.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract states 'The code is available at https://github.com/LGAI-Research/SetR' and Section 3 references the open-source contribution."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All evaluation benchmarks (HotpotQA, 2WikiMultiHopQA, MuSiQue, MultiHopRAG) are public. Training data is referenced from a public HuggingFace dataset (castorini/rank_zephyr_training_data, Section 3.3.1)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions '16×A100 GPUs', Axolotl framework, and Rankify toolkit (Appendix A.3), but provides no requirements.txt, Dockerfile, or detailed library version specifications sufficient to recreate the environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. Training and evaluation procedures are described at a high level, but there is no 'Reproducing Results' section or commands to run."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 1-5 report only point estimates (e.g., '36.62' F1) with no confidence intervals, error bars, or ± notation anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims SETR 'significantly outperforms' baselines (Section 4.2) but provides no statistical significance tests (no p-values, t-tests, bootstrap tests, or any other test). Differences are assessed by comparing point estimates only."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Tables 1-4 provide baseline and method results allowing direct computation of effect sizes. Section 4.3 reports '3.8%-4.6% higher precision' with baseline context. All results are contextualized against multiple baselines."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification for why these particular benchmark sizes or training set size (40K) were chosen. No power analysis or discussion of whether sample sizes are sufficient for the claims."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviation, variance, or any spread measure is reported across experimental runs. All results appear to be from single runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Extensive baselines are included: BM25, bge-reranker-large, RankLlama, RankVicuna, RankZephyr, FirstMistral, and RankGPT (gpt-4o). Tables 1-4 compare against all of them."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include recent models: FirstMistral (2024), RankGPT with gpt-4o-2024-08-06, RankZephyr (2023), and bge-reranker-large (2023). These represent the current state of the art."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Three SETR variants are ablated: SETR-Selection only (no reasoning), SETR-CoT (general CoT), and SETR-CoT & IRI (full model). Results in Tables 1, 2, 3 isolate contributions of CoT and IRI components."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics are used: EM, F1, Accuracy for QA; MRR@10, NDCG@10, Precision@5, Recall@5 for retrieval (Table 2); Hit@k and information coverage (Figure 3); token efficiency (Table 5)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is included. All evaluation is automated using benchmark metrics (EM, F1, Accuracy, MRR, NDCG, Precision, Recall)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "SETR is trained on MS MARCO data (Section 3.3.1) and evaluated on completely separate benchmarks: HotpotQA, 2WikiMultiHopQA, MuSiQue, and MultiHopRAG. This is a zero-shot transfer evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down per dataset across all four benchmarks (Tables 1-4), with separate retrieval metrics in Table 2 and efficiency analysis per dataset in Table 5."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No qualitative failure examples are shown. Section 5.1 discusses performance degradation trends with more passages (Figure 3), but no specific failure cases or error analysis is provided."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Several negative results are reported: SETR-CoT underperforms SETR-Selection only on some benchmarks (e.g., MultiHopRAG with BM25: 41.63 vs 43.62 in Table 1). Figure 3 shows that increasing passages degrades performance for all methods."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims SETR 'outperforms both proprietary LLM-based rerankers and open-source baselines in terms of answer correctness and retrieval quality.' Tables 1 and 2 support this across four benchmarks with multiple metrics."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims ('X improves Y') are supported by: (1) ablation studies isolating CoT and IRI components (Tables 1, 3), (2) controlled teacher-model experiments (Table 3), and (3) unified-setting comparison controlling for base model, data, and supervision (Table 4, Section 5.3)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The abstract bounds claims to 'multi-hop RAG benchmarks.' The Limitations section explicitly states: 'our method optimizes retrieval for multi-hop and complex queries but has not yet been validated across diverse RAG domains such as code generation or conversational AI.'"
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 5.3 explicitly designs experiments to rule out alternative explanations: that gains come from model capacity or teacher supervision rather than the method itself. They test with the teacher model as upper bound and in a unified training setting."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures EM, F1, and Accuracy and claims 'answer correctness' — the measurements directly match the claimed outcomes. No proxy gap exists between what is measured and what is claimed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific versions are given: 'gpt-4o-2024-08-06' (footnote 3), 'Llama-3.1-8B-Instruct' with HuggingFace URL (footnote 4). Baseline model versions are also specified (e.g., Zephyr-7B-β, Mistral 7B)."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Full prompt templates are provided in Figures 2, 4, 5, 6, and 7 for all methods (SETR-CoT & IRI, SETR-CoT, SETR-Selection only, QA generation prompts). The placeholders are for actual input data (questions, passages)."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Training hyperparameters are reported (5 epochs, batch size 512, LR 5×10⁻⁶, AdamW — Section 3.3.2 and Appendix A.3). However, inference parameters (temperature, top-p, max tokens) for both SETR and GPT-4o are not stated."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. SETR is a single-pass retrieve → select → generate pipeline with no tools, loops, retry logic, or feedback mechanisms."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 3.3.1 describes data construction: 40K questions from Pradeep et al., top-20 passages retrieved, GPT-4o labeling, text preprocessing with ftfy, and replacing [n] with (n) to prevent model confusion."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "A dedicated 'Limitations' section appears at the end of the paper with three substantive paragraphs covering dependence on initial retrieval, domain-specific validation gaps, and LLM reasoning quality."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The limitations are specific to this study: (1) dependence on initial retrieval stage quality, (2) not validated on diverse RAG domains like code generation or conversational AI, (3) effectiveness depends on the LLM's reasoning ability."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section explicitly states what was not tested: 'has not yet been validated across diverse RAG domains such as code generation or conversational AI' and notes dependence on initial retrieval quality."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Evaluation benchmarks are publicly available, but the GPT-4o-generated teacher annotations used for SETR training (40K examples) are not stated as released. Individual model predictions and intermediate results are not available."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.3.1 describes data construction in detail: 40K training questions from Pradeep et al. (2023b), derived from MS MARCO v1, each paired with top-20 retrieved passages, labeled by GPT-4o with zero-shot prompting."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard public benchmarks (HotpotQA, 2WikiMultiHopQA, MuSiQue, MultiHopRAG) and an existing training dataset."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: source training data → top-20 passage retrieval → GPT-4o annotation → text preprocessing with ftfy and [n]→(n) replacement → supervised fine-tuning. Evaluation pipeline is also described (retrieval → selection → generation)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding or acknowledgments section is present in the paper. The authors are affiliated with LG AI Research (a corporate lab) and University of Illinois Chicago, but no funding source is disclosed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: LG AI Research and University of Illinois Chicago, with email addresses at lgresearch.ai."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed. The work comes from LG AI Research, a corporate lab. While they don't evaluate LG products specifically, the absence of funding disclosure makes independence unverifiable."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for Llama-3.1-8B-Instruct or GPT-4o, despite both models being used for evaluation and potentially having seen the benchmark data."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of potential overlap between model pretraining data and evaluation benchmarks. HotpotQA (2018), 2WikiMultiHopQA (2020), and MuSiQue (2022) predate both models and could be in their training data."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Three of four benchmarks (HotpotQA 2018, 2WikiMultiHopQA 2020, MuSiQue 2022) were published well before Llama-3.1 and GPT-4o training cutoffs. No contamination risk discussion is provided."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. All evaluation is automated on benchmark datasets."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 5.4 and Table 5 report token-level efficiency analysis: output tokens generated during retrieval and input tokens fed to the generator for all methods across all four benchmarks."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Hardware is mentioned (16×A100 GPUs, Appendix A.3) but total compute budget is not quantified — no GPU hours, training time, or total API spend for the 40K GPT-4o labeling is reported."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search budget is reported. The final hyperparameters (LR 5×10⁻⁶, batch size 512, 5 epochs) are stated without any description of how they were selected."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No explanation of how the final hyperparameter configuration was selected. No mention of validation set selection or search procedure."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical tests are performed at all, so the question of correction for multiple comparisons does not arise."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No acknowledgment of author-evaluation bias. The authors compare SETR against baselines implemented via the Rankify toolkit but do not discuss whether their own system receives more careful tuning."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 5.4 and Table 5 explicitly compare token usage (as a compute proxy) across methods alongside their performance. The paper discusses the accuracy-efficiency trade-off spectrum across SETR variants."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether the multi-hop QA benchmarks actually measure what the paper claims to evaluate. The benchmarks are used without questioning their construct validity."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The retrieval and generation components are held constant across all comparisons (same bge-large-en-v1.5 retriever, same Llama-3.1-8B-Instruct or GPT-4o generator). Section 5.3 further controls for base model and training data."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. HotpotQA (2018), 2WikiMultiHopQA (2020), and MuSiQue (2022) were all published before the training data collection periods of Llama-3.1 and GPT-4o."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of feature leakage or whether the evaluation setup provides information not available in real usage scenarios."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of independence between training and test data. While SETR is trained on MS MARCO and evaluated on different benchmarks, potential overlap in the pretraining data of the base LLMs is not addressed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap, or decontamination pipelines)."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "SETR outperforms both proprietary LLM-based rerankers and open-source baselines in terms of answer correctness across multi-hop QA benchmarks.",
    370       "evidence": "Tables 1 and 4 show SETR-CoT & IRI achieves the highest F1/EM/Accuracy on most benchmarks. With bge retriever: F1 38.11 on HotpotQA vs 34.45 for RankGPT (gpt-4o), Accuracy 47.14 on MultiHopRAG vs 45.69 for RankGPT.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Set-wise passage selection achieves comparable or better performance while using 40-50% fewer passages on average.",
    375       "evidence": "Table 1 shows SETR uses ~2.63-2.91 passages on average vs 5.00 for all reranking baselines, while achieving higher F1 and Accuracy scores.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Both CoT reasoning and information requirement identification (IRI) contribute distinct performance gains.",
    380       "evidence": "Ablation in Tables 1, 3, and 4 compares SETR-Selection only, SETR-CoT, and SETR-CoT & IRI. Results show mixed but generally positive contributions of each component, with some benchmarks showing CoT underperforming Selection only.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "SETR improves information coverage from 19.33% to 36.49% compared to reranking methods.",
    385       "evidence": "Figure 3d shows information coverage percentages on MultiHopRAG benchmark, with SETR achieving substantially higher coverage than reranking baselines.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Performance gains stem from the set-wise selection formulation itself, not from stronger base models or teacher supervision.",
    390       "evidence": "Section 5.3 and Tables 3-4 present controlled experiments: teacher model as upper bound (Table 3 using GPT-4o for all methods) and unified setting (Table 4 using same Llama-3.1-8B-Instruct, same training data, same teacher). SETR outperforms reranking in both controlled settings.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No statistical significance testing",
    397       "detail": "The paper uses 'significantly outperforms' language (Section 4.2) but provides no statistical significance tests anywhere. All claims of superiority are based on comparing point estimates across single runs."
    398     },
    399     {
    400       "flag": "No variance or error bars reported",
    401       "detail": "No standard deviation, confidence intervals, or any uncertainty quantification across all tables and figures. With single-run results, it is impossible to know whether observed differences exceed natural variance."
    402     },
    403     {
    404       "flag": "Benchmark contamination risk ignored",
    405       "detail": "Three of four benchmarks (HotpotQA 2018, 2WikiMultiHopQA 2020, MuSiQue 2022) predate the training of both Llama-3.1 and GPT-4o. No contamination analysis or discussion is provided."
    406     },
    407     {
    408       "flag": "Corporate affiliation without funding disclosure",
    409       "detail": "All lead authors are from LG AI Research, a corporate lab. No funding source, competing interests, or financial interests are disclosed."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
    415       "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang", "Avirup Sil", "Hannaneh Hajishirzi"],
    416       "year": 2023,
    417       "arxiv_id": "2310.11511",
    418       "relevance": "Key RAG system that dynamically determines retrieval necessity with self-reflection, directly comparable to the proposed set-wise selection approach."
    419     },
    420     {
    421       "title": "Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agents",
    422       "authors": ["Weiwei Sun", "Lingyong Yan", "Xinyu Ma", "Shuaiqiang Wang", "Pengjie Ren", "Zhumin Chen", "Dawei Yin", "Zhaochun Ren"],
    423       "year": 2024,
    424       "arxiv_id": "2304.09542",
    425       "relevance": "Introduces RankGPT and evaluates LLMs as reranking agents, serving as the primary proprietary baseline in this paper."
    426     },
    427     {
    428       "title": "RankZephyr: Effective and Robust Zero-Shot Listwise Reranking is a Breeze!",
    429       "authors": ["Ronak Pradeep", "Sahel Sharifymoghaddam", "Jimmy Lin"],
    430       "year": 2023,
    431       "arxiv_id": "2312.02724",
    432       "relevance": "Open-source listwise reranker that serves as a key baseline and provides the training data used by SETR."
    433     },
    434     {
    435       "title": "RankVicuna: Zero-Shot Listwise Document Reranking with Open-Source Large Language Models",
    436       "authors": ["Ronak Pradeep", "Sahel Sharifymoghaddam", "Jimmy Lin"],
    437       "year": 2023,
    438       "arxiv_id": "2309.15088",
    439       "relevance": "LLM-based listwise reranker baseline trained with GPT-generated supervision for passage ranking."
    440     },
    441     {
    442       "title": "MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries",
    443       "authors": ["Yixuan Tang", "Yi Yang"],
    444       "year": 2024,
    445       "arxiv_id": "2401.15391",
    446       "relevance": "Primary evaluation benchmark specifically designed for multi-hop RAG systems with gold evidence annotations."
    447     },
    448     {
    449       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    450       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal"],
    451       "year": 2021,
    452       "arxiv_id": "2005.11401",
    453       "relevance": "Foundational RAG paper that established the retrieval-augmented generation paradigm evaluated in this work."
    454     },
    455     {
    456       "title": "Adaptive-RAG: Learning to Adapt Retrieval-Augmented Large Language Models through Question Complexity",
    457       "authors": ["Soyeong Jeong", "Jinheon Baek", "Sukmin Cho", "Sung Ju Hwang", "Jong C. Park"],
    458       "year": 2024,
    459       "arxiv_id": "2403.14403",
    460       "relevance": "Adaptive retrieval strategy for RAG that dynamically determines retrieval necessity based on query complexity."
    461     },
    462     {
    463       "title": "RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval",
    464       "authors": ["Parth Sarthi", "Salman Abdullah", "Aditi Tuli", "Shubh Khanna", "Anna Goldie", "Christopher D. Manning"],
    465       "year": 2024,
    466       "arxiv_id": "2401.18059",
    467       "relevance": "Iterative retrieval approach that decomposes and refines complex questions through recursive processing."
    468     },
    469     {
    470       "title": "Large Language Models are Effective Text Rankers with Pairwise Ranking Prompting",
    471       "authors": ["Zhen Qin", "Rolf Jagerman", "Kai Hui", "Honglei Zhuang", "Junru Wu"],
    472       "year": 2023,
    473       "arxiv_id": "2306.17563",
    474       "relevance": "Evaluates LLMs for pairwise ranking in retrieval systems, relevant to understanding ranking strategies for RAG."
    475     },
    476     {
    477       "title": "CoRAG: A Cost-Constrained Retrieval Optimization System for Retrieval-Augmented Generation",
    478       "authors": ["Ziting Wang", "Haitao Yuan", "Wei Dong", "Gao Cong", "Feifei Li"],
    479       "year": 2024,
    480       "arxiv_id": "2411.00744",
    481       "relevance": "Multi-round retrieval approach for RAG with cost constraints, representing iterative retrieval strategies compared to SETR's single-step approach."
    482     },
    483     {
    484       "title": "Rankify: A Comprehensive Python Toolkit for Retrieval, Re-Ranking, and Retrieval-Augmented Generation",
    485       "authors": ["Abdelrahman Abdallah", "Jamshid Mozafari", "Bhawna Piryani", "Mohammed Ali", "Adam Jatowt"],
    486       "year": 2025,
    487       "arxiv_id": "2502.02464",
    488       "relevance": "Toolkit used to implement the full RAG pipeline in this paper's experiments, relevant to reproducibility of RAG evaluations."
    489     }
    490   ],
    491   "engagement_factors": {
    492     "practical_relevance": {
    493       "score": 2,
    494       "justification": "SETR is a drop-in replacement for rerankers in RAG systems with code released, directly usable by practitioners building multi-hop QA systems."
    495     },
    496     "surprise_contrarian": {
    497       "score": 1,
    498       "justification": "Challenges the dominance of ranking-based retrieval for RAG but the idea that set-level properties matter is intuitive rather than shocking."
    499     },
    500     "fear_safety": {
    501       "score": 0,
    502       "justification": "No safety, security, or risk concerns raised by this work."
    503     },
    504     "drama_conflict": {
    505       "score": 0,
    506       "justification": "No controversy or conflict; straightforward methodological improvement paper."
    507     },
    508     "demo_ability": {
    509       "score": 2,
    510       "justification": "Code released on GitHub (https://github.com/LGAI-Research/SetR) with model based on publicly available Llama-3.1-8B-Instruct."
    511     },
    512     "brand_recognition": {
    513       "score": 1,
    514       "justification": "LG AI Research is a known corporate lab but not a top-tier name in NLP research; published at ACL which adds credibility."
    515     }
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs