ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30388B)


      1 {
      2   "paper": {
      3     "title": "Hierarchical Document Refinement for Long-context Retrieval-augmented Generation",
      4     "authors": [
      5       "Jiajie Jin",
      6       "Xiaoxi Li",
      7       "Guanting Dong",
      8       "Yuyao Zhang",
      9       "Yutao Zhu",
     10       "Yongkang Wu",
     11       "Zhonghua Li",
     12       "Qi Ye",
     13       "Zhicheng Dou"
     14     ],
     15     "year": 2025,
     16     "venue": "Annual Meeting of the Association for Computational Linguistics",
     17     "arxiv_id": "2505.10413",
     18     "doi": "10.48550/arXiv.2505.10413"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "LongRefiner, a plug-and-play document refinement system for RAG, achieves best or near-best performance across seven QA datasets (single-hop, multi-hop, long-form) while using ~10x fewer tokens than full-document input and maintaining low latency (~10.8s vs ~40s for full content). Ablation studies show hierarchical document structuring is the most critical component (removal causes ~20% degradation), while scaling analysis demonstrates performance gains with larger refiner models and more training data. The approach outperforms perplexity-based compression methods like LongLLMLingua with ~50x less latency.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract states 'Our code is available at https://github.com/ignorejjj/LongRefiner' and provides a working URL."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "All seven evaluation datasets (NQ, TriviaQA, HotpotQA, 2WikiMultiHopQA, ASQA, ELI5, PopQA) are publicly available standard benchmarks. The corpus is based on the publicly available Wikipedia 2018 dump from KILT."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions VLLM, Llama-Factory, 4 NVIDIA A800 GPUs, and bf16 precision, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. Implementation details in Appendix A describe the setup but not specific commands to replicate experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 2, 3, 4, and 5 are reported as point estimates (e.g., '54.4 Acc') with no confidence intervals, error bars, or ± notation."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims improvements over baselines (e.g., 'surpassing the performance of perplexity-based methods by more than 9%') based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, etc.) are reported."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper provides effect sizes with baseline context: 'reducing token usage by 10x and latency by 4x' compared to full content, 'surpassing the performance of perplexity-based methods by more than 9%', and Table 2 shows all absolute scores enabling comparison."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper gives no justification for the choice of 10,000 training samples per dataset or for the number of evaluation examples per benchmark, and it reports no power analysis. The training data size is varied in Figure 3 but the final choice is not justified."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance or standard deviation is reported across runs. The paper states 'temperature set to 0 for greedy decoding to eliminate randomness in results' for inference, but training randomness (LoRA initialization, data ordering) is not addressed."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Nine baselines across three categories are compared: retrieval-based (BM25, Bge-Reranker, SBERT, Recomp), semantic chunking (Jina-Segment, Meta-Chunking), and perplexity-based (Selective-Context, LLMLingua2, LongLLMLingua)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include recent methods: LLMLingua2 (2024), Meta-Chunking (2024), Bge-Reranker (2024), LongLLMLingua (2023). These represent current state-of-the-art approaches."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 3 presents ablation studies removing each of the three key components (query analysis, document structuring, adaptive refinement) and measuring impact across single-hop, multi-hop, and long-form QA."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses Accuracy and F1 Score for short-answer QA datasets, and F1 Score for long-form datasets. Tables also report Tokens and Latency as efficiency metrics."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation is fully automated using Accuracy and F1 metrics. No human evaluation of output quality is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper uses standard benchmark test splits from NQ, TriviaQA, HotpotQA, 2WikiMultiHopQA, ASQA, ELI5, and PopQA. PopQA is additionally used as out-of-domain data since it 'does not include a training set.'"
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by all 7 individual datasets in Table 2, by QA type (single-hop, multi-hop, long-form) in Tables 3-4, and by document length in Figure 4."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper discusses PopQA where their method slightly underperforms full content due to short documents with minimal noise, and Figure 4 shows lower relative performance on shorter documents. Section 4.4 discusses recall decline with training data scaling."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Negative results include: lower performance on PopQA vs full content, lower performance on shorter documents (Figure 4), and temporary recall decline with increased training data (Figure 3 right panels) attributed to overfitting."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 'competitive performance in various scenarios' and '10x fewer computational costs' — Table 2 shows LongRefiner achieves best scores on most datasets with 1933 tokens vs 19567 for full content (~10x reduction). Latency claim (10.8s vs 40.6s for full content) is also supported."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims are made via ablation studies (Table 3): 'Removing any step results in significant performance degradation.' The ablation design controls single-variable manipulation by removing one component at a time, which is adequate for these causal claims."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The abstract claims 'practical insights for real-world long-text RAG applications' but evaluation is limited to Wikipedia-based QA benchmarks. While the Limitations section acknowledges the approach 'relies entirely on general-domain Wikipedia corpus' and cannot 'directly transfer to vertical domains,' the title and abstract frame results more broadly than the tested setting."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations for the observed improvements. No consideration of confounds such as whether gains come from the specific scoring model, the training data quality, or other factors beyond hierarchical structuring."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures QA Accuracy and F1 and reports them as such. Claims are framed in terms of QA performance, token compression ratio, and latency — matching the actual measurements without inflating to broader constructs."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Specific model versions are stated: Llama3.1-8B-Instruct as generator, Qwen2.5-3B-Instruct as refiner backbone, Llama3.1-70B-Instruct for annotation, bge-reranker-v2-m3 for scoring. These are versioned open-source models with identifiable weight releases."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Prompts A, B, C.1, and C.2 are provided in the appendix but contain unresolved {demonstrations} placeholders. The actual few-shot demonstration examples are not provided anywhere in the paper, so the full prompts cannot be reconstructed."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A.3 reports learning rate (3e-5), batch size (1), gradient accumulation (8), warmup ratio (0.1), bf16 precision, LoRA method, max sequence lengths (2k/32k/4k), 1 epoch training. Inference uses temperature 0 and max output tokens 500."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "The paper does not use agentic scaffolding. LongRefiner is a multi-step processing pipeline (query analysis → document structuring → adaptive refinement) without agent loops, retry logic, or tool use."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 3.2 describes Wikipedia-based label collection: collecting webpage data, removing images/links/references, extracting structural information. Appendix A.3 describes using first 10,000 samples from each dataset's training set, label generation with Llama3.1-70B-Instruct."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 'Limitations' provides substantive discussion of two specific limitations: lack of support for diverse data types (tables, images, hyperlinks) and reliance on general-domain Wikipedia corpus."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The limitations section discusses specific threats: (1) the method handles only plain text, not tables/images/hyperlinks found in real documents, and (2) the Wikipedia training corpus makes transfer to vertical domains (enterprise, finance) challenging. These are specific to this study."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper explicitly states what was not tested: diverse data types and vertical domains like enterprise or finance. It acknowledges the approach 'relies entirely on general-domain Wikipedia corpus, making it challenging to directly transfer to vertical domains.'"
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "All evaluation datasets are publicly available standard benchmarks (NQ, TriviaQA, HotpotQA, etc.). The corpus is the publicly available Wikipedia 2018 dump from KILT. Code repository is provided."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The paper describes using FlashRAG's data collection, retrieving top-8 full documents from Wikipedia Dump 2018 per query following the LongRAG approach, and constructing training labels using Llama3.1-70B-Instruct with specified prompts."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard public benchmarks and Wikipedia dump."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline is documented: Wikipedia webpages → remove irrelevant info → extract structure → create (D, Dxml) pairs for document structuring. Query labels generated via Llama3.1-70B with specified prompts. Training uses first 10,000 samples per dataset, merged into final training set."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding acknowledgment or grant information is provided anywhere in the paper."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly stated: Renmin University of China (Gaoling School of Artificial Intelligence) and Huawei Poisson Lab."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Funding is not disclosed. Huawei co-authors are involved, and Huawei as a technology company has potential interest in RAG system improvements, but no funding relationship is disclosed for assessment."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement or financial interest declarations are present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper uses Llama3.1-8B-Instruct as generator and Qwen2.5-3B-Instruct as refiner but does not state the training data cutoff date for either model."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether test examples from the QA benchmarks (NQ 2019, TriviaQA 2017, etc.) appeared in the training data of Llama3.1 or Qwen2.5."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "All seven benchmarks were published between 2017 and 2022, well before Llama3.1 and Qwen2.5 were trained. The paper does not discuss whether these models may have seen benchmark answers during pre-training, which could affect RAG evaluation results."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Table 2 reports both token counts (1933 tokens) and latency (10.8s) for LongRefiner and all baselines. The paper extensively discusses efficiency as a key contribution."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper states '4 NVIDIA A800 GPUs' and '1 epoch' training per task, but does not quantify total GPU hours, training time, or compute cost."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No results across multiple random seeds are reported. While inference uses greedy decoding (temp=0), LoRA training involves randomness that is not addressed."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is not explicitly stated. The paper mentions 'temperature set to 0 for greedy decoding to eliminate randomness' for inference, but does not state whether training was conducted once or multiple times."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search budget is reported. Final hyperparameters are listed (learning rate 3e-5, batch size 1, gradient accumulation 8, etc.) but how they were selected is not described."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No description of how the final model configuration was selected. The k parameter for XML truncation is described as a hyperparameter but its selection is not justified."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No statistical tests are performed at all, let alone correction for multiple comparisons. The paper makes many comparisons across 7 datasets and 9+ baselines without any statistical testing."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors evaluate their own system against baselines without acknowledging self-comparison bias. Baselines are implemented using FlashRAG and official implementations, but there is no discussion of whether author-run evaluation could favor their method."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Figure 1 plots latency vs. performance across methods. Table 2 reports tokens and latency alongside performance. Figure 3 shows performance as a function of model size and training data volume. All methods are constrained to a 2k token budget for fair comparison."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper uses seven standard QA benchmarks without discussing whether QA accuracy/F1 adequately measures the claimed 'refinement quality' or whether these benchmarks represent real-world RAG scenarios."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "All methods use the same generator (Llama3.1-8B-Instruct) and the same retriever (top-8 documents). The same generation prompts are used across methods. The only variable is the refinement approach, which is the object of study."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Benchmarks (NQ 2019, TriviaQA 2017, HotpotQA 2018, etc.) predate the training data of Llama3.1 and Qwen2.5 by years. No discussion of whether models may have memorized answers from pre-training on these widely-used benchmarks."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks answer information. The refiner is trained on the same benchmark training sets used for evaluation, though test splits are separate."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of independence between training and test data. The refiner's training labels are generated using data from the same benchmark distributions as the test sets."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention methods are employed. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LongRefiner achieves best or near-best performance across seven QA datasets while maintaining low latency (10.8s) and using ~10x fewer tokens than full content.",
    375       "evidence": "Table 2 shows LongRefiner scores highest on 6/7 datasets (NQ: 54.4 Acc, TriviaQA: 71.7 Acc, HotpotQA: 39.3 Acc, ASQA: 35.8 F1) using 1933 tokens vs 19567 full content, with 10.8s latency vs 40.6s.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Hierarchical document structuring is the most critical component, with removal causing nearly 20% degradation.",
    380       "evidence": "Table 3 ablation: removing document structuring drops single-hop from 62.3 to 45.7 EM (~27% relative drop). Multi-hop drops from 37.4 to 29.9 Acc, long-form from 30.2 to 27.1 F1.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "LongRefiner surpasses perplexity-based methods by more than 9% while maintaining latency comparable to retrieval-based approaches.",
    385       "evidence": "Table 2: LongRefiner NQ Acc 54.4 vs LongLLMLingua 45.4 (best perplexity-based), ~9pp gain. Latency 10.8s vs 3.6-8.6s for retrieval-based methods.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Performance improves with increased model parameters and training data, with diminishing returns.",
    390       "evidence": "Figure 3 shows recall and accuracy increase from 0.5B to 7B model sizes and from 1000 to 10000 training examples, with curves flattening at larger scales.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "LongRefiner consistently outperforms LongLLMLingua across almost all document lengths.",
    395       "evidence": "Figure 4 shows LongRefiner outperforms the best baseline at all document lengths above ~10k tokens, with growing advantage at longer documents.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "The method generalizes to different generator models (Qwen2.5-7B-Instruct).",
    400       "evidence": "Table 5 (Appendix B) shows LongRefiner outperforms all baselines on Qwen2.5-7B-Instruct across all datasets.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or uncertainty quantification",
    407       "detail": "All results across Tables 2-5 are single point estimates with no confidence intervals, standard deviations, or significance tests. With no variance reporting, it is impossible to assess whether performance differences are meaningful or within noise."
    408     },
    409     {
    410       "flag": "Benchmark contamination not addressed",
    411       "detail": "All seven QA benchmarks (2017-2022) predate the training of Llama3.1 and Qwen2.5 (2024). These models may have memorized benchmark answers during pre-training, which could confound the evaluation even in a RAG setting, as the generator could rely on parametric knowledge rather than retrieved content."
    412     },
    413     {
    414       "flag": "Misleading efficiency comparison framing",
    415       "detail": "The abstract claims '10x fewer computational costs and latency compared to the best baseline' which conflates token reduction (vs full content) with latency reduction (vs LongLLMLingua). Against retrieval-based methods, LongRefiner's latency (10.8s) is actually higher than BM25 (3.6s). The '10x' framing cherry-picks the most favorable comparison point."
    416     },
    417     {
    418       "flag": "Wikipedia-only evaluation presented as general RAG solution",
    419       "detail": "All experiments use Wikipedia-based QA datasets, but the paper frames LongRefiner as providing 'practical insights for real-world long-text RAG applications.' The limitations section acknowledges domain transfer challenges, but the main claims are broader than the evidence."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    425       "authors": ["Patrick S. H. Lewis", "Ethan Perez", "Aleksandra Piktus"],
    426       "year": 2020,
    427       "relevance": "Foundational RAG paper that establishes the paradigm of combining retrieval with generation for knowledge-intensive tasks."
    428     },
    429     {
    430       "title": "LLMLingua: Compressing prompts for accelerated inference of large language models",
    431       "authors": ["Huiqiang Jiang", "Qianhui Wu", "Chin-Yew Lin"],
    432       "year": 2023,
    433       "relevance": "Key baseline: perplexity-based prompt compression method for reducing LLM inference costs."
    434     },
    435     {
    436       "title": "LongLLMLingua: Accelerating and enhancing LLMs in long context scenarios via prompt compression",
    437       "authors": ["Huiqiang Jiang", "Qianhui Wu", "Xufang Luo"],
    438       "year": 2023,
    439       "relevance": "Primary baseline: extends LLMLingua to long-context scenarios, the strongest competitor in experiments."
    440     },
    441     {
    442       "title": "LLMLingua-2: Data distillation for efficient and faithful task-agnostic prompt compression",
    443       "authors": ["Zhuoshi Pan", "Qianhui Wu", "Huiqiang Jiang"],
    444       "year": 2024,
    445       "relevance": "Key baseline: improved prompt compression via data distillation, evaluated in the main experiments."
    446     },
    447     {
    448       "title": "RECOMP: Improving retrieval-augmented LMs with compression and selective augmentation",
    449       "authors": ["Fangyuan Xu", "Weijia Shi", "Eunsol Choi"],
    450       "year": 2023,
    451       "arxiv_id": "2310.04408",
    452       "relevance": "Retrieval compression baseline that selectively augments LM inputs from retrieved content."
    453     },
    454     {
    455       "title": "FlashRAG: A modular toolkit for efficient retrieval-augmented generation research",
    456       "authors": ["Jiajie Jin", "Yutao Zhu", "Xinyu Yang"],
    457       "year": 2024,
    458       "arxiv_id": "2405.13576",
    459       "relevance": "RAG evaluation toolkit used for implementing baselines in this paper."
    460     },
    461     {
    462       "title": "BIDER: Bridging knowledge inconsistency for efficient retrieval-augmented LLMs via key supporting evidence",
    463       "authors": ["Jiajie Jin", "Yutao Zhu", "Yujia Zhou"],
    464       "year": 2024,
    465       "relevance": "Addresses knowledge inconsistency in RAG by identifying key supporting evidence from retrieved documents."
    466     },
    467     {
    468       "title": "LongRAG: Enhancing retrieval-augmented generation with long-context LLMs",
    469       "authors": ["Ziyan Jiang", "Xueguang Ma", "Wenhu Chen"],
    470       "year": 2024,
    471       "arxiv_id": "2406.15319",
    472       "relevance": "Long-context RAG approach whose MaxP document retrieval design is adopted in this paper's experimental setup."
    473     },
    474     {
    475       "title": "Compressing context to enhance inference efficiency of large language models",
    476       "authors": ["Yucheng Li", "Bo Dong", "Frank Guerin"],
    477       "year": 2023,
    478       "relevance": "Selective-Context method used as a perplexity-based baseline for prompt compression."
    479     },
    480     {
    481       "title": "xRAG: Extreme context compression for retrieval-augmented generation with one token",
    482       "authors": ["Xin Cheng", "Xun Wang", "Xingxing Zhang"],
    483       "year": 2024,
    484       "arxiv_id": "2405.13792",
    485       "relevance": "Soft prompt compression approach for RAG that encodes documents into vector representations."
    486     },
    487     {
    488       "title": "LoRA: Low-rank adaptation of large language models",
    489       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    490       "year": 2022,
    491       "relevance": "Parameter-efficient fine-tuning method used as the core training strategy for LongRefiner's multi-task learning."
    492     },
    493     {
    494       "title": "Search-o1: Agentic search-enhanced large reasoning models",
    495       "authors": ["Xiaoxi Li", "Guanting Dong", "Jiajie Jin"],
    496       "year": 2025,
    497       "arxiv_id": "2501.05366",
    498       "relevance": "Agentic search system that enhances LLM reasoning with retrieval, relevant to the agentic AI research scope."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "RAG is widely deployed and long-context refinement is a practical bottleneck; the plug-and-play design and code release make this usable, though domain limitations reduce immediate applicability."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that document structure helps refinement is intuitive rather than surprising; the main contribution is engineering a working system around this idea."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No safety or security concerns raised; this is a performance optimization for RAG systems."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or conflict; straightforward system paper with standard empirical evaluation."
    517     },
    518     "demo_ability": {
    519       "score": 2,
    520       "justification": "Code is released on GitHub and the system uses open-source models (Qwen2.5-3B, Llama3.1-8B), making it reproducible with appropriate GPU resources."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Renmin University has moderate NLP reputation; Huawei is a known company but not a top-tier AI lab brand for this audience."
    525     }
    526   }
    527 }

Impressum · Datenschutz