ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26730B)


      1 {
      2   "paper": {
      3     "title": "Towards a Robust Retrieval-Based Summarization System",
      4     "authors": [
      5       "Shengjie Liu",
      6       "Jing Wu",
      7       "Jingyuan Bao",
      8       "Wenyi Wang",
      9       "Naira Hovakimyan",
     10       "Christopher G Healey"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2403.19889",
     15     "doi": "10.48550/arXiv.2403.19889"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "The paper introduces LogicSumm, a 7-scenario evaluation framework for RAG-based summarization that reveals LLMs struggle with document relevance assessment, especially when irrelevant documents are present. SummRAG fine-tunes Mistral-7B Instruct on synthetically generated dialogues using LoRA, achieving high logical accuracy (≥0.79 across all scenarios) while matching GPT-3.5 Turbo's summarization quality. In multi-document settings (k=5,8,10), SummRAG maintains stable performance as irrelevant documents increase, unlike Stuff, Map-Reduce, and Refine frameworks which degrade.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "GitHub link provided in footnote 3: https://github.com/ncsulsj/Robust_Sumsystem. HuggingFace links for model weights and dataset also provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Dataset released on HuggingFace: https://huggingface.co/datasets/zycjlsj123/ragsummdata, as stated in footnote 3."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found in the paper. No library versions specified."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions provided in the paper. While code and data links are given, there is no README description or 'Reproducing Results' section."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All tables (Tables 2, 3, 4) report only point estimates (e.g., 0.96, 0.91) with no confidence intervals or error bars."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are used. Claims that SummRAG outperforms baselines are based solely on comparing numbers in tables without any test."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Tables 2-4 provide absolute performance numbers for all baselines and SummRAG across all scenarios, allowing direct comparison of magnitudes. For example, SummRAG at 1.0 vs Mistral-7B Chat at 0.29 for Scenario 1 logical accuracy."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Test set sizes are stated per scenario (57, 48, 50, 36, 50, 43, 98 samples) but no justification for these sizes is given, and no power analysis is discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures reported anywhere. All results appear to be single-run numbers."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Multiple baselines compared: GPT-3.5 Turbo, Claude 2, Jurassic, LLaMA2-13B Chat, Mistral-7B with various prompting strategies (Table 2), and Stuff/Map-Reduce/Refine frameworks (Table 4)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include GPT-3.5 Turbo, Claude 2, Mistral-7B (all 2023-2024 models), which were contemporary at time of writing."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No ablation study showing which components of SummRAG (special tokens, dialogue format, system prefixes, LoRA) contribute to the improvement. The system has multiple components but none are ablated."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics used: logical accuracy (GPT-4 assessed), BertScore (precision, recall, F1), and Rouge 1/2/L."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation included. Logical correctness is assessed by GPT-4 Turbo, and summarization quality is measured by automated metrics (BertScore, Rouge). The paper makes claims about summarization quality that would benefit from human assessment."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 3.1 states: 'We follow a procedure to generate test data similar to the method employed to create training data.' Test data is separately generated from training data."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down per scenario in Tables 2 and 3, and per document count (k=5,8,10) in Table 4."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5.3 provides a supporting analysis with specific examples of failure cases (stock market query example) and discusses distribution shift from original to fine-tuned model. Figure 2 also illustrates specific limitations."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports that Chain of Thought prompting fails in Scenario 6 for information conflicts, and that Mistral-7B with explicit logical instructions shows poor performance in Scenarios 1 and 5 (Table 2). SummRAG's Scenario 6 performance (0.79) is also acknowledged as the weakest."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims of 'improved logical coherence and summarization quality' are supported by Tables 2-4 showing SummRAG's high logical accuracy and competitive summarization scores."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper claims SummRAG 'enhances robustness' and 'improves' performance (causal language), but the intervention (fine-tuning) changes multiple variables simultaneously (dialogue format, special tokens, system prefixes, LoRA) without ablation. The comparison against zero-shot baselines also conflates the effect of additional training data with the specific framework design."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Robust Retrieval-Based Summarization System' without qualification, but results are limited to CNN Daily Mail and XSum datasets, Mistral-7B as the fine-tuned model, and GPT-4 Turbo as gold standard. No bounding to these specific settings."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No discussion of alternative explanations. For example: Could the improvements simply be due to additional training data exposure rather than the specific dialogue format? Could the synthetic test data generation process favor SummRAG's training distribution?"
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper uses GPT-4 Turbo to assess 'logical correctness' and GPT-4 outputs as gold standard summaries, but does not discuss whether GPT-4's judgment is a valid proxy for actual logical correctness, or whether GPT-4 outputs are valid gold standards for summarization quality."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Models are referred to as 'GPT-3.5 Turbo', 'GPT-4 Turbo', 'Mistral-7B Instruct', 'Claude 2', 'LLaMa2-13B' without specific version numbers or snapshot dates. Per schema rules, 'GPT-3.5 Turbo' without a version like 'gpt-3.5-turbo-0613' is insufficient."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt text is provided in Appendices A.1-A.4, including evaluation prompts, dialogue generation prompts, and benchmarking prompts for all scenarios with actual prompt templates and one-shot examples."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No hyperparameters reported for any model: no temperature, top-p, max tokens for API calls, and no learning rate, rank, or alpha for LoRA fine-tuning."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The system is a RAG pipeline with a retriever and generator, not an agent with tool use, retry logic, or feedback mechanisms."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Sections 4.1-4.3 describe the dialogue generation process in detail, including how documents are selected from CNN Daily Mail and XSum, how scenarios are constructed, how special tokens are converted to natural language (Table in Appendix A.5), and how training data is formatted."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6 'Conclusion, Limitation, and Future Work' contains substantive discussion of limitations."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6 discusses specific threats: 'SummRAG's performance is linked to the scenarios in LogicSumm, which may not encompass all possible real-life situations' and 'the efficacy of our approach is influenced by the quality of the prompts used during dialogue generation.' These are specific to this study's design."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show or what settings/populations are excluded. The limitations acknowledge incomplete scenarios but don't specify untested domains, languages, or tasks."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Dataset released on HuggingFace (https://huggingface.co/datasets/zycjlsj123/ragsummdata) for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Sections 4.1-4.3 describe in detail how dialogues were generated using GPT-4 Turbo with specific prompts, how documents were selected from CNN Daily Mail and XSum, and how each scenario was constructed."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data is synthetically generated from standard datasets (CNN Daily Mail, XSum)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "While the dialogue generation process is described, the full pipeline from raw data to final training set is incomplete: no training data counts are provided, no filtering criteria or rejection rates are stated, and no information on how many dialogues were generated vs. kept."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No acknowledgments section or funding disclosure found anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All author affiliations are clearly listed: North Carolina State University, University of Illinois Urbana-Champaign, and Northwestern University."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding information disclosed, so independence cannot be assessed. The absence of disclosure is itself a concern."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement found in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates stated for any of the models used (GPT-3.5, GPT-4, Claude 2, Mistral-7B, LLaMA-2), despite evaluating them on tasks derived from CNN Daily Mail and XSum."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether CNN Daily Mail or XSum data (used to generate test scenarios) appeared in the training data of the evaluated LLMs. These are widely-used datasets highly likely to be in training corpora."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "CNN Daily Mail and XSum were published well before the training cutoffs of GPT-3.5/4, Claude 2, and Mistral-7B. The paper does not discuss the contamination risk of using these well-known datasets."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost or latency reported despite the system requiring multiple LLM API calls (GPT-4 Turbo for dialogue generation and evaluation, plus model fine-tuning)."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No computational budget stated: no GPU hours for LoRA fine-tuning, no API costs for GPT-4 Turbo dialogue generation, no training time reported."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No results reported across multiple random seeds. All results appear to be single-run numbers."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "LoRA fine-tuning involves tunable hyperparameters (e.g., rank, alpha, learning rate), but no search budget, search method, or number of configurations tried is reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No explanation of how the final model configuration was selected. No mention of validation-based selection or comparison of multiple configurations."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper compares 7+ methods across 7 scenarios (many comparisons) but performs no statistical tests at all, let alone corrections for multiple comparisons."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own system (SummRAG) against baselines without acknowledging the systematic bias of author-evaluation. Baselines are run by the authors in possibly non-optimal configurations."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "SummRAG requires additional fine-tuning compute compared to zero-shot baselines, but this compute difference is not discussed or accounted for in the comparison."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper does not discuss whether LogicSumm's 7 scenarios actually measure what is claimed (real-world RAG summarization robustness). No analysis of construct validity or comparison with alternative evaluation approaches."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No agentic scaffolding is involved. The evaluation compares models and frameworks directly."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "Not discussed. The CNN Daily Mail and XSum data used to construct test scenarios predate all evaluated models' training cutoffs, creating temporal leakage risk."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Not discussed. The evaluation setup could leak information because test scenarios are generated using procedures similar to those used for the training data."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Not discussed. Training and test data are generated using similar procedures from the same source datasets (CNN Daily Mail, XSum), potentially creating non-independence."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method applied. No canary strings, membership inference, or temporal splits used."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "General-purpose LLMs lack robustness in RAG-based summarization, particularly when retrieved documents are irrelevant to the query.",
    372       "evidence": "Table 2 shows Mistral-7B Chat with explicit logical instructions achieves only 0.29 accuracy in Scenario 1 (irrelevant retrieval), and Figure 2 illustrates specific failure cases where models follow irrelevant retrieved text instead of recognizing its irrelevance.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "SummRAG achieves consistently high logical accuracy across all seven LogicSumm scenarios.",
    377       "evidence": "Table 2 shows SummRAG achieves 1.0 on Scenarios 1, 2, 5; 0.97 on Scenario 4; 0.79 on Scenario 6; and 0.86 on Scenario 7. These are equal to or better than all baselines across most scenarios.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "SummRAG maintains summarization quality comparable to GPT-3.5 Turbo while enhancing robustness.",
    382       "evidence": "Table 3 shows SummRAG achieves BertScore F1 of 0.91 and Rouge-1 of 0.48 in Scenario 2, matching GPT-3.5 Turbo's 0.90 F1 and 0.48 Rouge-1.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "SummRAG shows resilience against irrelevant documents in multi-document settings, unlike other RAG summarization frameworks.",
    387       "evidence": "Table 4 shows SummRAG maintains BertScore F1 of 0.87-0.88 across k=5,8,10, while Stuff drops from 0.87 to 0.85, Map-Reduce from 0.86 to 0.84, and Refine from 0.85 to 0.83.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Prompting strategies for Mistral-7B show inconsistent performance across different summarization scenarios.",
    392       "evidence": "Table 2 shows that explicit logical instructions yield 0.29 accuracy in Scenario 1 but 1.0 in Scenarios 2 and 4; zero-shot CoT gives 0.88 in Scenario 1 but fails in Scenario 6; one-shot CoT gives 1.0 in Scenario 1 but 0.19 in Scenario 2.",
    393       "supported": "strong"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "GPT-4 as both gold standard and evaluator",
    399       "detail": "GPT-4 Turbo outputs serve as gold standard summaries for quality evaluation, and GPT-4 Turbo also assesses logical correctness. This creates circular evaluation — a model trained on GPT-4-generated dialogues is evaluated against GPT-4's judgments, biasing toward GPT-4-like outputs."
    400     },
    401     {
    402       "flag": "No error bars or uncertainty quantification",
    403       "detail": "All results across Tables 2-4 are point estimates with no confidence intervals, error bars, or variance from multiple runs. With small test sets (36-98 per scenario), individual results could vary significantly."
    404     },
    405     {
    406       "flag": "Small synthetic test sets",
    407       "detail": "Test sets are small (36-98 samples per scenario) and synthetically generated using the same process as training data. This raises concerns about whether results generalize beyond the synthetic distribution."
    408     },
    409     {
    410       "flag": "Missing ablation study",
    411       "detail": "SummRAG has multiple components (special tokens, dialogue format, system prefixes, LoRA fine-tuning, aspect-specific training) but no ablation study isolates which components contribute to improvement."
    412     },
    413     {
    414       "flag": "Contamination risk from well-known datasets",
    415       "detail": "Test scenarios are constructed from CNN Daily Mail and XSum documents, which are likely in the training data of all evaluated LLMs (GPT-3.5/4, Claude 2, Mistral, LLaMA). This confounds the evaluation of 'robustness' since models may have memorized these documents."
    416     },
    417     {
    418       "flag": "Unfair baseline comparison",
    419       "detail": "SummRAG (fine-tuned Mistral-7B) is compared against zero-shot or few-shot baselines. The fine-tuned model has an inherent advantage from task-specific training data. Fairer comparisons would fine-tune baselines on similar amounts of data."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Self-RAG: Learning to retrieve, generate, and critique through self-reflection",
    425       "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang", "Avirup Sil", "Hannaneh Hajishirzi"],
    426       "year": 2023,
    427       "arxiv_id": "2310.11511",
    428       "relevance": "Self-RAG is the direct predecessor to SummRAG; the paper modifies and extends Self-RAG for summarization tasks."
    429     },
    430     {
    431       "title": "Benchmarking large language models in retrieval-augmented generation",
    432       "authors": ["Jiawei Chen", "Hongyu Lin", "Xianpei Han", "Le Sun"],
    433       "year": 2023,
    434       "arxiv_id": "2309.01431",
    435       "relevance": "Evaluates LLM robustness in RAG settings, directly related to evaluation methodology for RAG systems."
    436     },
    437     {
    438       "title": "LoRA: Low-rank adaptation of large language models",
    439       "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis", "Zeyuan Allen-Zhu"],
    440       "year": 2021,
    441       "arxiv_id": "2106.09685",
    442       "relevance": "Core fine-tuning technique used in SummRAG for parameter-efficient model adaptation."
    443     },
    444     {
    445       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    446       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    447       "year": 2020,
    448       "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm."
    449     },
    450     {
    451       "title": "Chain-of-Thought prompting elicits reasoning in large language models",
    452       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    453       "year": 2022,
    454       "relevance": "Prompting technique used as baseline in the evaluation of LLM robustness for summarization."
    455     },
    456     {
    457       "title": "Mistral 7B",
    458       "authors": ["Albert Q Jiang", "Alexandre Sablayrolles", "Arthur Mensch"],
    459       "year": 2023,
    460       "arxiv_id": "2310.06825",
    461       "relevance": "Base model fine-tuned in SummRAG; represents the class of open-source LLMs for code/NLP tasks."
    462     },
    463     {
    464       "title": "LLaMA: Open and efficient foundation language models",
    465       "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"],
    466       "year": 2023,
    467       "arxiv_id": "2302.13971",
    468       "relevance": "Open-source LLM used as baseline, relevant to understanding LLM capability evaluation."
    469     },
    470     {
    471       "title": "Tug-of-war between knowledge: Exploring and resolving knowledge conflicts in retrieval-augmented language models",
    472       "authors": ["Zhuoran Jin", "Pengfei Cao", "Yubo Chen"],
    473       "year": 2024,
    474       "arxiv_id": "2402.14409",
    475       "relevance": "Addresses knowledge conflicts in RAG systems, a core challenge also tackled in this paper's Scenario 6."
    476     },
    477     {
    478       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    479       "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"],
    480       "year": 2023,
    481       "arxiv_id": "2307.16789",
    482       "relevance": "Provides the function-calling token methodology adopted in SummRAG's dialogue generation."
    483     },
    484     {
    485       "title": "Benchmarking large language models for news summarization",
    486       "authors": ["Tianyi Zhang", "Faisal Ladhak", "Esin Durmus", "Percy Liang"],
    487       "year": 2024,
    488       "relevance": "Comprehensive LLM summarization benchmark directly relevant to evaluation methodology."
    489     }
    490   ]
    491 }

Impressum · Datenschutz