scan.json (29279B)
1 { 2 "paper": { 3 "title": "Retrieval-augmented generation in multilingual settings", 4 "authors": [ 5 "Nadezhda Chirkova", 6 "David Rau", 7 "Hervé Déjean", 8 "Thibault Formal", 9 "Stéphane Clinchant", 10 "Vassilina Nikoulina" 11 ], 12 "year": 2024, 13 "venue": "KNOWLLM", 14 "arxiv_id": "2407.01463", 15 "doi": "10.48550/arXiv.2407.01463" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "RAG improves performance across all 13 languages tested, but a gap remains between English and non-English. Translating system prompts into user languages and explicitly instructing the model to respond in the user language is critical for achieving high correct language rates. Multilingual-by-design generators (Command-R-35B) consistently outperform English-centric models for non-English, while BGE-m3 enables reliable cross-lingual retrieval. Code-switching in non-Latin alphabet languages and transliteration variability remain key limitations requiring evaluation metric adjustments.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "Code released at https://github.com/naver/bergen with multilingual-specific documentation at https://github.com/naver/bergen/blob/main/documentations/multilingual.md, stated in abstract and footnote 1." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "All datasets used are publicly available: MKQA, XOR-TyDi QA, TyDi QA, and Wikipedia (with HuggingFace links provided in footnotes 10 and 11). No proprietary data was collected." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions int4 quantization and model names with HuggingFace links, but provides no requirements.txt, Dockerfile, or detailed environment/dependency specifications." 
37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper releases code at the BERGEN library with dedicated multilingual documentation. Section 4 describes experimental details (retrieval setup, generation parameters, evaluation procedure) sufficient for reproduction via the released code." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All tables (1, 5, 6, 7) report point estimates only (e.g., '58.4', '70.2'). No confidence intervals, error bars, or ± notation anywhere in the paper." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "Claims of performance differences between models, prompts, and retrieval options are made throughout (e.g., 'Command-R-35B is the only model which consistently achieves high CLR') but no statistical significance tests are applied." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "Tables show raw performance numbers for different conditions (e.g., no retrieval vs. retrieval in Table 1), but the paper never explicitly computes or reports effect sizes, relative improvements, or Cohen's d. The reader must compute differences from the tables." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "They use 2.7K MKQA samples and 3K XOR-TyDi QA validation examples without justifying these sample sizes. The subset selection from MKQA is explained (overlap with KILT NQ) but its adequacy for statistical conclusions is not discussed." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Single-run results with greedy decoding throughout. No variance, standard deviation, or spread measures reported. Section 4: 'We use greedy decoding' — one deterministic run per configuration." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Multiple baselines: no-retrieval baseline (Table 1), different retrieval sources (English, user language, multilingual), default vs. advanced prompts (Table 5), English-centric vs. multilingual generators (Table 6), SPLADE+QT vs. BGE-m3 (Table 7), and oracle retrieval." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "All models are contemporary (2023-2024): Command-R-35B, Mixtral-8x7B, SOLAR-10.7B, LLaMA-2-7B-chat, BGE-m3. These represent state-of-the-art at time of writing." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Systematic ablation across all pipeline components: retrieval language (Table 1), prompt formulation (Table 5), generator model (Table 6), and retrieval system (Table 7). Each table isolates one variable while controlling others." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Two metrics used: character 3-gram recall (task accuracy) and correct language rate (CLR, measuring whether the model responds in the user language). Both reported throughout Tables 5 and 6." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Manual inspection of 50 predictions each for Russian, Chinese, and French (Table 8). Authors categorize errors into system performance characteristics and data characteristics, identifying specific failure types like wrong retrieval, code-switching, and transliteration issues." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Evaluation uses standard held-out datasets: MKQA (derived from NQ), XOR-TyDi QA validation split, and TyDi QA. These are established evaluation sets separate from any model training." 
101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Extensive per-language breakdowns across 13 languages in Table 1, per-prompt and per-model breakdowns in Tables 5-6, and per-error-type breakdown in Table 8. Results are never reported as only aggregates." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Table 8 provides detailed manual error analysis: wrong retrieval, wrong response with correct retrieval, code-switching, transliteration issues, fluency errors, and data quality problems. Section 5 discusses these systematically." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Multiple negative findings reported: English-centric models frequently reply in English for non-English queries (Table 5), Mixtral only works well for its pretrained languages (Table 6), Command-R's English recall is lower than SOLAR's due to 'curse of multilinguality' (Section 5), and code-switching remains unsolved." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims are well-supported: 'task-specific prompt engineering is needed' (Table 5), 'evaluation metrics need adjustments' (Table 3 and Section 3 evaluation discussion), 'frequent code-switching in non-Latin alphabet languages' (Table 8), all directly supported by experimental results." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Claims are primarily comparative ('translating prompts improves CLR', 'retrieval from multilingual Wikipedia is beneficial'). The ablation design with controlled single-variable manipulation across Tables 5-7 provides adequate evidence for these causal claims." 
128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Claims are scoped to the tested setting: 13 specific languages, MKQA and XOR-TyDi QA datasets, zero-shot setting, open QA task with Wikipedia. The Limitations section explicitly states these boundaries and calls for extending to other tasks and domains." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper offers brief mentions (SOLAR's multilingual ability 'attributed to accidental multilingual data in pretraining', Command-R's lower English scores due to 'curse of multilinguality') but these are passing observations, not substantive discussion of alternative explanations for the main findings." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures character 3-gram recall and correct language rate, and frames claims at the same granularity as these measurements. No broader framing beyond what the metrics measure (e.g., they don't claim to measure 'RAG quality' generally, just QA accuracy and language correctness on specific benchmarks)." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific model identifiers provided: 'LLaMA-2-7B-chat', 'SOLAR-10.7B', 'Mixtral-8x7B', 'Command-R-35B' with HuggingFace links for Command-R (CohereForAI/c4ai-command-r-v01), BGE-m3, and NLLB-600M. Sufficient to identify exact model versions." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Table 2 provides the full text of all six system prompt variants used. The {UL} placeholder is clearly defined as the user language name, and all test languages are enumerated, so prompts can be fully reconstructed." 
155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 4 reports: greedy decoding, maximum 128 new tokens, int4 quantization, top-50 retrieval then top-5 reranking, 100-word passage chunks (100 Unicode characters for CJK/Thai)." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The pipeline is a straightforward retrieve-then-generate architecture without agents, tool use, or iterative reasoning." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4 documents preprocessing: Wikipedia split into 100-word chunks with article title prepended (100 Unicode characters for non-whitespace-separated languages), MKQA subset selected via overlap with KILT NQ, evaluation preprocessing includes lower-casing, punctuation removal, and article removal." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Dedicated 'Limitations' section after Section 6, discussing scope restrictions (QA task, Wikipedia datastore), excluded components (query reformulation, context post-processing), and single retriever choice." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific threats discussed: only one retriever/reranker model used (BGE-m3), BGE-m3 is in-domain since it was trained on Wikipedia data, evaluation limited to QA task, prompt translations may have quality issues (footnote 7), and evaluation data has ambiguous/incorrect labels (Table 8)." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Limitations section explicitly states what was NOT tested: other tasks beyond QA, other domains beyond Wikipedia, query reformulation, context post-processing (filtering irrelevant passages). 
Also Section 6 lists specific future directions as implicit scope boundaries." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "All evaluation data is from publicly available datasets with provided links: MKQA, XOR-TyDi QA, TyDi QA, Wikipedia (via HuggingFace). Code is released for reproducing results." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Dataset sources described: MKQA from NQ (10K examples translated to 25 languages, 2.7K subset selected), XOR-TyDi QA (40K questions in 7 languages, 3K validation), Wikipedia passage construction documented. Table 4 provides detailed statistics." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. All data comes from standard public benchmarks (MKQA, XOR-TyDi QA, Wikipedia)." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Pipeline documented in Section 4: Wikipedia articles split into 100-word passages with title prepended, MKQA subset filtered via KILT NQ overlap (recovering relevant document information), retrieval of top-50 passages then reranking to top-5, evaluation with preprocessed character 3-gram matching." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding source disclosed. The Acknowledgments section thanks named individuals but mentions no grants, funding agencies, or corporate sponsors. All authors are from NAVER LABS Europe (corporate research lab)." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All authors are listed as affiliated with NAVER LABS Europe in the paper header. They evaluate third-party models (Command-R by Cohere, BGE-m3 by BAAI, LLaMA by Meta, etc.) 
rather than NAVER products." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "No funding is explicitly disclosed. As NAVER LABS employees, their research is implicitly funded by NAVER, a tech company with search products that could benefit from RAG advances. Independence cannot be assessed without disclosure." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests statement, no patent disclosures, and no financial interest declarations anywhere in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates stated for any of the four generator models (LLaMA-2, SOLAR, Mixtral, Command-R) or the retriever (BGE-m3). Languages in pretraining are mentioned for some models, but not temporal cutoffs." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether MKQA or XOR-TyDi QA examples appeared in the training data of any evaluated model. MKQA derives from Natural Questions (2019), and all models were trained after this date." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "MKQA is derived from Natural Questions (published 2019) and uses Wikipedia, both of which are highly likely in the training data of all evaluated models. This contamination risk is not discussed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in the study. Manual error analysis in Table 8 is performed by the researchers, not a human subjects study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. 
The study evaluates models on public benchmarks." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference cost, latency, or per-query cost reported despite running 4 generators across 13 languages on thousands of examples with retrieval and reranking." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No GPU hours, hardware specifications, or total computational budget stated. Only int4 quantization is mentioned without stating what hardware was used." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "Greedy decoding makes generation deterministic, but this is not explicitly discussed as a design choice for reproducibility. No seed sensitivity analysis is reported for any component." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never explicitly stated. Results appear to be from single runs with greedy decoding, but this is not confirmed." 
309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "The paper tests multiple prompt formulations and configurations but does not describe these as a hyperparameter search and does not report the total configurations tried or compute spent on exploration." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "Section 5 ('Best performing configuration to be used as a strong baseline') explicitly identifies the best configuration (Command-R-35B, BGE-m3, 'Reply short in UL (UL)' prompt, multilingual Wikipedia retrieval) and justifies each choice based on the systematic comparisons in Tables 1, 5-7." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed at all. Since no p-values are computed, multiple comparison correction is not applicable." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors build and evaluate their own mRAG pipeline but do not acknowledge the bias of evaluating their own system or discuss how this might affect their assessment of component contributions." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Model sizes span 7B (LLaMA-2), 35B (Command-R), and ~46B parameters (Mixtral-8x7B), but performance is never reported as a function of compute budget. The compute cost difference is not discussed." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "MKQA and XOR-TyDi QA are used without formal discussion of whether they validly measure multilingual RAG capability. Table 8 identifies data quality issues (ambiguous questions, wrong labels) but does not frame this as a construct validity concern." 
339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No agentic scaffolding is involved. The retrieve-then-generate pipeline has no scaffolding confound." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "Not discussed. MKQA derives from Natural Questions (2019) and Wikipedia, both of which predate the training of all evaluated models. Models may have seen both the questions and Wikipedia passages during training." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "Not discussed. The no-retrieval baseline implicitly addresses whether models already know answers, but this is not framed as feature leakage analysis." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "Not discussed. MKQA and Wikipedia are both likely in the training data of the evaluated models, creating potential non-independence between training and evaluation data." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection method employed. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "RAG brings substantial performance improvement across all 13 languages tested.", 372 "evidence": "Table 1 shows consistent improvement from no-retrieval to retrieval conditions across all languages on both MKQA and XOR-TyDi QA (e.g., English MKQA: 58.4 → 70.2, Arabic: 26.4 → 49.0).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Translating system prompts into user languages and explicitly specifying target language are critical for achieving high correct language rate.", 377 "evidence": "Table 5 shows CLR improvements across all settings. 
For Command-R with English retrieval, Korean CLR goes from 54.3% (default) to 100% (translated prompt with UL instruction).", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Multilingual-by-design generator models are essential for consistent generation across languages.", 382 "evidence": "Table 6 shows Command-R-35B achieves CLR >98% across all tested languages with appropriate prompts, while English-centric LLaMA-2-7B achieves only 4.3-62.8% CLR and often replies in English.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "BGE-m3 enables reliable cross-lingual retrieval without requiring query translation.", 387 "evidence": "Table 7 shows BGE-m3 outperforms SPLADE+QT (query translation) on both retrieval recall@5 and downstream character 3-gram recall across all languages.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Retrieving from multilingual Wikipedia is beneficial compared to monolingual retrieval in most cases.", 392 "evidence": "Table 1 'All langs' column often matches or exceeds both English-only and user-language-only retrieval (e.g., German MKQA: 66.9 vs 64.6/65.5). However, patterns vary by dataset: MKQA favors English, XOR-TyDi QA favors user language.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Character 3-gram recall is more robust than word-level matching for multilingual evaluation.", 397 "evidence": "Table 3 illustrates that word-level exact match gives 0% for 'Sofia Kovalevskaia' vs ground truth 'Sofya Kovalevskaya', while character 3-gram recall gives 69.2%. However, this is demonstrated by example rather than systematic comparison.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Code-switching in non-Latin alphabet languages is a persistent limitation of current multilingual LLMs.", 402 "evidence": "Table 8 manual inspection: 5 out of 50 Russian predictions and 6 out of 50 Chinese predictions had named entities in English. 
Even with explicit NE-in-UL prompting, the issue is not fully resolved (Table 5).", 403 "supported": "moderate" 404 } 405 ], 406 "red_flags": [ 407 { 408 "flag": "No statistical testing", 409 "detail": "All performance comparisons across models, prompts, languages, and retrieval options are based solely on point-estimate comparisons with no confidence intervals, significance tests, or uncertainty quantification. With thousands of examples per condition, even small differences could be tested." 410 }, 411 { 412 "flag": "Contamination not addressed", 413 "detail": "MKQA derives from Natural Questions (2019) and Wikipedia, both very likely in the training data of all evaluated models (2023-2024 vintage). The no-retrieval baseline may partially reflect memorized answers rather than true zero-knowledge performance, but this is never discussed." 414 }, 415 { 416 "flag": "Small manual evaluation sample", 417 "detail": "Error analysis (Table 8) is based on only 50 examples per language for 3 of 13 languages. This is too small for reliable error rate estimates and may not represent the full distribution of error types." 418 }, 419 { 420 "flag": "Single retriever evaluated", 421 "detail": "Only BGE-m3 is used as the multilingual retriever/reranker. The comparison with SPLADE (Table 7) is English-only. The paper acknowledges this in Limitations but the single-system evaluation limits generalizability of retrieval-related findings." 422 }, 423 { 424 "flag": "No compute or cost reporting", 425 "detail": "Running 4 generators × 6 prompts × multiple retrieval settings × 13 languages on thousands of examples represents substantial compute, but no GPU hours, hardware, latency, or cost figures are provided." 
426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 431 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 432 "year": 2020, 433 "relevance": "Foundational RAG paper that introduced the retrieve-then-generate paradigm widely used in LLM applications." 434 }, 435 { 436 "title": "Self-RAG: Learning to retrieve, generate, and critique through self-reflection", 437 "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang"], 438 "year": 2024, 439 "relevance": "Proposes self-reflective RAG where the model decides when retrieval is needed, relevant to RAG methodology and evaluation." 440 }, 441 { 442 "title": "BGE m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation", 443 "authors": ["Jianlv Chen", "Shitao Xiao", "Peitian Zhang"], 444 "year": 2024, 445 "relevance": "The multilingual embedding model used as the primary retriever/reranker, core to multilingual retrieval capability evaluation." 446 }, 447 { 448 "title": "Llama 2: Open foundation and fine-tuned chat models", 449 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 450 "year": 2023, 451 "relevance": "Major open-source LLM evaluated as an English-centric baseline for multilingual generation capability." 452 }, 453 { 454 "title": "Mixtral of experts", 455 "authors": ["Albert Q. Jiang", "Alexandre Sablayrolles"], 456 "year": 2024, 457 "relevance": "Mixture-of-experts LLM evaluated for multilingual generation, representing a multilingual-by-design architecture." 458 }, 459 { 460 "title": "Bergen: A benchmarking library for retrieval-augmented generation", 461 "authors": ["David Rau", "Hervé Déjean", "Nadezhda Chirkova"], 462 "year": 2024, 463 "relevance": "The RAG benchmarking framework on which this multilingual evaluation is built, relevant to RAG evaluation methodology." 
464 }, 465 { 466 "title": "One question answering model for many languages with cross-lingual dense passage retrieval", 467 "authors": ["Akari Asai", "Xinyan Yu", "Jungo Kasai"], 468 "year": 2021, 469 "relevance": "Introduced the CORA approach and XOR-TyDi QA for multilingual open QA, the primary prior work and evaluation benchmark." 470 }, 471 { 472 "title": "Making retrieval-augmented language models robust to irrelevant context", 473 "authors": ["Ori Yoran", "Tomer Wolfson", "Ori Ram"], 474 "year": 2024, 475 "relevance": "Addresses RAG robustness to irrelevant context, relevant to understanding RAG pipeline reliability." 476 }, 477 { 478 "title": "MKQA: A linguistically diverse benchmark for multilingual open domain question answering", 479 "authors": ["Shayne Longpre", "Yi Lu", "Joachim Daiber"], 480 "year": 2021, 481 "relevance": "One of two primary evaluation benchmarks used, providing parallel multilingual QA evaluation across 25 languages." 482 }, 483 { 484 "title": "No language left behind: Scaling human-centered machine translation", 485 "authors": ["NLLB Team", "Marta R. Costa-jussà"], 486 "year": 2022, 487 "relevance": "The NLLB translation model used for query translation baseline in cross-lingual retrieval experiments." 488 } 489 ] 490 }