scan.json (25938B)
1 { 2 "paper": { 3 "title": "Lost in the Middle: How Language Models Use Long Contexts", 4 "authors": [ 5 "Nelson F. Liu", 6 "Kevin Lin", 7 "John Hewitt", 8 "Ashwin Paranjape", 9 "Michele Bevilacqua", 10 "Fabio Petroni", 11 "Percy Liang" 12 ], 13 "year": 2023, 14 "venue": "TACL", 15 "arxiv_id": "2307.03172" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper states 'we release our code and evaluation data' and provides the URL nelsonliu.me/papers/lost-in-the-middle (Section 1, footnote 1)." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper uses the publicly available NaturalQuestions-Open dataset and states they release their evaluation data at the project URL. The synthetic key-value retrieval task uses randomly generated UUIDs with a described generation process." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No mention of environment specifications, requirements files, or dependency details found in the paper. The paper describes using APIs (OpenAI, Anthropic) and open models (MPT, LongChat, Llama-2) but does not provide a reproducible environment specification." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "While code and data are released, the paper itself does not contain step-by-step reproduction instructions, a README description with commands, or a 'Reproducing Results' section." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Main results (Figures 5, 7, 8, 9, etc.) report only point estimates of accuracy without confidence intervals or error bars. Token count statistics in Appendix F include standard deviations, but the main experimental results do not." 
45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper makes multiple comparative claims (e.g., 'performance degrades significantly', 'performance is nearly identical') but uses no statistical significance tests. Comparisons are based solely on visual inspection of accuracy numbers." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper consistently reports effect sizes in context. For example: 'GPT-3.5-Turbo's multi-document QA performance can drop by more than 20%' (Section 2.3), '1.5% for GPT-3.5-Turbo and ~1% for Claude-1.3' (Section 5), '10-point worst-case degradation' (Appendix E). These provide baseline context for interpreting magnitude." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper uses 2655 queries from NaturalQuestions-Open and 500 examples for key-value retrieval, but does not justify why these sizes were chosen or discuss whether they provide sufficient statistical power for the claims made." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "Experiments appear to be single-run evaluations. No standard deviations across experimental runs are reported for the main accuracy results. The only variance reported is for token counts in Appendix F, not for experimental outcomes." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "The paper includes closed-book and oracle baselines (Table 1) to contextualize performance. Multiple models are compared against each other, and extended-context variants are compared against their standard counterparts." 
72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "The models evaluated (GPT-3.5-Turbo, Claude-1.3, MPT-30B-Instruct, LongChat-13B, GPT-4, Llama-2) were all state-of-the-art or near-state-of-the-art at the time of publication in 2023." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper includes several ablation-style experiments: Section 4.1 (decoder-only vs. encoder-decoder), Section 4.2 (query-aware contextualization), Section 4.3 (instruction fine-tuning vs. base models), Appendix B (random vs. retrieved distractors), and Appendix C (randomized distractor order)." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": false, 86 "justification": "The paper uses only accuracy (exact match) as the evaluation metric across all experiments. No additional metrics such as F1 or partial match are reported." 87 }, 88 "human_evaluation": { 89 "applies": false, 90 "answer": false, 91 "justification": "The paper evaluates language model capabilities on factual retrieval tasks with ground-truth answers. Human evaluation of outputs is not relevant since correctness is objectively determined by exact match against annotated answers." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "The paper uses NaturalQuestions-Open, a standard benchmark with established test data. No tuning or model selection is performed on the evaluation data; all models are evaluated zero-shot with fixed prompts." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down by position of relevant information (multiple positions per context length), by context length (10, 20, 30 documents; 75, 140, 300 key-value pairs), by model, and appendices provide per-model tabulations (Tables 5-7)." 
102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper discusses specific failure modes: models performing worse than closed-book when information is in the middle (Section 2.3), LongChat-13B generating code instead of answers (Section 3.2), and the general failure pattern of U-shaped performance curves." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper reports several negative results: extended-context models not improving over standard models (Section 2.3), query-aware contextualization not helping multi-document QA despite helping key-value retrieval (Section 4.2), and additional retrieved documents not improving reader performance (Section 5)." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims that performance degrades when relevant information position changes, that there is a U-shaped performance curve with primacy and recency bias, and that extended-context models are not necessarily better. All of these are supported by the results in Sections 2.3, 3.2, and the figures." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper's ablation studies (Sections 4.1-4.3) use controlled single-variable manipulation to investigate causal factors. The main experiments use controlled positioning of information to establish causal effects of position on performance. The study design is adequate for the causal claims made." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper is careful about generalization. It specifies exact models tested, limits claims to 'current language models', and explicitly notes limitations such as the cost of evaluating GPT-4 on full experiments (Appendix D footnote). 
The title 'How Language Models Use Long Contexts' is broad but the paper tests multiple model families to support breadth." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper systematically investigates alternative explanations: whether the U-shape is due to decoder-only architecture (Section 4.1), lack of query-aware contextualization (Section 4.2), instruction fine-tuning bias (Section 4.3), hard negatives vs. random distractors (Appendix B), and ranking prior in prompts (Appendix C)." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper specifies model versions: 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k-0613', 'gpt-4-0613' (Section 2.2, footnote 5, Figure 15), Claude-1.3, Claude-1.3 (100K), MPT-30B-Instruct, LongChat-13B (16K), specific Llama-2 variants with parameter counts." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "The exact prompts are provided in Figures 2 and 6 for the multi-document QA and key-value retrieval tasks, respectively. The modified prompt for randomized ordering is quoted in Appendix C." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "The paper states 'We use greedy decoding when generating outputs' (Section 2.2), which fully specifies the generation strategy (temperature=0, no sampling). For API models this is the key hyperparameter." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used. The experiments involve direct prompting of language models without any agent framework, tool use, or multi-step reasoning pipeline." 
156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 2.1 describes the data pipeline in detail: 2655 queries from NaturalQuestions-Open filtered to paragraph-type long answers, Contriever retrieval of distractor documents, filtering of distractors that contain annotated answers. The key-value retrieval task generation is described in Section 3.1." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The paper discusses some caveats inline (e.g., cost of GPT-4 evaluation, greedy decoding only) but lacks a substantive standalone limitations discussion." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "No specific threats to validity are discussed. The paper does not address, for example, whether results might change with different prompts, different tasks, non-English inputs, or different evaluation metrics." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what the results do NOT show. While it mentions leaving exploration of other decoding methods to future work (Section 2.2), it does not systematically bound the scope of its conclusions." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The paper states code and evaluation data are released at nelsonliu.me/papers/lost-in-the-middle, and the underlying NaturalQuestions-Open dataset is publicly available." 
185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 2.1 describes the data collection in detail: NaturalQuestions-Open queries, filtering criteria (paragraph-type long answers yielding 2655 queries), Contriever retrieval for distractors, and the synthetic key-value generation process (Section 3.1)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. The data comes from standard benchmarks (NaturalQuestions-Open) and synthetic generation." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline is well documented: NaturalQuestions-Open queries → filter to paragraph answers (2655 queries) → Contriever retrieval for distractors → filter out distractors containing answers → controlled reordering for position experiments. Appendix A addresses ambiguity filtering." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "The Acknowledgments section states: 'This work was supported by the Stanford Center for Research on Foundation Models (CRFM), by OpenAI via an API credits grant to the Stanford CRFM, and by Anthropic via the Claude academic access program.'" 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: Stanford University, UC Berkeley, and Samaya AI. The footnote notes work was 'partially completed as an intern at Samaya AI.'" 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "OpenAI and Anthropic provided API credits/access, and their models (GPT-3.5-Turbo, GPT-4, Claude-1.3) are the primary subjects of evaluation. Both companies have a financial interest in the perceived capabilities of their long-context models." 
217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement is present. Some authors are affiliated with Samaya AI, a company that may have interests related to retrieval-augmented generation, but no financial interest disclosures are provided." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper does not state the training data cutoff dates for any of the models evaluated. While model versions are specified (e.g., gpt-3.5-turbo-0613), the training cutoff is not discussed." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "No discussion of whether NaturalQuestions-Open data appeared in any model's training data. The benchmark was published in 2019, well before all models' training periods, making contamination plausible." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "NaturalQuestions was published in 2019 and is widely available online. Models trained after 2019 may have seen these questions. The paper does not address this contamination risk, though it is partially mitigated by the controlled nature of the experiments (same questions across conditions)." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 
261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "The paper mentions that 'evaluating GPT-4 on the full multi-document QA and key-value retrieval experiments would cost upwards of $6000' (Appendix D footnote), but does not report the actual costs incurred for the experiments that were run." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No total computational budget is stated. The paper does not report total API spend, GPU hours for open models, or wall-clock time for experiments." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "Language model performance exhibits a U-shaped curve as a function of the position of relevant information in the input context, with highest performance at the beginning and end.", 294 "evidence": "Figures 5 and 7 show this pattern across multiple models and both multi-document QA and key-value retrieval tasks. For example, GPT-3.5-Turbo drops by >20% when relevant information is in the middle vs. beginning/end (Section 2.3).", 295 "supported": "strong" 296 }, 297 { 298 "claim": "Extended-context models are not necessarily better at using input context than their standard-context counterparts.", 299 "evidence": "Figure 5 shows GPT-3.5-Turbo and GPT-3.5-Turbo (16K) have 'nearly superimposed' performance curves on 10- and 20-document settings that fit within both context windows (Section 2.3). 
Same pattern for Claude-1.3 vs Claude-1.3 (100K).", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Encoder-decoder models are more robust to position of relevant information within their training-time context window.", 304 "evidence": "Figure 8 shows Flan-UL2 has only 1.9% absolute difference between best and worst case within its 2048-token window, but exhibits U-shaped degradation on longer sequences (Section 4.1).", 305 "supported": "strong" 306 }, 307 { 308 "claim": "Query-aware contextualization dramatically improves key-value retrieval but minimally affects multi-document QA.", 309 "evidence": "Section 4.2 reports GPT-3.5-Turbo (16K) achieves perfect performance on 300 key-value pairs with query-aware contextualization vs. 45.6% worst-case without it. Figure 9 shows minimal change for QA.", 310 "supported": "strong" 311 }, 312 { 313 "claim": "Model performance saturates long before retriever recall in open-domain QA, indicating models fail to use additional retrieved documents effectively.", 314 "evidence": "Figure 11 shows that using 50 instead of 20 retrieved documents improves performance by only ~1.5% for GPT-3.5-Turbo and ~1% for Claude-1.3, while retriever recall continues to increase (Section 5).", 315 "supported": "strong" 316 }, 317 { 318 "claim": "The U-shaped performance curve appears even in base language models before instruction fine-tuning.", 319 "evidence": "Figure 10 shows both MPT-30B and MPT-30B-Instruct exhibit U-shaped curves, though instruction fine-tuning slightly reduces the performance disparity (Section 4.3).", 320 "supported": "strong" 321 }, 322 { 323 "claim": "Only sufficiently large language models (13B+ parameters) exhibit primacy bias; smaller models (7B) are solely recency-biased.", 324 "evidence": "Figure 16 and Appendix E show Llama-2-7b models are solely recency-biased while 13B and 70B models exhibit both primacy and recency bias.", 325 "supported": "moderate" 326 } 327 ], 328 "methodology_tags": [ 329 
"benchmark-eval" 330 ], 331 "key_findings": "Language models exhibit a U-shaped performance curve when accessing information at different positions in long input contexts, performing best when relevant information is at the beginning or end and worst when it is in the middle. Extended-context models (e.g., GPT-3.5-Turbo-16K, Claude-1.3-100K) do not improve context utilization over their standard counterparts. Encoder-decoder models are more robust to information position within their training-time context window. In practical open-domain QA, reader performance saturates far before retriever recall, suggesting current models cannot effectively leverage additional retrieved documents.", 332 "red_flags": [ 333 { 334 "flag": "No error bars or uncertainty quantification", 335 "detail": "All main results are reported as single-run point estimates without confidence intervals, error bars, or variance across runs. Given the stochastic nature of some models and the claim-heavy analysis, this is a notable gap." 336 }, 337 { 338 "flag": "No significance tests for comparative claims", 339 "detail": "The paper makes numerous claims about performance differences (e.g., 'nearly identical', 'significantly degrades') without any statistical tests. Whether observed differences are statistically significant or within noise is never tested." 340 }, 341 { 342 "flag": "Contamination risk unaddressed", 343 "detail": "NaturalQuestions was published in 2019 and is widely available. All evaluated models were trained after 2019. The paper does not discuss whether models may have seen these questions during training, though the controlled within-question position manipulation partially mitigates this concern." 344 }, 345 { 346 "flag": "No limitations section", 347 "detail": "The paper lacks a dedicated limitations or threats-to-validity section. 
Key limitations such as the use of a single evaluation metric (accuracy), a single task domain (factual QA), English-only evaluation, and greedy decoding only are not systematically discussed." 348 } 349 ], 350 "cited_papers": [ 351 { 352 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 353 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"], 354 "year": 2023, 355 "relevance": "Foundational work on LLM tool use and agentic capabilities, directly relevant to understanding how models augmented with external information use their context." 356 }, 357 { 358 "title": "REPLUG: Retrieval-Augmented Black-Box Language Models", 359 "authors": ["Weijia Shi", "Sewon Min", "Michihiro Yasunaga", "Minjoon Seo", "Rich James", "Mike Lewis", "Luke Zettlemoyer", "Wen-tau Yih"], 360 "year": 2023, 361 "arxiv_id": "2301.12652", 362 "relevance": "Proposes retrieval-augmented generation for black-box LLMs, directly relevant to understanding how retrieved context is used by models." 363 }, 364 { 365 "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness", 366 "authors": ["Tri Dao", "Daniel Y. Fu", "Stefano Ermon", "Atri Rudra", "Christopher Ré"], 367 "year": 2022, 368 "arxiv_id": "2205.14135", 369 "relevance": "Key algorithmic contribution enabling longer context windows in transformers, foundational infrastructure for long-context LLM research." 370 }, 371 { 372 "title": "Hyena Hierarchy: Towards Larger Convolutional Language Models", 373 "authors": ["Michael Poli", "Stefano Massaroli", "Eric Nguyen", "Daniel Y. Fu", "Tri Dao", "Stephen Baccus", "Yoshua Bengio", "Stefano Ermon", "Christopher Ré"], 374 "year": 2023, 375 "relevance": "Alternative to attention-based architectures for long-context processing, relevant to understanding architectural approaches to long-context modeling." 
376 }, 377 { 378 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 379 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 380 "year": 2023, 381 "arxiv_id": "2307.09288", 382 "relevance": "Major open-source LLM evaluated in this study; relevant to understanding the scaling and fine-tuning effects on context utilization." 383 }, 384 { 385 "title": "Scaling Instruction-Finetuned Language Models", 386 "authors": ["Hyung Won Chung", "Le Hou", "Shayne Longpre"], 387 "year": 2022, 388 "arxiv_id": "2210.11416", 389 "relevance": "Describes instruction fine-tuning at scale (Flan models), which this paper investigates as a potential factor in context utilization patterns." 390 }, 391 { 392 "title": "In-Context Retrieval-Augmented Language Models", 393 "authors": ["Ori Ram", "Yoav Levine", "Itay Dalmedigos", "Dor Muhlgay", "Amnon Shashua", "Kevin Leyton-Brown", "Yoav Shoham"], 394 "year": 2023, 395 "arxiv_id": "2302.00083", 396 "relevance": "Explores retrieval-augmented generation with in-context learning, directly relevant to the practical implications of context position effects." 397 }, 398 { 399 "title": "Large Language Models Struggle to Learn Long-Tail Knowledge", 400 "authors": ["Nikhil Kandpal", "Haikang Deng", "Adam Roberts", "Eric Wallace", "Colin Raffel"], 401 "year": 2022, 402 "arxiv_id": "2211.08411", 403 "relevance": "Studies LLM knowledge limitations, providing context for why retrieval augmentation matters and how models use provided information." 404 }, 405 { 406 "title": "Natural Questions: A Benchmark for Question Answering Research", 407 "authors": ["Tom Kwiatkowski", "Jennimaria Palomaki", "Olivia Redfield"], 408 "year": 2019, 409 "relevance": "The primary benchmark dataset used in this study; foundational QA evaluation resource for LLM research." 
410 }, 411 { 412 "title": "LLaMA: Open and Efficient Foundation Language Models", 413 "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"], 414 "year": 2023, 415 "arxiv_id": "2302.13971", 416 "relevance": "Foundation model that LongChat-13B extends, relevant to understanding architectural choices for long-context modeling." 417 } 418 ] 419 }