scan.json (31264B)
1 { 2 "paper": { 3 "title": "Episodic Memories Generation and Evaluation Benchmark for Large Language Models", 4 "authors": ["Alexis Huet", "Zied Ben Houidi", "Dario Rossi"], 5 "year": 2025, 6 "venue": "International Conference on Learning Representations", 7 "arxiv_id": "2501.13121", 8 "doi": "10.48550/arXiv.2501.13121" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Even state-of-the-art LLMs (GPT-4o, Claude 3.5 Sonnet, o1-mini, Llama 3.1 405B) struggle with episodic memory tasks on a synthetic contamination-free benchmark, particularly when questions require recalling multiple related events (F1 ≤0.60 for 2+ events). Performance degrades consistently with cue overload, and naive fine-tuning fails to generalize beyond single-event memorization. RAG generally outperforms in-context memory (except for GPT-4o), and chronological ordering and entity state tracking are especially challenging (≤36% latest state accuracy, ≤18% full set recall).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository is provided: 'Code and data available at Huet et al. (2025)' with URL https://github.com/ahstat/episodic-memory-benchmark (footnote 1, reference section)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "11 datasets are released (Tab. 28) through the GitHub repository. The paper states 'release open source code and datasets' and footnote 2 references the list in Tab. 28." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or dependency specifications in the paper. Only the models used are named without library/environment setup details." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While the benchmark generation process is described in extensive detail (Appendix B), the paper does not provide step-by-step reproduction instructions with specific commands. The methodology is described algorithmically but not as executable steps." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Standard deviations are reported throughout, e.g., Tab. 3: '0.84±0.37', '0.81±0.38'. All results tables include ± notation." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Wilcoxon signed-rank test with Holm's method adjustment is used for model comparisons (Fig. 3, Fig. 7). Mann-Whitney U tests are used in ablation studies (Tab. 21, 23, 25)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "F1 scores are reported with full context: baseline comparisons show magnitude of differences (e.g., fine-tuned model drops from F1=0.83 for 1 event to F1=0.19 for 6+ events in Tab. 3). Performance differences between conditions are quantified absolutely." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The question selection targets Ntarget=5 per bin per question id (Appendix B.2.4) but no formal power analysis or justification for why 456/686 questions are sufficient for the statistical conclusions drawn." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Standard deviations are reported across questions for all experimental conditions in Tab. 3, 13, 14, 16-20, 22, 24, 27." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple baselines are compared: six LLMs across three memory strategies (in-context, RAG, fine-tuning). 
Section 5.1 describes the baseline setup." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models evaluated include GPT-4o, GPT-4o-mini, Claude 3.5 Sonnet, Llama 3.1 405B, and o1-mini — all state-of-the-art at time of submission (late 2024/early 2025)." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Extensive ablation studies in Appendix E: book size (E.1), RAG granularity (E.2), number of cues and traces (E.3, E.4), book generation model (E.5), chapter ordering (E.6), and event plausibility (E.7)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "F1-score is the primary metric (Section 4.3), and Kendall's τ coefficient is additionally used for chronological ordering questions (Section 4.3, Tab. 4)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of model outputs is performed. Evaluation uses an LLM-as-a-judge approach (Section 4.3, Appendix B.3). Manual analysis of GPT-4o failures in Appendix E.8 covers only 24 questions." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "For the fine-tuning experiment, 'all questions involving a single chapter (i.e., corresponding to the bin {1}) are present in both the training and the test sets' (Appendix B.2.5). Train/test contamination exists in the fine-tuning condition." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by number of matching events (Tab. 3), cue type (Fig. 4, Tab. 16-17), retrieval type (Tab. 18), and detailed (cue, retrieval) pairs (Tab. 19)." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Appendix E.8 provides manual analysis of 24 failed GPT-4o responses on 0-event questions, categorizing failures into 'inner' (17 cases) and 'outer' (7 cases) with detailed examples of confabulation patterns." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Fine-tuning is shown to fail at generalization (F1=0.00 on 0-event questions, drops from 0.83 to 0.19 as events increase). o1-mini performs worst at recall despite best confabulation avoidance. Claude 3.5 Sonnet in-context has a known bug (Appendix F)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims that 'even the most advanced LLMs struggle with episodic memory tasks, particularly when dealing with multiple related events' are directly supported by Tab. 3 (F1 ≤0.60 for 2+ events) and Fig. 3 (model rankings with statistical tests)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims are generally supported through controlled comparisons. 'RAG generally outperforms in-context counterparts' is supported by comparing the same model across memory strategies (Fig. 3). Ablation studies (Appendix E) systematically vary single factors." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Claims are bounded to the tested models and benchmark. The paper specifies which models were evaluated, acknowledges limitations including 'limited domain scope' (Section 6), and explicitly notes the benchmark covers 'human-like protagonists within fictional contexts.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Multiple alternative explanations are investigated: Claude vs. 
GPT book generation bias (Appendix E.5), chronological vs. unordered presentation (E.6), realistic vs. non-realistic events (E.7), and paragraph vs. chapter RAG granularity (E.2)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly draws parallels between their benchmark and established human episodic memory tests (AMI, Autobiographical Interview in Section 2 and Appendix A.3.1), and clearly defines what F1 score on cue-based recall measures. Section 3 articulates the cognitive science framework underlying the measurements." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Book generation models include version dates: 'Claude 3.5 Sonnet (2024-06-20)' and 'GPT-4o (2024-05-13)' (Appendix B.1.5). However, evaluation models are listed without version/snapshot dates: just 'GPT-4o', 'GPT-4o-mini', 'Claude 3 Haiku', 'Claude 3.5 Sonnet', 'o1-mini' (Section 5.1)." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text is provided in the appendix: chapter generation (Listing 7), chapter verification (Listing 8), evaluation (Listings 12, 14), answer generation for in-context (Listing 15), RAG (Listing 16), and fine-tuning (Listing 17)." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Fine-tuning hyperparameters are reported (30 epochs, batch size 64, LR multiplier 1.8; footnote 5) and RAG uses 'text-embedding-3-small.' However, inference hyperparameters (temperature, top-p, max tokens) for the evaluated models are not stated." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The evaluation involves direct prompting of LLMs, RAG retrieval, or fine-tuning — none of which involve agentic scaffolding." 
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The data generation pipeline is extensively documented in Appendix B: universe creation (B.1.1-B.1.2), event generation with geometric sampling (B.1.3), meta-data generation (B.1.4), chapter generation and verification (B.1.5-B.1.7), question selection with filtering counts (B.2)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 'Summary and Limitations' provides dedicated subsections addressing temporal representation, event independence, limited domain scope, and training limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are identified: 'Our benchmark relies on explicit temporal markers, which may not fully capture the nuanced ways time is expressed in natural language'; 'The independent generation of chapters... does not capture the interconnected and causal nature of real-world events' (Section 6)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6 states specific scope boundaries: benchmark is limited to explicit temporal markers (not implicit references like 'yesterday'), events are independently generated (no causal chains), and domain is restricted to 'human-like protagonists within fictional contexts.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Code and all 11 benchmark datasets are released on GitHub (https://github.com/ahstat/episodic-memory-benchmark). Tab. 28 lists all produced benchmarks." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The synthetic data generation procedure is described in detail in Appendix B: universe components (B.1.1), event generation with truncated geometric distribution (B.1.3), chapter generation with LLM (B.1.5), and multi-stage verification (B.1.6-B.1.7)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data is entirely synthetic, generated procedurally from defined universe components." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The full pipeline is documented with counts at each stage: 200 events generated → filtering for (t,ent)/(t,s) conflicts → itermax=10 chapter generation attempts → direct verification → LLM-as-a-judge verification → 196 validated chapters (Tab. 7). Question pipeline: 7056 raw questions → 3886 after dedup → 686 after selection (Appendix B.2)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding disclosure, acknowledgments section, or grant information is present in the paper. Authors are listed as Huawei Technologies employees but no explicit funding statement is provided." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliation is clearly stated: 'Huawei Technologies Co., Ltd., Paris, France' on the title page." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The implicit funder (Huawei) does not have a direct financial interest in the specific outcomes — the paper evaluates third-party models (GPT-4o, Claude, Llama, o1-mini), none of which are Huawei products." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the evaluated models (GPT-4o, Claude 3.5 Sonnet, Llama 3.1, o1-mini). While the benchmark is designed to be contamination-free, the models' training cutoffs are not mentioned." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "The paper explicitly addresses contamination as a design requirement: 'We synthesize a unique episodic memory benchmark, free from contamination' (Section 1). Section 2 identifies data leakage as a limitation of existing benchmarks. The synthetic generation approach structurally prevents train/test overlap." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "The benchmark is specifically designed to be contamination-free: 'avoid data leakage, ensuring that the evaluation is free from contamination' (Section 2). The synthetic generation approach guarantees benchmark data did not exist before model training." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. The benchmark evaluates LLMs on synthetic tasks." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates LLM performance on synthetic benchmarks." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, token counts, or inference latency are reported for the evaluation runs across six models and three memory strategies." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated. The paper does not report GPU hours, API spend, or total compute for benchmark generation or model evaluation." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results appear to be from single evaluation runs. No seed sensitivity analysis is reported for model inference. Benchmark generation uses a fixed seed (Appendix B.1.2) but the paper does not vary it for evaluation." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of evaluation runs per model is not explicitly stated. Results appear to be single-run per model-memory configuration." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "For fine-tuning, hyperparameters (30 epochs, batch size 64, LR multiplier 1.8) are stated but no search budget is described. RAG top-K is set to 'top-K paragraphs' without discussing how K was chosen. 
No search budget for any configuration." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "Fine-tuning hyperparameters and RAG parameters appear chosen without explanation of the selection process or reporting of alternative configurations tried." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": true, 316 "justification": "Holm's method is used for adjusting Wilcoxon signed-rank tests in the Critical Difference plots (Fig. 3, Fig. 7). This is explicitly stated: 'Wilcoxon signed-rank test... adjusted by Holm's method.'" 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate third-party models on their own benchmark but do not discuss potential benchmark design bias that could favor or disadvantage certain model architectures." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Performance is not reported as a function of compute budget. Different models and memory strategies have different compute requirements (e.g., RAG adds embedding overhead, fine-tuning requires training) but this is not quantified or compared." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "The paper extensively discusses what the benchmark measures and its relationship to cognitive science episodic memory constructs (Section 3, Appendix A). It draws explicit parallels to human episodic memory tests (AMI, Autobiographical Interview) and explains how cue-based recall maps to LLM prompting." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. Models are evaluated via direct prompting, RAG, or fine-tuning without agentic scaffolding." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "The benchmark is synthetically generated specifically to avoid temporal leakage: 'free from contamination' by design. The synthetic data was generated after model training, making prior exposure impossible." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No explicit discussion of whether the evaluation setup could leak answer information through context. For in-context learning, the full book is provided (by design), but the paper does not discuss whether question formulations could inadvertently signal answers." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Questions are generated from a shared universe with overlapping elements (same entities, locations, dates across chapters), but the paper does not discuss whether this shared structure could create dependencies between test questions that affect evaluation validity." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "The synthetic benchmark generation itself serves as a concrete leakage prevention method — all data is generated de novo, making it structurally impossible for models to have seen it during training. This is highlighted as a key design feature in Sections 1-2." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Even the most advanced LLMs struggle with episodic memory tasks, particularly with multiple related events or complex spatio-temporal relationships, even in contexts as short as 10k-100k tokens.", 365 "evidence": "Tab. 3 shows F1 scores ≤0.60 for all models when questions involve 2+ matching events on the long book. Tab. 13 shows suboptimal performance even on the short book (10k tokens). Fig. 
3 ranks all models with statistical tests.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "RAG generally outperforms in-context memory for episodic tasks, with the exception of GPT-4o.", 370 "evidence": "Fig. 3 shows RAG variants generally rank higher than in-context counterparts. GPT-4o in-context achieves the highest average rank, statistically tied with Claude 3.5 Sonnet RAG.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Performance degrades consistently as the number of events matching a cue increases (cue overload effect).", 375 "evidence": "Tab. 3 shows monotonic F1 decline across all models from 1-event to 6+ events (e.g., GPT-4o: 0.81→0.60→0.57→0.53). Fig. 4 heatmaps visualize this gradient.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Naive fine-tuning fails to achieve deep understanding of episodic events and merely overfits to single learned facts.", 380 "evidence": "Tab. 3: fine-tuned GPT-4o-mini achieves F1=0.83 for 1-event questions but drops to 0.37 for 2 events and 0.19 for 6+ events. It scores F1=0.00 on 0-event questions (always hallucinates), confirming it memorized specific answers without generalizing.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "o1-mini performs best at avoiding confabulation but worst at actual event recall.", 385 "evidence": "Tab. 3: o1-mini achieves F1=0.97 on 0-event questions (confabulation avoidance) but only 0.05 on 1-event questions on the long book. On the short book (Tab. 13), o1-mini performs significantly better and is statistically equivalent to GPT-4o (Fig. 7).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Performance varies by cue type, with content cues being easiest and time cues being hardest.", 390 "evidence": "Fig. 
4 shows a gradient from content (top) to time (bottom) across models, with content cues consistently yielding higher F1 scores than time cues.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Models have significant difficulty with chronological ordering and entity state tracking, especially for multi-event questions.", 395 "evidence": "Tab. 4: ≤36% accuracy for latest state recall, ≤18% for full set recall across all models. Kendall's τ is low even for exact matches, showing ordering failures.", 396 "supported": "strong" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "Unspecified judge model in LLM-as-a-judge evaluation", 402 "detail": "The paper uses an 'LLM-as-a-judge approach' (Section 4.3) for evaluation but does not explicitly state which model serves as the judge. Since 4-8% of scores are partial matches requiring judgment, the choice of judge model could affect results." 403 }, 404 { 405 "flag": "No multi-run evaluation or seed sensitivity", 406 "detail": "Results appear to be from single evaluation runs per model-memory configuration. LLM outputs can vary across runs due to sampling, but no variance across runs is reported. The ± values in tables represent variance across questions, not across repeated runs." 407 }, 408 { 409 "flag": "Missing inference hyperparameters", 410 "detail": "Temperature, top-p, and max tokens are not reported for any evaluated model. These parameters significantly affect LLM output, especially for memory recall tasks where more deterministic sampling (low temperature) might improve results." 
411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Human-like episodic memory for infinite context llms", 416 "authors": ["Zafeirios Fountas", "Martin A Benfeghoul", "Adnan Oomerjee", "Fenia Christopoulou", "Gerasimos Lampouras", "Haitham Bou-Ammar", "Jun Wang"], 417 "year": 2024, 418 "arxiv_id": "2407.09450", 419 "relevance": "Proposes human-like episodic memory mechanisms for LLMs, directly relevant to extending LLM memory capabilities." 420 }, 421 { 422 "title": "Larimar: Large language models with episodic memory control", 423 "authors": ["Payel Das", "Subhajit Chaudhury", "Elliot Nelson", "Igor Melnyk", "Sarath Swaminathan", "Sihui Dai", "Aurélie Lozano", "Georgios Kollias", "Vijil Chenthamarakshan", "Soham Dan"], 424 "year": 2024, 425 "arxiv_id": "2403.11901", 426 "relevance": "Proposes episodic memory control architecture for LLMs, directly addressing the memory limitations this benchmark evaluates." 427 }, 428 { 429 "title": "Michelangelo: Long context evaluations beyond haystacks via latent structure queries", 430 "authors": ["Kiran Vodrahalli", "Santiago Ontanon", "Nilesh Tripuraneni", "Kelvin Xu", "Sanil Jain", "Rakesh Shivanna", "Jeffrey Hui", "Nishanth Dikkala", "Mehran Kazemi", "Bahare Fatemi"], 431 "year": 2024, 432 "arxiv_id": "2409.12640", 433 "relevance": "Introduces latent structure queries for long-context evaluation, the closest prior work to this episodic memory benchmark." 434 }, 435 { 436 "title": "Babilong: Testing the limits of llms with long context reasoning-in-a-haystack", 437 "authors": ["Yuri Kuratov", "Aydar Bulatov", "Petr Anokhin", "Ivan Rodkin", "Dmitry Sorokin", "Artyom Sorokin", "Mikhail Burtsev"], 438 "year": 2024, 439 "arxiv_id": "2406.10149", 440 "relevance": "Long-context reasoning benchmark extending bAbI tasks, which this paper positions as lacking episodic memory assessment." 
441 }, 442 { 443 "title": "Ruler: What's the real context size of your long-context language models?", 444 "authors": ["Cheng-Ping Hsieh", "Simeng Sun", "Samuel Kriman", "Shantanu Acharya", "Dima Rekesh", "Fei Jia", "Boris Ginsburg"], 445 "year": 2024, 446 "arxiv_id": "2404.06654", 447 "relevance": "Evaluates actual context utilization of long-context LLMs, directly relevant to understanding context window limitations." 448 }, 449 { 450 "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context", 451 "authors": ["Machel Reid", "Nikolay Savinov", "Denis Teplyashin", "Dmitry Lepikhin", "Timothy Lillicrap", "Jean-baptiste Alayrac", "Radu Soricut", "Angeliki Lazaridou", "Orhan Firat", "Julian Schrittwieser"], 452 "year": 2024, 453 "arxiv_id": "2403.05530", 454 "relevance": "Demonstrates multi-million token context capabilities and includes needle-in-haystack evaluation that this paper critiques as insufficient." 455 }, 456 { 457 "title": "Extracting training data from large language models", 458 "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace", "Matthew Jagielski", "Ariel Herbert-Voss", "Katherine Lee", "Adam Roberts", "Tom Brown", "Dawn Song", "Ulfar Erlingsson"], 459 "year": 2021, 460 "relevance": "Demonstrates LLM memorization of training data, relevant to understanding model memory and data contamination risks." 461 }, 462 { 463 "title": "Quantifying memorization across neural language models", 464 "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski", "Katherine Lee", "Florian Tramer", "Chiyuan Zhang"], 465 "year": 2022, 466 "arxiv_id": "2202.07646", 467 "relevance": "Quantifies memorization as function of model scale and context length, directly informing the episodic memory benchmark design." 
468 }, 469 { 470 "title": "Augmenting language models with long-term memory", 471 "authors": ["Weizhi Wang", "Li Dong", "Hao Cheng", "Xiaodong Liu", "Xifeng Yan", "Jianfeng Gao", "Furu Wei"], 472 "year": 2023, 473 "arxiv_id": "2306.07174", 474 "relevance": "Proposes LongMem architecture for augmenting LLMs with persistent memory, relevant to addressing the episodic memory limitations documented in this benchmark." 475 }, 476 { 477 "title": "Many-shot in-context learning", 478 "authors": ["Rishabh Agarwal", "Avi Singh", "Lei M Zhang", "Bernd Bohnet", "Stephanie Chan", "Ankesh Anand", "Zaheer Abbas", "Azade Nova", "John D Co-Reyes", "Eric Chu"], 479 "year": 2024, 480 "arxiv_id": "2404.11018", 481 "relevance": "Studies in-context learning scaling with many examples, relevant to understanding LLM ability to process and retain information in context." 482 } 483 ], 484 "engagement_factors": { 485 "practical_relevance": { 486 "score": 1, 487 "justification": "Benchmark framework and code are released for researchers, but the benchmark itself is a research tool not directly applicable to practitioner workflows." 488 }, 489 "surprise_contrarian": { 490 "score": 1, 491 "justification": "LLMs struggling with memory tasks is not hugely surprising, though the degree of failure at even 10k-token contexts and the complete failure of fine-tuning to generalize is somewhat unexpected." 492 }, 493 "fear_safety": { 494 "score": 0, 495 "justification": "No safety, security, or AI risk concerns are raised by this work." 496 }, 497 "drama_conflict": { 498 "score": 0, 499 "justification": "No controversy or conflict angle; straightforward benchmark evaluation paper." 500 }, 501 "demo_ability": { 502 "score": 2, 503 "justification": "GitHub repository with code and 11 datasets is released, allowing researchers to run the benchmark on their own models." 
504 }, 505 "brand_recognition": { 506 "score": 1, 507 "justification": "From Huawei (known but not top-tier AI lab); evaluates well-known models (GPT-4o, Claude), which adds some recognition." 508 } 509 } 510 }