scan.json (25641B)
1 { 2 "paper": { 3 "title": "Evaluating Very Long-Term Conversational Memory of LLM Agents", 4 "authors": ["Adyasha Maharana", "Dong-Ho Lee", "Sergey Tulyakov", "Mohit Bansal", "Francesco Barbieri", "Yuwei Fang"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2402.17753" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "LLMs with limited context struggle with very long conversations (best base model GPT-4-turbo scores 32.1 vs human 87.9 on QA). Long-context LLMs improve factual recall but catastrophically fail on adversarial questions (2.1% for GPT-3.5-turbo-16k vs 70.2% for GPT-4-turbo base). RAG with observations provides the best balance, particularly outperforming dialog-based and summary-based retrieval. Event summarization reveals that long-context models paradoxically underperform base models, suggesting they fail to utilize extended context appropriately.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "The paper states 'Code and data to be available at https://snap-research.github.io/locomo' — a promise of future release, not a confirmed available release at time of publication." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "Same as code — dataset is promised to be available at the project page but described as future release." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions using 'Nvidia A6000 server with FP32' and specific APIs but provides no requirements.txt, Dockerfile, or detailed dependency listing." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. Implementation details are scattered across the paper and appendix but no consolidated reproduction guide exists." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Tables 2, 3, and 4 are reported as point estimates (F1 scores, ROUGE scores, FactScores) with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes numerous comparative claims ('outperforms', 'improvement') but reports no statistical significance tests. All comparisons are based on raw number differences." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports relative improvements with baselines, e.g., 'improvements ranging from 22-66%', 'performance that is 83% lower than the base model', 'lags behind human levels (by 56%)'. Percentage improvements with baselines provide effect size context." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The dataset contains only 50 conversations and 7,512 QA pairs. No justification for why 50 conversations is sufficient, and no power analysis is discussed." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Section C.2 states 'We report results from a single inference run for each model in our experiments.' No variance across runs is reported." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper compares base LLMs, long-context LLMs, and RAG approaches, plus human performance as an upper bound. Multiple models serve as baselines (Mistral-7B, Llama-2-Chat-70B, GPT-3.5-turbo, GPT-4-turbo)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include GPT-4-turbo, GPT-3.5-turbo-16k, Llama-2-Chat-70B, and Mistral-7B, all contemporary at the time of writing (January 2024)." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The RAG experiments ablate retrieval units (dialog, observation, summary) and top-k values. The multimodal dialog generation task ablates training variants (base, +summary, +observation)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "QA uses F1 and retrieval recall@k. Event summarization uses ROUGE-1/2/L and FactScore (precision, recall, F1). Multimodal dialog uses BLEU, ROUGE-L, and MMRelevance." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Human performance is measured as an upper bound on the QA task (Table 2, 87.9 overall F1). Human annotators also verified and edited the dataset (Section 3.4)." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "No explicit train/test/dev split is described for the 50 LOCOMO conversations used for evaluation. The 50 MiniGPT-5 training conversations are separately generated, but the evaluation benchmark itself has no described held-out split." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "QA results are broken down by 5 reasoning types (single-hop, multi-hop, temporal, open-domain, adversarial) in Tables 2 and 3. RAG results are broken down by retrieval unit and top-k." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 6.2 presents a taxonomy of 5 error categories in event summarization (missing information, hallucination, misunderstanding dialog cues, speaker attribution errors, saliency errors) with examples in Table 7." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Several negative results: long-context LLMs catastrophically fail on adversarial questions (2.1%), long-context models underperform base models on event summarization, summary-based RAG does not significantly improve performance despite high recall." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims about LLM challenges with lengthy conversations, improvements from long-context/RAG, and lagging behind human performance are all supported by Tables 2-4." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper makes causal claims like 'Employing strategies like long-context LLMs or RAG can offer improvements' and 'introducing improper context from inaccurate retrieval can lead to a decline in performance' without controlled causal designs — these are observational comparisons across different model configurations." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims to evaluate 'LLM Agents' generally but tests only a few specific models (GPT-3.5/4, Llama-2, Mistral) on a single synthetic dataset of 50 conversations. The paper does not bound its claims to these specific models and dataset." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not discuss alternative explanations for its findings. For example, the adversarial question collapse in long-context models could be due to prompt formatting rather than context length, but this is not explored." 
130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures F1 on QA and FactScore on summarization as proxies for 'long-term memory' and 'comprehension' without discussing the gap between these automated metrics and actual memory capabilities." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are referred to as 'gpt-3.5-turbo', 'gpt-4-turbo', 'Mistral-7B', 'Llama-2-Chat-70B' without specific version snapshots or dates. The paper states 'as of January 2024' but does not give exact API version strings." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Extensive prompts are provided in the appendix: persona generation (Fig. 5), event graph generation (Fig. 6), summary generation (Fig. 8), observation extraction (Fig. 9), and image sharing/reaction (Fig. 10)." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section C.2: 'temperature set to 0 and top_p set to 1 for evaluation.' MiniGPT-5 was trained for 10 epochs using the default hyperparameters from the original codebase." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The generative agent architecture is described in detail in Section 3.3: reflect & respond mechanism with short/long-term memory, observation extraction, summary generation, and image sharing/reaction functions. The RAG pipeline (DRAGON retriever + reader) is also described." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The full pipeline from persona selection → event graph generation → dialog generation → human verification/editing is documented across Sections 3.1-3.4 with details in Appendix A. Annotators edited ~15% of turns and removed/substituted ~19% of images." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 8 'Limitations' provides substantive discussion across 5 sub-topics: hybrid data, limited multimodal behavior, language limitation, closed-source LLMs, and evaluation challenges." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The limitations are specific: 'this dataset may not fully reflect the nuances of real-world online conversations', 'images in our dataset can be replaced with their captions without much loss of information', 'LLMs are prone to generating verbose answers' creating evaluation challenges." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper states the pipeline works only for English, acknowledges synthetic data limitations, and notes that web-sourced images lack visual long-term consistency of personal photos. Section 9 explicitly states 'We do not make any recommendations for real-world policies based on this study.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "Raw data is promised for future release but not available at publication time." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "The entire data collection pipeline is described in detail: persona generation from MSC dataset, event graph construction, LLM agent dialog generation, human verification/editing (Sections 3.1-3.4 and Appendix A)." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "The human annotators are described only as 'in-house annotators' (Section B.3). 
No details on how they were selected, their qualifications, or demographics — 'we were unable to obtain their demographics due to the confidential nature of such information.'" 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is fully documented: persona selection → expansion via GPT-3.5 → event graph generation via text-davinci-003 → agent dialog generation → human filtering/editing. Statistics on edit rates provided (15% dialog turns, 19% images)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: UNC Chapel Hill, USC, and Snap Inc." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Three authors are from Snap Inc., which has commercial interest in conversational AI agents. This potential conflict is not discussed. No funding disclosure makes independence impossible to assess." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state training data cutoff dates for any of the evaluated models (GPT-3.5-turbo, GPT-4-turbo, Mistral-7B, Llama-2-Chat-70B)." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The LOCOMO benchmark is newly created, which reduces contamination risk, but this is not explicitly discussed. The MSC personas used as seeds are public and could be in training data." 
235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether the generated conversations or their source personas could overlap with model training data." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human-subjects study was conducted. Human annotators were used for dataset construction and as a QA performance reference, not as study subjects." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human-subjects study. Dataset annotation is not a human-subjects study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human-subjects study was conducted." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human-subjects study was conducted." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human-subjects study was conducted." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human-subjects study was conducted." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human-subjects study was conducted." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No API costs, token counts, or per-example costs reported despite extensive use of OpenAI APIs for both data generation and evaluation." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "MiniGPT-5 training took 'approximately 30 hours on a single A6000 GPU' (Section C.2). However, no total API costs for data generation or LLM evaluation are reported." 
289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Section C.2 explicitly states 'We report results from a single inference run for each model.' No seed sensitivity analysis." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Section C.2 states 'We report results from a single inference run for each model in our experiments.' The number is stated, though it is only 1." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search is described for evaluation. MiniGPT-5 uses 'default hyperparameters from original codebase' with no search." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "No description of how best configurations were selected across the various RAG top-k settings or model choices." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "Many comparisons across models, retrieval strategies, and question types with no statistical tests at all, let alone corrections for multiple comparisons." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors created the LOCOMO benchmark and evaluate models on it without acknowledging potential bias from evaluating on their own benchmark." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "Models of vastly different sizes (Mistral-7B vs GPT-4-turbo) are compared without discussing compute differences." 
326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "The paper does not discuss whether QA F1, FactScore, and MMRelevance actually measure 'long-term memory' as claimed, or whether the synthetic benchmark reflects real conversation dynamics." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "RAG models use DRAGON retriever + GPT-3.5-turbo-16k reader, confounding retrieval quality with reader capability. Base models have different context truncation strategies. These scaffolding differences are not addressed as confounds." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "The LOCOMO conversations are generated from MSC personas which are publicly available. No discussion of whether these personas or similar conversation patterns appeared in model training data." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether QA question phrasing provides hints." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "All 50 conversations are generated by gpt-3.5-turbo, and GPT-3.5-turbo is also used as a baseline model. The evaluated model generated the test data it is being tested on, which is a significant non-independence issue not discussed." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are employed." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "Long-context LLMs and RAG improve QA memory capabilities by 22-66% but still lag behind human performance by 56%.", 364 "evidence": "Table 2: GPT-3.5-turbo-16k at 16K context achieves 37.8 overall F1 vs human 87.9. Base GPT-3.5-turbo at 22.4. 
Improvement ranges from 22-66% depending on context size (Section 6.1).", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Long-context LLMs are prone to hallucinations on adversarial questions, with performance dropping to 2.1% compared to 70.2% for GPT-4-turbo base.", 369 "evidence": "Table 2: GPT-3.5-turbo-16k at 16K context scores 2.1% on adversarial questions vs GPT-4-turbo base at 70.2%.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "RAG is most effective when conversations are stored as observations rather than raw dialog or summaries.", 374 "evidence": "Table 3: Observation-based RAG achieves 41.4 overall F1 at top-5 vs dialog-based 31.7 and summary-based 29.9.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Long-context models show poor performance on event summarization, lagging behind the base model by 14%.", 379 "evidence": "Table 4: GPT-3.5-turbo achieves 45.9 FactScore F1 vs GPT-3.5-turbo-16k at 39.9.", 380 "supported": "moderate" 381 } 382 ], 383 "red_flags": [ 384 { 385 "flag": "Single-run results", 386 "detail": "All experiments are single inference runs with no variance reporting. Results could vary substantially across runs, especially for temperature=0 API calls which may still have non-determinism." 387 }, 388 { 389 "flag": "Evaluated model generated test data", 390 "detail": "GPT-3.5-turbo was used to generate the LOCOMO conversations and is also used as a baseline model. This creates a fundamental non-independence issue — the model may perform differently on data it generated vs. truly novel conversations." 391 }, 392 { 393 "flag": "Very small dataset", 394 "detail": "Only 50 conversations form the entire benchmark. No justification for why this sample size is sufficient for the claims made." 395 }, 396 { 397 "flag": "No statistical tests", 398 "detail": "All comparative claims are based on raw number comparisons with no significance testing. 
Given the small dataset (50 conversations), differences could easily be due to chance." 399 }, 400 { 401 "flag": "Company conflict not discussed", 402 "detail": "Three of six authors are from Snap Inc., which has commercial interest in conversational AI. No competing interests disclosure." 403 } 404 ], 405 "cited_papers": [ 406 { 407 "title": "Generative agents: Interactive simulacra of human behavior", 408 "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"], 409 "year": 2023, 410 "relevance": "Foundation architecture for LLM-based agents with memory and reflection, directly used in LOCOMO's generative pipeline." 411 }, 412 { 413 "title": "Beyond goldfish memory: Long-term open-domain conversation", 414 "authors": ["Jing Xu", "Arthur Szlam", "Jason Weston"], 415 "year": 2022, 416 "relevance": "MSC dataset and prior work on long-term dialogue that LOCOMO extends by 9x in conversation length." 417 }, 418 { 419 "title": "Lost in the Middle: How Language Models Use Long Contexts", 420 "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt", "Ashwin Paranjape", "Michele Bevilacqua", "Fabio Petroni", "Percy Liang"], 421 "year": 2024, 422 "relevance": "Documents LLM challenges with long context utilization, directly relevant to LOCOMO's findings on long-context model failures." 423 }, 424 { 425 "title": "FActScore: Fine-grained atomic evaluation of factual precision in long form text generation", 426 "authors": ["Sewon Min", "Kalpesh Krishna", "Xinxi Lyu"], 427 "year": 2023, 428 "relevance": "Evaluation metric used for event summarization task in LOCOMO benchmark." 429 }, 430 { 431 "title": "Retrieval augmentation reduces hallucination in conversation", 432 "authors": ["Kurt Shuster", "Spencer Poff", "Moya Chen", "Douwe Kiela", "Jason Weston"], 433 "year": 2021, 434 "relevance": "Foundational work on RAG for dialogue, directly relevant to LOCOMO's RAG evaluation." 
435 }, 436 { 437 "title": "Conversation Chronicles: Towards diverse temporal and relational dynamics in multi-session conversations", 438 "authors": ["Jihyoung Jang", "Minseong Boo", "Hyounghun Kim"], 439 "year": 2023, 440 "relevance": "Most comparable prior work on multi-session dialogues, limited to 5 sessions vs LOCOMO's 35." 441 }, 442 { 443 "title": "Replug: Retrieval-augmented black-box language models", 444 "authors": ["Weijia Shi", "Sewon Min", "Michihiro Yasunaga"], 445 "year": 2023, 446 "arxiv_id": "2301.12652", 447 "relevance": "RAG technique for augmenting LLMs, relevant to retrieval-based approaches evaluated in LOCOMO." 448 }, 449 { 450 "title": "MemoryBank: Enhancing large language models with long-term memory", 451 "authors": ["Wanjun Zhong", "Lianghong Guo", "Qiqi Gao", "Yanlin Wang"], 452 "year": 2023, 453 "arxiv_id": "2305.10250", 454 "relevance": "LLM memory augmentation approach directly relevant to the long-term memory evaluation problem LOCOMO addresses." 455 }, 456 { 457 "title": "Booookscore: A systematic exploration of book-length summarization in the era of LLMs", 458 "authors": ["Yapei Chang", "Kyle Lo", "Tanya Goyal", "Mohit Iyyer"], 459 "year": 2023, 460 "relevance": "Long-form summarization evaluation methodology relevant to LOCOMO's event summarization task." 461 }, 462 { 463 "title": "How to train your dragon: Diverse augmentation towards generalizable dense retrieval", 464 "authors": ["Sheng-Chieh Lin", "Akari Asai", "Minghan Li"], 465 "year": 2023, 466 "relevance": "DRAGON retrieval model used in LOCOMO's RAG experiments." 467 } 468 ] 469 }