scan-v5.json (26646B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Evaluating Very Long-Term Conversational Memory of LLM Agents", 6 "authors": [ 7 "Adyasha Maharana", 8 "Dong-Ho Lee", 9 "Sergey Tulyakov", 10 "Mohit Bansal", 11 "Francesco Barbieri", 12 "Yuwei Fang" 13 ], 14 "year": 2024, 15 "venue": "arXiv", 16 "arxiv_id": "2402.17753", 17 "doi": null 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "All major abstract claims—LOCOMO dataset statistics, LLM performance gaps vs humans, RAG/long-context improvements—are supported by Tables 2–4 and the documented pipeline.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Causal claims about RAG and observation-based retrieval are supported by ablations across retrieval units and top-k values in Tables 3 and 6; MiniGPT-5 ablations compare Base/+summary/+observation variants.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper makes broad claims about LLMs struggling with 'lengthy conversations' and 'temporal reasoning' without bounding generalizability beyond 50 synthetic English-only conversations from a single pipeline.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not consider alternative explanations for key findings—e.g., adversarial performance collapse in long-context models could reflect evaluation metric artifacts or context-position effects rather than hallucination per se.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "Section 8 explicitly acknowledges that F1/ROUGE are imperfect proxies for memory capability due to LLM verbosity, distinguishing measured scores from claimed 'memory' capabilities.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 8 'Limitations' contains five distinct, substantive subsections covering data quality, multimodal coverage, language, closed-source API dependency, and evaluation metrics.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats are named: LLM-generated data may miss real-world nuance, images lack personal visual consistency, pipeline is English-only, GPT dependency limits reproducibility, and LLM verbosity confounds F1 evaluation.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper explicitly bounds scope to English, 50 conversations, specific evaluated models, and acknowledges LOCOMO 'may not fully reflect the nuances of real-world online conversations.'", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding acknowledgment or disclosure appears anywhere in the paper text.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations (UNC Chapel Hill, USC, Snap Inc.) are clearly stated on the first page.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "The project is hosted at snap-research.github.io suggesting Snap funding; however, the paper evaluates external LLMs (GPT, LLaMA, Mistral), not Snap's own products, so the funder is not a direct beneficiary of specific model results.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement or declaration of financial interests is present.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "'Very long-term' is operationally defined (300 turns, 9K tokens, up to 35 sessions) and 'memory' is defined through three concrete evaluation tasks (QA, event summarization, dialogue generation).", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper clearly states its three contributions: the LOCOMO dataset, the machine-human generation pipeline, and the multi-task evaluation benchmark.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2 systematically situates LOCOMO against existing dialogue datasets in Table 1, discussing specific quantitative limitations of prior work (e.g., MSC's ~1K tokens over 4 sessions) and how this paper addresses them.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "Footnote 1 states code and data are 'to be available' at snap-research.github.io/locomo — a promise of future release, not a current one.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": false, 131 "justification": "LOCOMO is promised for future release under CC BY-NC 4.0; it was not available at time of submission.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper mentions 'OpenAI API and Huggingface, as of January 2024' and an Nvidia A6000 server with FP32, but no requirements.txt, Dockerfile, or explicit package versions are provided.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions are provided; Appendix C gives high-level experimental descriptions but lacks runnable workflow documentation.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "The paper explicitly states 'We report results from a single inference run for each model'; no confidence intervals or error bars appear anywhere.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No statistical significance tests are used despite multiple comparative claims across six models and three retrieval conditions.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Effect sizes are reported as absolute F1 differences and percentage improvements (22–66% for RAG; 56% gap vs human) with baseline values provided for context.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The choice of 50 conversations for LOCOMO is not justified through power analysis, coverage arguments, or comparison with what would be needed for reliable conclusions.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "Single-run results are reported throughout; no variance, standard deviation, or run-to-run variability is reported for any experiment.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Multiple baseline models (Mistral-7B, LLaMA-70B, GPT-3.5, GPT-4) plus human performance are included across tasks.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "All evaluated models (GPT-4-turbo, GPT-3.5-turbo-16K, Mistral-7B, LLaMA-70B) were state-of-the-art as of January 2024.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "RAG ablations compare dialog/observation/summary retrieval units at multiple top-k values (Table 3); MiniGPT-5 ablations compare Base/+summary/+observation training variants (Table 6).", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "QA uses F1 and Recall@k; summarization uses ROUGE-1/2/L and FactScore precision/recall/F1; dialogue generation uses BLEU, ROUGE-L, BertScore, and MMRelevance.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": true, 207 "justification": "Human performance is measured on the QA task (Table 2: 87.9 overall F1), providing an upper-bound benchmark across all five reasoning categories.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "LOCOMO serves as a held-out evaluation set; no evaluated model was trained on LOCOMO conversations (MiniGPT-5 trains on a separately generated set without human filtering).", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "QA results are broken down by five reasoning categories (single-hop, multi-hop, temporal, open-domain, adversarial) across all model conditions in Tables 2 and 3.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 6.2 identifies five error categories for event summarization with examples (Table 7); Section 6.1 analyzes adversarial question failure and hallucination patterns in long-context models.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "GPT-3.5-16K scores only 2.1% on adversarial questions vs 12.8% for the base model, and underperforms on event summarization despite larger context — both are highlighted and analyzed rather than downplayed.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": false, 239 "justification": "Models are referenced as 'gpt-3.5-turbo' and 'gpt-4-turbo' with links to general documentation pages rather than specific versioned snapshots that change over time.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Appendix A includes actual prompts for persona generation, event graph generation, session summarization, observation extraction, and image sharing/reaction behaviors (Figures 5–10).", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Temperature=0 and top_p=1 are reported for evaluation; MiniGPT-5 is trained for 10 epochs (~30 hours on A6000) using original codebase defaults for remaining hyperparameters.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": true, 256 "answer": true, 257 "justification": "The reflect-and-respond memory architecture (short-term session summary, long-term observation database, retrieval during generation) is described in detail in Section 3.3 and Appendix A.2.1.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Image-to-caption conversion using BLIP-2 for QA/summarization tasks, F1 answer normalization, and RAG retrieval procedures are documented in Sections 4–5 and Appendix C.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": false, 271 "justification": "Raw LOCOMO data is promised for future release but not available at time of paper submission.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "The full collection pipeline is documented with quantitative annotation statistics (15% of turns edited, 19% images removed/substituted) and specific annotator task descriptions in Section 3.4.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": true, 282 "answer": false, 283 "justification": "Annotators are described only as 'in-house annotators'; demographics and recruitment methods are withheld due to 'confidential nature of such information' (Appendix B.3).", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The complete pipeline from persona selection through event graph generation, dialogue synthesis, human annotation, and benchmark construction is documented with prompts and worked examples.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Training data cutoffs for GPT-3.5-turbo, GPT-4-turbo, LLaMA-70B, and Mistral-7B are not stated anywhere in the paper.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "The paper does not discuss whether MSC source personas (which seed LOCOMO) or LLM-generated conversation patterns could overlap with model training distributions.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "No analysis of whether the MSC personas or evaluation QA question patterns may have been encountered during pretraining of evaluated models.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No formal human subjects study; human participation is limited to in-house annotation and performance benchmarking on the created dataset.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "Not applicable; in-house annotators are employees, not external research participants requiring IRB review.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "Not applicable as a formal human subjects study; annotator demographics are withheld as confidential per Appendix B.3.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "Not applicable; no formal external participant recruitment.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "Not applicable; no human subjects experiment with randomization.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "Not applicable; annotation tasks do not require blinding.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "Not applicable; no longitudinal human participants study.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "The paper acknowledges using 'strongest commercial LLMs available through a paid API' but reports no actual API costs, token counts, or per-query pricing.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Only MiniGPT-5 training time (30 hours on a single A6000) is stated; no compute budget for the main API-based evaluation experiments is reported.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Long-context LLMs and RAG improve QA memory performance by 22–66% over base models", 376 "evidence": "Table 2: GPT-3.5-16K reaches 37.8% overall F1 vs 22.4% for GPT-3.5-4K; Table 3: RAG with observations achieves 41.4% overall F1", 377 "supported": "strong" 378 }, 379 { 380 "claim": "All evaluated models significantly lag behind human performance (56% gap on QA overall F1)", 381 "evidence": "Table 2 shows human performance at 87.9 vs best model GPT-3.5-16K at 37.8, a 50-point gap", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Long-context LLMs show dramatically degraded performance on adversarial questions vs base models", 386 "evidence": "Table 2: GPT-3.5-16K scores 2.1% on adversarial at 16K context vs 12.8% for GPT-3.5-4K base", 387 "supported": "strong" 388 }, 389 { 390 "claim": "RAG with observation-based retrieval outperforms session-summary retrieval on temporal reasoning", 391 "evidence": "Table 3: Observations achieve 41.9% F1 on temporal at top-5 vs summaries at 31.0% for top-5; however both are well below human performance of 92.6%", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Long-context models underperform base models on event summarization despite larger context windows", 396 "evidence": "Table 4: GPT-3.5-16K achieves FactScore F1 of 39.9 vs GPT-3.5-4K base at 45.9", 397 "supported": "strong" 398 }, 399 { 400 "claim": "LOCOMO conversations are substantially longer (9x tokens, 4x sessions) than prior state-of-the-art long-term dialogue datasets", 401 "evidence": "Table 1: LOCOMO averages 9,209 tokens and 19.3 sessions vs MSC's 1,226 tokens and 4 sessions", 402 "supported": "strong" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval" 407 ], 408 "key_findings": "LOCOMO is the first benchmark for very long-term dialogues (~300 turns, 9K tokens, up to 35 sessions), substantially exceeding prior datasets. Current LLMs achieve only 22–38% F1 on QA vs 87.9% human performance, with temporal reasoning the hardest category (73% below human). Counterintuitively, long-context LLMs degrade severely on adversarial questions (2.1% vs 12.8% baseline) and underperform base models on event summarization, suggesting extended context may amplify hallucination rather than reduce it. RAG with speaker observations offers the best accuracy-comprehension tradeoff.", 409 "red_flags": [ 410 { 411 "flag": "Single-run evaluation, no variance", 412 "detail": "All results are from a single inference run with no error bars, CIs, or repeated trials, making effect sizes unreliable for the comparative claims made." 413 }, 414 { 415 "flag": "No statistical significance testing", 416 "detail": "No significance tests are used despite multiple comparative claims across models and retrieval conditions in a 50-conversation corpus." 417 }, 418 { 419 "flag": "Code and data not released at submission", 420 "detail": "Both code and LOCOMO data are promised for future release; reproduction is impossible at time of publication." 421 }, 422 { 423 "flag": "GPT model versions unspecified", 424 "detail": "GPT-3.5-turbo and GPT-4-turbo are referenced by marketing names linked to general documentation pages, not specific versioned snapshots." 425 }, 426 { 427 "flag": "Human baseline may be inflated by familiarity", 428 "detail": "Human QA performance (87.9%) is measured using the same in-house annotators who created and verified the dataset, potentially inflating the upper bound." 429 }, 430 { 431 "flag": "Contamination not addressed", 432 "detail": "No analysis of whether MSC source personas or LLM-generated content could overlap with training data of evaluated models; training cutoffs are unstated." 433 }, 434 { 435 "flag": "Small corpus (n=50 conversations)", 436 "detail": "All benchmark evaluations are grounded in only 50 synthetic conversations; this limits statistical confidence in findings and subgroup analyses." 437 } 438 ], 439 "cited_papers": [ 440 { 441 "title": "Beyond Goldfish Memory: Long-Term Open-Domain Conversation", 442 "relevance": "Primary predecessor dataset (MSC) that LOCOMO extends; also provides source personas for pipeline initialization" 443 }, 444 { 445 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 446 "relevance": "Memory architecture (reflect-and-respond with observations) directly adapted for LOCOMO's dialogue generation pipeline" 447 }, 448 { 449 "title": "Lost in the Middle: How Language Models Use Long Contexts", 450 "relevance": "Cited to explain long-context model failures on adversarial questions and event summarization in LOCOMO" 451 }, 452 { 453 "title": "FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation", 454 "relevance": "Key evaluation metric adopted for the event summarization task to measure factual precision and recall" 455 }, 456 { 457 "title": "Conversation Chronicles: Towards Diverse Temporal and Relational Dynamics in Multi-Session Conversations", 458 "relevance": "Prior multi-session dialogue dataset directly compared to LOCOMO in Table 1; prior art for temporal dialogue" 459 }, 460 { 461 "title": "How to Train Your DRAGON: Diverse Augmentation Towards Generalizable Dense Retrieval", 462 "relevance": "Retrieval model used in all RAG experiments" 463 }, 464 { 465 "title": "BooookScore: A Systematic Exploration of Book-Length Summarization in the Era of LLMs", 466 "relevance": "Cited for incremental summarization approach applied in event summarization experiments" 467 }, 468 { 469 "title": "MiniGPT-5: Interleaved Vision-and-Language Generation via Generative Vokens", 470 "relevance": "Base model for all multimodal dialogue generation experiments and ablations" 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "LOCOMO provides a concrete benchmark for developers building long-term conversational agents, directly measuring memory failure modes that affect deployed chatbots." 477 }, 478 "surprise_contrarian": { 479 "score": 2, 480 "justification": "Long-context LLMs performing worse than constrained-context base models on adversarial questions and event summarization directly contradicts the intuition that more context always helps." 481 }, 482 "fear_safety": { 483 "score": 1, 484 "justification": "Broader Impacts section raises parasocial relationship risks from realistic long-term agents and misinformation risks from multimodal generation, but these are brief rather than central findings." 485 }, 486 "drama_conflict": { 487 "score": 1, 488 "justification": "No major controversy or conflict angle; standard benchmark paper despite the counterintuitive long-context degradation finding." 489 }, 490 "demo_ability": { 491 "score": 1, 492 "justification": "Code and data are promised but not yet released; practitioners cannot currently reproduce or try the benchmark." 493 }, 494 "brand_recognition": { 495 "score": 1, 496 "justification": "Snap Inc. is a recognizable company but not a primary AI research lab; UNC and USC are respected but not top-tier AI venues by brand alone." 497 } 498 }, 499 "hn_data": { 500 "threads": [ 501 { 502 "hn_id": "39568622", 503 "title": "ArtPrompt: ASCII Art-Based Jailbreak Attacks Against Aligned LLMs", 504 "points": 145, 505 "comments": 55, 506 "url": "https://news.ycombinator.com/item?id=39568622", 507 "created_at": "2024-03-02T00:30:06Z" 508 }, 509 { 510 "hn_id": "39465357", 511 "title": "LongRoPE: Extending LLM Context Window Beyond 2M Tokens", 512 "points": 142, 513 "comments": 46, 514 "url": "https://news.ycombinator.com/item?id=39465357", 515 "created_at": "2024-02-22T10:44:35Z" 516 }, 517 { 518 "hn_id": "39811319", 519 "title": "Rose: Efficient and Extensible Autodiff on the Web", 520 "points": 3, 521 "comments": 0, 522 "url": "https://news.ycombinator.com/item?id=39811319", 523 "created_at": "2024-03-24T23:03:03Z" 524 }, 525 { 526 "hn_id": "39462835", 527 "title": "Microsoft's LongRoPE: Extending LLM Context Window Beyond 2M Tokens", 528 "points": 3, 529 "comments": 0, 530 "url": "https://news.ycombinator.com/item?id=39462835", 531 "created_at": "2024-02-22T03:24:29Z" 532 }, 533 { 534 "hn_id": "47203853", 535 "title": "Show HN: Engram – Memory for AI coding agents (2.5K installs, 80% on LOCOMO)", 536 "points": 1, 537 "comments": 0, 538 "url": "https://news.ycombinator.com/item?id=47203853", 539 "created_at": "2026-03-01T04:55:07Z" 540 }, 541 { 542 "hn_id": "45007581", 543 "title": "Evaluating Long-Term Conversational Memory of LLM Agents", 544 "points": 1, 545 "comments": 0, 546 "url": "https://news.ycombinator.com/item?id=45007581", 547 "created_at": "2025-08-24T20:42:16Z" 548 } 549 ], 550 "top_points": 145, 551 "total_points": 295, 552 "total_comments": 101 553 } 554 }