scan.json (24051B)
1 { 2 "paper": { 3 "title": "MemGPT: Towards LLMs as Operating Systems", 4 "authors": ["Charles Packer", "Sarah Wooders", "Kevin Lin", "Vivian Fang", "Shishir G. Patil", "Ion Stoica", "Joseph E. Gonzalez"], 5 "year": 2023, 6 "venue": "arXiv", 7 "arxiv_id": "2310.08560", 8 "doi": "10.48550/arXiv.2310.08560" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "MemGPT uses OS-inspired virtual context management to provide the illusion of extended context for LLMs. On deep memory retrieval, MemGPT with GPT-4 Turbo achieves 93.4% accuracy vs 35.3% for the baseline. On document QA, MemGPT's performance is unaffected by increasing context length unlike fixed-context baselines. On nested key-value retrieval, MemGPT with GPT-4 is the only approach that consistently completes multi-hop lookups beyond 2 nesting levels.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states 'We release MemGPT code and data for our experiments at https://research.memgpt.ai' in the abstract and Section 3." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper states 'We publicly release our augmented MSC dataset, nested KV retrieval dataset, and a dataset of embeddings for 20M Wikipedia articles' in Section 3." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or detailed environment setup listing library versions is provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper references a code release URL but does not include step-by-step reproduction instructions within the paper itself." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Tables 2 and 3 report point estimates only (e.g., '93.4%' accuracy, '0.827' ROUGE-L) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims MemGPT 'significantly outperforms' baselines (Table 2) but provides no statistical significance tests — comparisons are based on raw number differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are contextualizable from the results: e.g., GPT-4 Turbo baseline 35.3% vs MemGPT 93.4% accuracy (Table 2), providing both absolute values for comparison." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The document QA task uses 50 sampled questions (Section 3.2.1), nested KV uses 30 configurations (Section 3.2.2), but no justification for these sample sizes is provided." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported across runs for any experiment." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "MemGPT is compared against fixed-context baselines using GPT-3.5 Turbo, GPT-4, and GPT-4 Turbo across all three tasks (Tables 2, 3, Figures 5, 7)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines use GPT-4 Turbo (128k context), which was state-of-the-art at time of writing (data collected 1/2024 per Table 1)." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study is presented to isolate which components of MemGPT (working context, archival storage, queue manager, function chaining) contribute most to performance." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "DMR task reports both accuracy and ROUGE-L (Table 2). Conversation opener reports SIM-1, SIM-3, and SIM-H (Table 3)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Evaluation relies entirely on automated metrics (ROUGE-L, LLM judge accuracy, cosine similarity). No human evaluation of system outputs is conducted despite the conversational agent domain where subjective quality matters." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The MSC dataset sessions 1-5 are used for history and a new session 6 with QA pairs is used for evaluation (Section 3.1.1). Document QA uses a separate sampled question set." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by underlying model (GPT-3.5, GPT-4, GPT-4 Turbo) and by task variant. Nested KV results are broken down by nesting level (Figure 7). Document QA shows performance vs number of documents retrieved (Figure 5)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 3.2.1 discusses that 'MemGPT will often stop paging through retriever results before exhausting the retriever database.' Section 3.2.2 notes GPT-3.5's failure mode: 'its primary failure mode is to simply return the original value.' MemGPT with GPT-3.5 has 'significantly degraded performance.'" 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "MemGPT with GPT-3.5 performs poorly on document QA (Figure 5) and nested KV (Figure 7). 
MemGPT with GPT-4 Turbo performs worse than MemGPT with GPT-4 on nested KV, which is explicitly noted." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims MemGPT can analyze large documents and create conversational agents with long-term memory. Both claims are supported by results in Sections 3.1 and 3.2." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims MemGPT 'improves' performance (e.g., 'MemGPT clearly improves the performance of the underlying base LLM'). Without ablations or controlled experiments isolating the memory hierarchy's contribution vs other design choices, these causal claims are not well justified." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title 'Towards LLMs as Operating Systems' and the framing as a general 'OS-inspired design' are broader than the evidence, which covers only two application domains (conversational agents on MSC, and document analysis via document QA and nested KV retrieval) and uses OpenAI models only." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the results. For example, the improvement could be partly due to having more retrieval calls rather than the memory hierarchy per se, or the baseline summarization approach may be particularly weak." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses LLM judges to evaluate correctness and engagement (CSIM scores as proxy for 'engagement'), but does not discuss the gap between these proxies and the actual constructs being measured. LLM judge reliability is mentioned briefly but the proxy-outcome distinction is not explicitly acknowledged." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 3 specifies exact model endpoints: 'gpt-4-1106-preview', 'gpt-4-0613', and 'gpt-3.5-turbo-1106' with their context window sizes." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The appendix (Sections 6.1.1-6.1.6) provides actual prompt text used for MemGPT instructions, baselines, LLM judges, dataset generation, and document analysis, with caveats about editing for brevity." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No temperature, top-p, or other sampling hyperparameters are reported for the API calls. Only context window sizes and threshold percentages (70% warning, 100% flush) are mentioned." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The MemGPT scaffolding is described in detail in Section 2: main context structure (system instructions, working context, FIFO queue), queue manager with eviction policies, function executor, function chaining mechanism, and external storage tiers (archival and recall)." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3.2.1 documents the retriever pipeline: 2018 Wikipedia dump, OpenAI text-embedding-ada-002 embeddings, PostgreSQL with pgvector using HNSW index. Section 3.1 describes how DMR QA pairs were generated from MSC." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section exists. The conclusion mentions 'future exploration' but does not discuss limitations of the current work." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. The conclusion frames the work broadly as 'a promising new direction for maximizing the capabilities of LLMs' without bounding the claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper releases the augmented MSC dataset, nested KV retrieval dataset, and Wikipedia embeddings at https://research.memgpt.ai." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.1.1 describes DMR dataset generation using a separate LLM with detailed prompts (Appendix 6.1.3). Section 3.2.1 describes the Wikipedia dump source and embedding procedure. Section 3.2.2 describes the nested KV task construction." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants — the study uses existing datasets (MSC) and synthetic benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from MSC dataset to DMR evaluation is documented: original MSC sessions → LLM-generated QA pairs → evaluation. Document QA pipeline: Wikipedia dump → embeddings → retriever → reader evaluation." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are listed as affiliated with University of California, Berkeley." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Funding is not disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present. MemGPT became a commercial product (Letta), and several authors are associated with it, but this is not disclosed in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses GPT-3.5, GPT-4, and GPT-4 Turbo on NaturalQuestions-Open and MSC datasets but does not state the training data cutoff dates for these models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "NaturalQuestions-Open is from 2019 and the MSC dataset from 2021 — both likely in GPT-4's training data. No discussion of potential overlap." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "NaturalQuestions-Open and MSC were published well before GPT-4's training cutoff. No contamination analysis is performed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "MemGPT makes multiple LLM inference calls per user query (function chaining), but no inference cost, token consumption, or latency data is reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget, API spend, or hardware information is provided." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not stated for any experiment." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The memory pressure thresholds (70% warning, 100% flush, 50% eviction) appear tuned but no search budget is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The thresholds and design choices (e.g., eviction percentages) are presented without justification for why these specific values were chosen." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement both MemGPT and the baselines. The baselines use 'lossy summarization' which is a weak comparison strategy. No acknowledgment of self-comparison bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "MemGPT makes multiple LLM calls per query (function chaining) while baselines make one. This compute disparity is never discussed or controlled for." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the DMR task, conversation opener task, or nested KV task actually measure the claimed capabilities (consistency, engagement, document analysis ability)." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "MemGPT IS the scaffold being evaluated. The paper evaluates the scaffolding system itself rather than comparing models within different scaffolds." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "NaturalQuestions-Open (2019) and MSC (2021) predate GPT-4's training. The models may have seen these benchmarks and their solutions. Not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the LLM judge or the evaluation setup might leak answer information." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of potential overlap between training data and evaluation data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "MemGPT with GPT-4 Turbo achieves 93.4% accuracy on deep memory retrieval, compared to 35.3% for the GPT-4 Turbo baseline.", 365 "evidence": "Table 2 shows DMR accuracy and ROUGE-L scores for all model/MemGPT combinations.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "MemGPT crafts conversation openers that perform similarly to or exceed human-written openers.", 370 "evidence": "Table 3 shows SIM-1 scores of 0.830-0.868 for MemGPT vs 0.800 for humans, and SIM-H scores of 0.767-0.817.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "MemGPT's document QA performance is unaffected by increased context length, unlike fixed-context baselines.", 375 "evidence": "Figure 5 shows flat MemGPT performance as documents increase, while baselines degrade with truncation.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "MemGPT with GPT-4 is the only approach that consistently completes nested KV retrieval beyond 2 nesting levels.", 380 "evidence": "Figure 7 shows MemGPT with GPT-4 maintaining near-perfect accuracy through 3 nesting levels while all baselines drop to 0%.", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "No statistical rigor", 387 "detail": "All results are single-run point estimates with no variance, confidence intervals, or significance tests. Claims of 'significant' outperformance are made without any statistical testing." 388 }, 389 { 390 "flag": "Unfair compute comparison", 391 "detail": "MemGPT makes multiple LLM inference calls per query via function chaining, while baselines make a single call. The compute cost disparity is never quantified or discussed." 392 }, 393 { 394 "flag": "Weak baseline design", 395 "detail": "The DMR baseline uses 'lossy summarization' of past conversations, which is one of many possible baseline approaches. No comparison against RAG baselines or other memory-augmented systems." 
396 }, 397 { 398 "flag": "No limitations section", 399 "detail": "The paper has no limitations, threats to validity, or scope-bounding discussion." 400 }, 401 { 402 "flag": "Benchmark contamination risk", 403 "detail": "NaturalQuestions-Open (2019) and MSC (2021) likely appear in GPT-4's training data. MemGPT's advantage could partly stem from the model having memorized answers that it retrieves through memory search." 404 }, 405 { 406 "flag": "Self-generated evaluation data", 407 "detail": "The DMR QA pairs were generated by an LLM, introducing potential systematic biases in what kinds of questions are asked and what counts as correct." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "AgentBench: Evaluating LLMs as Agents", 413 "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"], 414 "year": 2023, 415 "arxiv_id": "2308.03688", 416 "relevance": "Benchmark for evaluating LLMs as interactive agents, directly relevant to agentic AI evaluation." 417 }, 418 { 419 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 420 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 421 "year": 2023, 422 "arxiv_id": "2302.04761", 423 "relevance": "Foundational work on LLM tool use that MemGPT builds upon for function calling." 424 }, 425 { 426 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 427 "authors": ["Joon Sung Park", "Joseph C O'Brien", "Carrie J Cai"], 428 "year": 2023, 429 "arxiv_id": "2304.03442", 430 "relevance": "Proposes memory for LLM agents in multi-agent settings, directly related to agent memory management." 431 }, 432 { 433 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 434 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 435 "year": 2022, 436 "arxiv_id": "2210.03629", 437 "relevance": "Foundational work on interleaving reasoning and acting in LLM agents." 
438 }, 439 { 440 "title": "Lost in the Middle: How Language Models Use Long Contexts", 441 "authors": ["Nelson F Liu", "Kevin Lin", "John Hewitt"], 442 "year": 2023, 443 "arxiv_id": "2307.03172", 444 "relevance": "Key finding on long-context model limitations that motivates MemGPT's approach; benchmark tasks reused." 445 }, 446 { 447 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 448 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 449 "year": 2020, 450 "relevance": "Foundational RAG paper relevant to understanding retrieval-augmented approaches vs MemGPT's memory hierarchy." 451 }, 452 { 453 "title": "Active Retrieval Augmented Generation", 454 "authors": ["Zhengbao Jiang", "Frank F Xu", "Luyu Gao"], 455 "year": 2023, 456 "arxiv_id": "2305.06983", 457 "relevance": "FLARE method for active retrieval during generation, closely related to MemGPT's self-directed retrieval." 458 }, 459 { 460 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 461 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 462 "year": 2022, 463 "relevance": "Foundational prompting technique used in MemGPT's function chaining and planning." 464 }, 465 { 466 "title": "WebGPT: Browser-Assisted Question-Answering with Human Feedback", 467 "authors": ["Reiichiro Nakano", "Jacob Hilton", "Suchir Balaji"], 468 "year": 2021, 469 "arxiv_id": "2112.09332", 470 "relevance": "Uses similar pagination concepts for context management in web browsing, precursor to MemGPT's approach." 471 }, 472 { 473 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 474 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 475 "year": 2023, 476 "arxiv_id": "2306.05685", 477 "relevance": "LLM-as-judge evaluation methodology used in MemGPT's evaluation, relevant to automated evaluation practices." 478 } 479 ] 480 }