scan.json (21532B)
1 { 2 "paper": { 3 "title": "Cost and accuracy of long-term memory in Distributed Multi-Agent Systems based on Large Language Models", 4 "authors": ["Benedict Wolff", "Jacopo Bennati"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2601.07978" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository provided: https://github.com/wolffbe/dmas-memory. The abstract states 'the source code of the testbed, experimental results as well as notebooks for analyzing and evaluating the results are available.'" 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states experimental results are available at the GitHub repository. The benchmark used (LoCoMo) is publicly available." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "Appendix A.4 (Table 7) lists detailed component versions including Docker 4.48.0, Python 3.11-slim, Neo4j 5.26, Ubuntu 22.04.3 LTS, and hardware specs (12th-gen Intel Core i5-12450H). Docker-based setup provides reproducible environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": true, 29 "justification": "Section 2.2 describes a Makefile that 'automates the testbed setup, and executes the analysis Jupyter notebook.' The repository contains the testbed code, Makefile, and analysis notebooks." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "Table 5 reports 95% Wilson confidence intervals for accuracy rates (e.g., Graphiti unconstrained: [0.074, 0.162])." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "Table 6 reports two-proportion z-tests with z-statistics and p-values for all pairwise comparisons. The paper correctly concludes no significant differences at α = 0.05." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage differences with baseline context throughout (e.g., 'Graphiti was 3.6% more accurate than mem0', 'mem0 was 86.5% faster during the loading phase', cost differentials of 40.2%). Raw numbers and percentages are provided." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "N=199 questions from one conversation of the LoCoMo benchmark. The paper states 'a robust sample size with statistical relevance was ensured' but provides no power analysis or formal justification for why 199 questions from a single conversation is sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across runs. The experiments appear to be single-run only — no mention of multiple experimental runs or seeds." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Two memory frameworks (mem0 and Graphiti) are compared against each other across two network conditions, providing a comparative evaluation." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Both mem0 and Graphiti are contemporary frameworks. The paper notes mem0 'recently secured $24 million USD in funding' and Neo4j (Graphiti's backend) raised $325M in Series F, both in 2025." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is performed. The system has multiple components (coordinator, memory agent, responder, SLM for tool-calling, LLM for answering) but none are ablated to understand individual contributions." 
74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics reported: accuracy, IDK rate, CPU usage, RAM usage, disk usage, network bandwidth, token consumption, financial cost, execution time, string similarity, and semantic similarity." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of system outputs. Accuracy is measured automatically via string and semantic similarity against LoCoMo ground truth answers. Given the subjective nature of Q&A quality, human evaluation would be relevant." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "Not a machine learning training study. The benchmark questions serve as the evaluation set directly; there is no model training or tuning involved." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by experiment type (unconstrained/constrained), phase (loading/Q&A), memory framework, and cloud vs. edge in Tables 1-4. Response distributions (correct/wrong/IDK) are shown per condition." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.4 discusses the high IDK rate as a failure mode. The limitations note 'the DMAS responded to a significant number of questions with IDK' and suggest investigating root causes. Appendix A.3 shows an example of retrieved memories." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that network constraints had 'limited impact on the overall DMAS performance' (less than 1% cost variation), which was contrary to what might be expected. The high IDK rate (58-68%) is also a negative result reported transparently." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract's claims that mem0 outperforms Graphiti in efficiency and that accuracy differences are not statistically significant are supported by Tables 1-6 and the z-test results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper uses a factorial experimental design with two independent variables (memory framework, network profile) and controlled conditions. Claims like 'mem0 is the Pareto-optimal solution' follow from the controlled comparison. The design adequately supports the comparative claims made." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper tests on a single conversation from LoCoMo with a single LLM (gpt-4o-mini) and single SLM (qwen2.5:3b), but makes broad claims about 'DMAS memory' and 'the optimal choice' without adequately bounding to the specific models, benchmark, and conversation tested." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for the results. For example, the high IDK rate could be due to the SLM's tool-calling quality, the responder prompt design, or memory retrieval quality — none of these are explored as confounds." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper specifies 'gpt-4o-mini' without a snapshot date or API version. The SLM 'qwen2.5:3b-instruct-q4_K_M' names a specific quantization, but comes from an Ollama image tagged 'latest'." 
133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Appendix A.2 provides the full system and user prompts for both the coordinator and responder agents, including the exact text with placeholders and their fill values explained." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No mention of temperature, top-p, max tokens, or other sampling parameters for gpt-4o-mini or the Ollama-hosted SLM. These significantly affect output quality." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 2.2 describes the multi-agent architecture in detail: coordinator with tool-calling SLM, memory agent with mem0/Graphiti backends, responder with LLM, proxy routing, and the loading/Q&A workflow. Figure 1 provides the system architecture diagram." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 2.1 describes how LoCoMo conversations are decomposed into sessions and turns, loaded via the coordinator's /remember endpoint. Appendix A.1 shows the JSON structure. The similarity scoring method is described in Section 2.1." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 4.4 'Limitations and future work' is a dedicated section discussing specific limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 4.4 identifies specific threats: the DMAS had 'little autonomy over its own actions,' the network constraints had 'limited impact,' and the high IDK rate needs investigation. These are specific to this study." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The limitations section discusses what future work should do but does not explicitly state what the current results do NOT show. For example, it does not state that results are limited to one conversation, one LLM, or one benchmark, despite these being significant scope limitations." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The paper states 'experimental results as well as notebooks for analyzing and evaluating the results are available' at the GitHub repository. Results are stored in CSV files." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 2.1 describes how data was collected: ten dependent variables measured during loading and Q&A phases, using Prometheus for metrics, telegraf for CPU/RAM/disk/network, and the analysis Jupyter notebook for Q&A results." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The study uses the LoCoMo benchmark dataset, which is a standard public benchmark." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 2.2 documents the full pipeline: LoCoMo loaded via API → coordinator decomposes into turns → sent to memory agent → Q&A phase collects answers → analysis notebook stores CSV files. Monitoring via Prometheus and telegraf is also documented." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section found in the paper. Authors are from KTH Royal Institute of Technology but no funding source is disclosed." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations with KTH Royal Institute of Technology are clearly stated. Neither author appears affiliated with mem0 or Neo4j/Graphiti." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information disclosed, so independence cannot be assessed. The paper does mention mem0's $24M funding and Neo4j's $325M funding as context, but does not disclose any relationship." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses gpt-4o-mini to answer questions about LoCoMo conversations but does not state the model's training data cutoff. LoCoMo could be in the training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether gpt-4o-mini may have seen LoCoMo conversations or questions during training. The responder is prompted to use only provided memories, but contamination could still affect behavior." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "LoCoMo was published in 2024. gpt-4o-mini was likely trained on data that could include LoCoMo. No contamination analysis is provided." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 
243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Table 1 reports token consumption and USD cost per experiment/phase/memory. Section 4.1 converts computational cost to AWS Fargate pricing. Cost reporting is a central contribution of the paper." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Table 2 reports CPU (minutes), RAM (MB), disk (MB), and network (MB) usage per experiment. Table 3 reports total execution time. Section 4.1 converts these to financial cost using AWS pricing." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "mem0 significantly outperforms Graphiti in efficiency, with faster loading times, lower resource consumption, and minimal network overhead.", 286 "evidence": "Tables 1-3: mem0 was 86.5% faster loading (unconstrained), used 104-232% less CPU, 38-8% less RAM, and orders of magnitude less network bandwidth than Graphiti.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Accuracy differences between mem0 and Graphiti were not statistically significant.", 291 "evidence": "Table 6: Two-proportion z-tests yield p=0.2269 (unconstrained) and p=0.4330 (constrained), both above α=0.05. 
Table 5 shows overlapping 95% Wilson CIs.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "mem0 is the Pareto-optimal solution balancing cost and accuracy in DMAS.", 296 "evidence": "Sections 4.2-4.3: Since the accuracy difference is not significant and mem0 is substantially cheaper (40.2% lower cost unconstrained), mem0 dominates under the statistical Pareto framework.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Network constraints have limited impact on DMAS performance, with less than 1% cost variation.", 301 "evidence": "Section 4.3 states 'computational and financial cost varies by less than 1%' with only 4.3% increased financial burden in the constrained experiment.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "This study compares mem0 (vector-based) and Graphiti (graph-based) long-term memory frameworks for distributed multi-agent systems using the LoCoMo benchmark. mem0 is substantially more efficient across all cost metrics (86.5% faster loading, 40.2% lower total cost) while accuracy differences are not statistically significant (p>0.05). Network constraints (200 ms latency, 8 Mbit/s bandwidth) had minimal impact on performance. Both systems exhibited high IDK response rates (58-68%), suggesting fundamental limitations in memory retrieval quality.", 307 "red_flags": [ 308 { 309 "flag": "Single-run experiments", 310 "detail": "All experiments appear to be single-run with no repetition across seeds or runs. No variance or standard deviation is reported. Results could be affected by API non-determinism (gpt-4o-mini) or system load variability." 311 }, 312 { 313 "flag": "Very low accuracy rates", 314 "detail": "Both systems achieve only 6-11% accuracy on the LoCoMo benchmark, with 58-68% IDK responses. This suggests the experimental setup may have fundamental issues that limit the meaningfulness of accuracy comparisons." 
315 }, 316 { 317 "flag": "Single conversation tested", 318 "detail": "Only the first of ten LoCoMo conversations was used. Results may not generalize to other conversation types or memory workloads. N=199 questions from one conversation is a narrow evaluation." 319 }, 320 { 321 "flag": "Docker 'latest' tags for components", 322 "detail": "Several critical components (Ollama, Prometheus, Qdrant, Socat, Jupyter Notebook) use 'latest' Docker tags (Table 7), making exact reproduction difficult as these tags change over time." 323 } 324 ], 325 "cited_papers": [ 326 { 327 "title": "Mem0: Building Production-Ready AI Agents with Scalable Long-Term Memory", 328 "authors": ["P. Chhikara", "D. Khant", "S. Aryan", "T. Singh", "D. Yadav"], 329 "year": 2025, 330 "arxiv_id": "2504.19413", 331 "relevance": "Core memory framework evaluated in this study; relevant to LLM agent memory and scalability." 332 }, 333 { 334 "title": "Evaluating Very Long-Term Conversational Memory of LLM Agents", 335 "authors": ["A. Maharana", "D.-H. Lee", "S. Tulyakov", "M. Bansal", "F. Barbieri", "Y. Fang"], 336 "year": 2024, 337 "arxiv_id": "2402.17753", 338 "relevance": "LoCoMo benchmark used in this study; foundational benchmark for evaluating LLM long-term memory." 339 }, 340 { 341 "title": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges", 342 "authors": ["T. Guo", "X. Chen", "Y. Wang", "R. Chang", "S. Pei", "N. V. Chawla", "O. Wiest", "X. Zhang"], 343 "year": 2024, 344 "arxiv_id": "2402.01680", 345 "relevance": "Survey of LLM-based multi-agent systems covering progress and challenges." 346 }, 347 { 348 "title": "BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating Machine Learning Tasks", 349 "authors": ["S. Gandhi", "M. Patwardhan", "L. Vig", "G. Shroff"], 350 "year": 2025, 351 "arxiv_id": "2411.07464", 352 "relevance": "Addresses cost optimization in LLM multi-agent systems through profiling and efficient retrieval." 
353 }, 354 { 355 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 356 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 357 "year": 2023, 358 "arxiv_id": "2305.05176", 359 "relevance": "Foundational work on cost-per-successful-query metric and LLM cost optimization." 360 }, 361 { 362 "title": "Green AI", 363 "authors": ["R. Schwartz", "J. Dodge", "N. A. Smith", "O. Etzioni"], 364 "year": 2020, 365 "doi": "10.1145/3381831", 366 "relevance": "Introduces accuracy-efficiency score framework referenced for balancing cost and accuracy in AI systems." 367 }, 368 { 369 "title": "Why Do Multi-Agent LLM Systems Fail?", 370 "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"], 371 "year": 2025, 372 "arxiv_id": "2503.13657", 373 "relevance": "Analyzes failure modes in multi-agent LLM systems including specification issues and inter-agent misalignment." 374 }, 375 { 376 "title": "Achilles Heel of Distributed Multi-Agent Systems", 377 "authors": ["Y. Zhang", "Y. Li", "T. Zhao", "K. Zhu", "H. Wang", "N. Vasconcelos"], 378 "year": 2025, 379 "arxiv_id": "2504.07461", 380 "relevance": "Identifies critical trustworthiness challenges in distributed multi-agent systems." 381 }, 382 { 383 "title": "Zep: A Temporal Knowledge Graph Architecture for Agent Memory", 384 "authors": ["P. Rasmussen", "P. Paliychuk", "T. Beauvais", "J. Ryan", "D. Chalef"], 385 "year": 2025, 386 "arxiv_id": "2501.13956", 387 "relevance": "Graph-based agent memory framework relevant to evaluating knowledge graph approaches for LLM agents." 388 } 389 ] 390 }