scan.json (24503B)
1 { 2 "paper": { 3 "title": "LatentMem: Customizing Latent Memory for Multi-Agent Systems", 4 "authors": ["Muxin Fu", "Guibin Zhang", "Xiangyuan Xue", "Yafu Li", "Zefeng He", "Siyuan Huang", "Xiaoye Qu", "Yu Cheng", "Yang Yang"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.03036" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": true, 16 "justification": "GitHub link provided in the abstract header: https://github.com/KANABOON1/LatentMem" 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "All six benchmarks used (TriviaQA, PopQA, KodCode, BigCodeBench, StrategyQA, PDDL) are publicly available datasets. Descriptions in Appendix B.1." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "Table 3 lists training hyperparameters and mentions Flash Attention, vLLM, DeepSpeed, but no requirements.txt, Dockerfile, or specific library versions are provided." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": false, 31 "justification": "No step-by-step reproduction instructions, README commands, or scripts for replicating experiments are described in the paper." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": false, 38 "justification": "Tables 1 and 4 report only point estimates with no confidence intervals or error bars." 39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper claims LatentMem outperforms baselines based solely on comparing numbers. No statistical significance tests are reported." 
44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "Tables 1 and 4 report percentage point improvements over baselines with both absolute values and deltas (e.g., '76.51↑16.20'), providing sufficient context for effect size." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "No justification for the number of evaluation examples used from each benchmark, nor any power analysis." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No standard deviation, variance, or spread measures reported across runs. All results appear to be single-run point estimates." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "Seven memory baselines compared: Voyager, Generative, JoyAgent, MetaGPT, ChatDev, OAgents, G-Memory, plus no-memory and MARTI fine-tuning baseline." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": true, 70 "justification": "Baselines include recent works: G-Memory (2025), JoyAgent (2025), OAgents (2025), MARTI (2025). All are contemporary." 71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": true, 75 "justification": "Section 5.6 and Figure 6 (Right) present ablation removing role profiles ('without role') and experience bank updates ('without experience'), showing contribution of each component." 76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "Evaluates accuracy across 6 benchmarks in different domains, plus time cost and token cost analysis in Section 5.3." 81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": false, 85 "justification": "No human evaluation of system outputs. All evaluation is automated (accuracy, pass rates, reward scores)." 
86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "Explicit held-in/held-out split: TriviaQA, KodCode, StrategyQA, PopQA are in-domain; BigCodeBench and PDDL are out-of-domain. CAMEL and DyLAN are unseen MAS frameworks (Section 5.1, Appendix B.4.1)." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "Table 1 provides per-benchmark, per-MAS-framework results across all 6 datasets and 4 frameworks." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 5.7 and Figure 7 present a case study analyzing common error patterns (step repetition, disobey task specification, reasoning-action mismatch) and how LatentMem addresses them." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "Ablation study shows performance drops when components are removed (e.g., 6.45% drop on MacNet KodCode without role profiles). Some baselines hurt performance vs no-memory (e.g., ChatDev on AutoGen TriviaQA: -2.97%)." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Abstract claims 'up to 19.36%' improvement — Table 1 DyLAN PopQA shows exactly 19.36%. Claims of 50% fewer tokens and ~2/3 inference time are supported by Figure 3. All abstract claims are backed by results." 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": true, 117 "justification": "Causal claims about component contributions are supported by ablation studies (Section 5.6) with controlled single-variable manipulation (removing role profiles, disabling experience bank updates)." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper explicitly separates held-in vs held-out benchmarks and seen vs unseen MAS frameworks, and reports results separately for each category. 
Claims are framed within the tested settings." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "No discussion of alternative explanations for the performance gains. For example, no consideration of whether improvements could be due to increased compute, data augmentation effects, or other confounds beyond the memory mechanism." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper measures accuracy/pass rates on specific benchmarks and frames claims at that level ('performance gain of up to 19.36% over vanilla settings'). No broader framing beyond what was measured." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Exact model identifiers provided: 'Qwen/Qwen3-4B-Instruct-2507' and 'meta-llama/Llama-3.1-8B-Instruct' (Section 5.1). Embedding model: 'all-MiniLM-L6-v2'." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Appendix D provides full prompt templates for all agent roles across CAMEL and AutoGen frameworks, with actual prompt text." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Table 3 provides comprehensive hyperparameter settings including learning rate (1e-5), temperature (1.0 train, 0.0 eval), clipping epsilon (0.2), LoRA settings (r=16, alpha=32), batch sizes, and more." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "Appendix B.3 describes each MAS framework setup in detail: AutoGen (B.3.1), MacNet (B.3.2), CAMEL (B.3.3), DyLAN (B.3.4), including agent counts, topologies, and coordination mechanisms." 
155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Appendix B.4.2 describes training trajectory collection: which MAS frameworks and datasets were used, total trajectory count (40,580), and how trajectories were processed for parametric vs non-parametric baselines." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": false, 166 "justification": "No limitations section. The paper has an 'Impact Statement' with ethical considerations and societal implications, but no discussion of methodological limitations." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "No threats to validity discussed. The Impact Statement addresses only ethical and societal concerns, not study-specific methodological threats." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "No explicit scope boundaries. The paper does not state what the results do NOT show, what settings are excluded, or what claims the authors are NOT making." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw experimental data (trajectories, per-example results) is made available for independent verification. Only aggregated results are reported." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Appendix B.4.2 describes training trajectory collection: 'we collect data on all in-domain datasets using the training splits and in-distribution MAS,' yielding 40,580 trajectories." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants. All data comes from standard public benchmarks." 
194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "Appendix B.4.2 documents the pipeline: collect trajectories from in-domain datasets using training splits → store in experience bank → train memory composer via LMPO on same training data." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding information or acknowledgments section with grants or sponsors is present in the paper." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations clearly listed: Tongji University, Shanghai AI Laboratory, National University of Singapore, Chinese University of Hong Kong, Nanjing University, Shanghai Jiao Tong University." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding disclosed, so independence cannot be assessed. Authors include Shanghai AI Laboratory affiliates, but no explicit conflict statement." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement present in the paper." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "No training data cutoff dates stated for Qwen3-4B-Instruct-2507 or Llama-3.1-8B-Instruct, despite evaluating these models on public benchmarks." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "No discussion of whether benchmark examples (TriviaQA, StrategyQA, PopQA, etc.) appeared in the LLM training data." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "TriviaQA (2017), StrategyQA (2021), PopQA (2023) all predate the models' training. 
No contamination risk discussion despite high likelihood of exposure." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Section 5.3 and Figures 3 and 8 report both time costs (in seconds) and token costs for all methods across multiple benchmarks and frameworks." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "No total training compute budget (GPU hours, hardware used, training time) is reported despite training the memory composer with LMPO." 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No results across multiple random seeds. All tables report single point estimates without seed variation analysis." 
294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The number of experimental runs producing reported results is never stated." 299 }, 300 "hyperparameter_search_budget": { 301 "applies": true, 302 "answer": false, 303 "justification": "Sensitivity analysis in Section 5.6 varies L' and K, but no overall hyperparameter search budget is reported. The total configurations tried to arrive at the final settings are not stated." 304 }, 305 "best_config_selection_justified": { 306 "applies": true, 307 "answer": true, 308 "justification": "Section 5.6 and Figure 6 show sensitivity analysis for L' with performance curves, justifying the choice of L'=8 as 'balancing accuracy and computational cost.'" 309 }, 310 "multiple_comparison_correction": { 311 "applies": false, 312 "answer": false, 313 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": false, 318 "justification": "No acknowledgment of author-evaluation bias. The authors implement and evaluate their own system against their implementations of baselines without discussing this potential bias." 319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": true, 323 "justification": "Figures 3 and 8 explicitly plot performance vs time cost and performance vs token cost, showing LatentMem's efficiency-performance trade-off relative to baselines." 324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": false, 328 "justification": "No discussion of whether the benchmarks actually measure what the paper claims. For example, no analysis of whether TriviaQA accuracy truly reflects 'knowledge QA capability' in multi-agent settings." 
329 }, 330 "scaffold_confound_addressed": { 331 "applies": true, 332 "answer": true, 333 "justification": "The paper evaluates LatentMem across 4 different MAS frameworks (AutoGen, MacNet, CAMEL, DyLAN), showing results hold across different scaffolding architectures. Table 1 systematically compares across frameworks." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of temporal leakage. TriviaQA (2017), StrategyQA (2021), and PopQA (2023) all predate the models used, creating significant contamination risk." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "No discussion of whether the retrieved experience trajectories or evaluation setup leak answer information." 346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of independence between training trajectories and test examples. Training and test data come from splits of the same benchmarks." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": false, 355 "justification": "No concrete leakage detection or prevention method applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination)." 356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "LatentMem improves state-of-the-art MAS by up to 16.20% on knowledge QA (TriviaQA) and 18.45% on code generation (KodCode) tasks.", 362 "evidence": "Table 1 shows AutoGen+TriviaQA: 76.51 vs 60.31 no-memory (+16.20%). Table 4 shows AutoGen+KodCode with Llama-3.1-8B: 65.90 vs 47.45 (+18.45%).", 363 "supported": "strong" 364 }, 365 { 366 "claim": "LatentMem uses 50% fewer tokens and reduces inference time to ~2/3 compared to mainstream memory designs.", 367 "evidence": "Figure 3 shows token and time costs. AutoGen+KodCode: LatentMem uses 1.76M tokens vs 3.04-3.64M for baselines. 
Time costs similarly reduced.", 368 "supported": "moderate" 369 }, 370 { 371 "claim": "LatentMem generalizes to out-of-domain datasets (PDDL: +7.10%) and unseen MAS frameworks (CAMEL: +7.90% average over vanilla).", 372 "evidence": "Table 1 AutoGen+PDDL: 23.49 vs 16.39 (+7.10%). CAMEL average held-in: 61.12 vs 53.22 (+7.90%).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "LatentMem consistently outperforms direct agent backbone fine-tuning (MARTI) across all settings.", 377 "evidence": "Table 2 shows LatentMem vs MARTI on KodCode and TriviaQA across AutoGen and MacNet, with LatentMem winning all four comparisons (up to +11.73% on AutoGen TriviaQA).", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "LatentMem generates role-specific latent memories that form well-separated clusters even on unseen frameworks and out-of-domain data.", 382 "evidence": "Figures 4 and 9 show t-SNE visualizations with clear role-based clustering on both in-domain (AutoGen+KodCode) and out-of-domain (CAMEL+BigCodeBench) settings.", 383 "supported": "moderate" 384 } 385 ], 386 "methodology_tags": ["benchmark-eval"], 387 "key_findings": "LatentMem proposes a learnable latent memory framework for multi-agent systems that addresses memory homogenization and information overload. It achieves performance gains of up to 19.36% over vanilla MAS settings across 6 benchmarks and 4 frameworks, while using approximately 50% fewer tokens than competing memory architectures. The framework generalizes to unseen MAS frameworks and out-of-domain benchmarks, and outperforms direct backbone fine-tuning (MARTI) under matched compute budgets.", 388 "red_flags": [ 389 { 390 "flag": "No variance or multiple-run reporting", 391 "detail": "All results in Tables 1 and 4 are single point estimates with no standard deviations, error bars, or indication of how many runs produced them. Although evaluation decodes at temperature 0.0, training uses temperature 1.0 and MAS interactions can vary, so results from a single run may not be stable."
392 }, 393 { 394 "flag": "No statistical significance tests", 395 "detail": "Performance improvements are claimed based on raw number comparisons without any statistical testing. Some improvements (e.g., 0.04% for MetaGPT on AutoGen TriviaQA) could easily be within noise." 396 }, 397 { 398 "flag": "No limitations section", 399 "detail": "The paper has no limitations or threats-to-validity discussion. The Impact Statement covers only ethical/societal considerations, not methodological weaknesses." 400 }, 401 { 402 "flag": "Benchmark contamination risk unaddressed", 403 "detail": "TriviaQA (2017), StrategyQA (2021), and PopQA (2023) all predate the training of both Qwen3 and Llama-3.1. The paper does not discuss whether the backbone models have seen these benchmarks, which would inflate all results including baselines." 404 }, 405 { 406 "flag": "Training compute budget not reported", 407 "detail": "The memory composer is trained with LMPO on 40,580 trajectories but no GPU hours, hardware specs, or training wall-clock time are reported, making cost-effectiveness claims incomplete." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors", 413 "authors": ["Weize Chen", "Yusheng Su", "Jingwei Zuo"], 414 "year": 2023, 415 "relevance": "Multi-agent collaboration framework relevant to MAS evaluation and design." 416 }, 417 { 418 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 419 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 420 "year": 2023, 421 "relevance": "Major MAS framework whose memory design is used as a baseline memory architecture." 422 }, 423 { 424 "title": "ChatDev: Communicative agents for software development", 425 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 426 "year": 2024, 427 "relevance": "MAS framework for code generation, used as baseline memory architecture."
428 }, 429 { 430 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations", 431 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 432 "year": 2024, 433 "relevance": "Core MAS framework used for evaluation, one of four tested frameworks." 434 }, 435 { 436 "title": "CAMEL: Communicative agents for 'mind' exploration of large language model society", 437 "authors": ["Guohao Li", "Hasan Hammoud", "Hani Itani"], 438 "year": 2023, 439 "relevance": "Role-playing MAS framework used as unseen evaluation framework." 440 }, 441 { 442 "title": "Voyager: An open-ended embodied agent with large language models", 443 "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"], 444 "year": 2023, 445 "relevance": "Single-agent memory baseline adapted for multi-agent comparison." 446 }, 447 { 448 "title": "Generative agents: Interactive simulacra of human behavior", 449 "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai"], 450 "year": 2023, 451 "relevance": "Foundational generative agent memory architecture used as baseline." 452 }, 453 { 454 "title": "MARTI: A framework for multi-agent LLM systems reinforced training and inference", 455 "authors": ["Kaiyan Zhang", "Runze Liu", "Xuekai Zhu"], 456 "year": 2025, 457 "relevance": "Multi-agent fine-tuning baseline compared under matched compute budgets." 458 }, 459 { 460 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 461 "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"], 462 "year": 2024, 463 "relevance": "Code generation benchmark used as out-of-domain evaluation dataset." 464 }, 465 { 466 "title": "Why do multi-agent LLM systems fail?", 467 "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"], 468 "year": 2025, 469 "relevance": "Analysis of MAS failure modes, motivates the memory homogenization problem addressed by LatentMem."
470 }, 471 { 472 "title": "G-Memory: Tracing hierarchical memory for multi-agent systems", 473 "authors": ["Guibin Zhang", "Muxin Fu", "Guancheng Wan"], 474 "year": 2025, 475 "relevance": "Multi-agent hierarchical memory baseline; shared authorship with LatentMem." 476 }, 477 { 478 "title": "KodCode: A diverse, challenging, and verifiable synthetic dataset for coding", 479 "authors": ["Zhangchen Xu", "Yang Liu", "Yueqin Yin"], 480 "year": 2025, 481 "relevance": "Synthetic coding benchmark used for in-domain evaluation." 482 } 483 ] 484 }