scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24921B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LatentMem: Customizing Latent Memory for Multi-Agent Systems",
      6     "authors": [
      7       "Muxin Fu",
      8       "Guibin Zhang",
      9       "Xiangyuan Xue",
     10       "Yafu Li",
     11       "Zefeng He",
     12       "Siyuan Huang",
     13       "Xiaoye Qu",
     14       "Yu Cheng",
     15       "Yang Yang"
     16     ],
     17     "year": 2026,
     18     "venue": "arXiv",
     19     "arxiv_id": "2602.03036",
     20     "doi": null
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Key abstract claims (up to 19.36% improvement over vanilla settings, outperforms existing memory architectures, 50% fewer tokens, ~2/3 inference time) are all backed by Table 1 and Figures 3/8.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 5.6 includes ablation studies removing role profiles and experience bank updates, providing causal support for each component's contribution to performance.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper explicitly distinguishes in-domain vs. out-of-domain benchmarks and seen vs. unseen MAS frameworks, bounding generalization claims to the tested conditions.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper attributes all performance gains solely to role-aware latent memory without considering alternative explanations such as optimization objective differences, training data scale effects, or capacity advantages.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper uses task accuracy on established benchmarks as the primary measure and frames all performance claims in terms of these same accuracy metrics, with no problematic proxy substitution.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper contains an 'Impact Statement' covering ethics and societal implications but no dedicated limitations or threats-to-validity section.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No specific threats to validity are discussed; the impact statement is generic and does not address methodological concerns like benchmark contamination, single-run variance, or limited MAS diversity.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper does not explicitly state what the results do not show or where the approach is expected to fail; no scope boundaries are stated beyond the tested benchmarks.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding acknowledgment appears anywhere in the paper despite affiliations including state-linked AI laboratories.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are clearly listed on the title page: Tongji University, Shanghai AI Laboratory, NUS, CUHK, Nanjing University, and Shanghai Jiao Tong University.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No funding is disclosed, making this criterion not applicable.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "There is no competing interests statement or declaration of financial interests anywhere in the paper.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms including multi-agent systems, memory homogenization, information overload, and latent memory are defined in the introduction and preliminary sections with formal notation.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper explicitly states three principal advantages of LatentMem and frames the contribution as a learnable multi-agent memory framework addressing two identified bottlenecks.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 2 discusses LLM-based MAS and memory systems in detail, explaining how LatentMem differs from and builds on specific prior works like G-Memory, OAgents, and MIRIX.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "A GitHub URL (https://github.com/KANABOON1/LatentMem) is explicitly provided in the paper's abstract footnote.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "All evaluation benchmarks (TriviaQA, KodCode, StrategyQA, PopQA, BigCodeBench, PDDL) are publicly available standard datasets.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Table 3 provides hyperparameter settings and mentions DeepSpeed/vLLM/LoRA, but no requirements.txt, Dockerfile, or equivalent environment specification is provided.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Appendix B.4 describes training configurations at a high level, but no step-by-step reproduction instructions are provided for replicating the reported experiments.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No confidence intervals or error bars are reported in any tables or figures; all results are presented as single-point estimates.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests are used despite making comparative claims across seven baselines on six benchmarks.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Tables 1, 2, and 4 report absolute percentage-point improvements over baselines (e.g., ↑19.36%) with baseline values provided, constituting meaningful effect sizes.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No justification is provided for the number of benchmark questions used in evaluation or the 40,580 training trajectories collected.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "All results are reported as single values with no standard deviation, confidence intervals, or repeated-run statistics.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Eight baselines are included: no-memory, Voyager, Generative, JoyAgent, MetaGPT, ChatDev, OAgents, and G-Memory, plus MARTI for the fine-tuning comparison.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Baselines include recent 2025 works (G-Memory, OAgents, MARTI) representing current state-of-the-art approaches.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Section 5.6 presents ablation variants removing role profiles ('without role') and disabling experience bank updates ('without experience'), with quantified performance impacts.",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The paper reports accuracy across six diverse benchmarks plus time cost and token cost, providing multiple evaluation dimensions.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "Human evaluation is not applicable for this automated task-solving MAS evaluation using established benchmark metrics.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Training trajectories are collected from training splits; evaluation uses standard benchmark test sets, and BigCodeBench/PDDL are explicitly held-out out-of-domain benchmarks.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Table 1 provides per-benchmark and per-MAS-framework breakdowns across all six benchmarks and four MAS frameworks, with separate held-in and held-out sections.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Section 5.7 and Figure 7 explicitly analyze failure modes (step repetition, task specification disobedience, reasoning-action mismatch) with concrete PDDL case study examples.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Tables 1 and 4 honestly report that multiple baselines (ChatDev, Voyager, MetaGPT) perform worse than the no-memory baseline on several benchmark-framework combinations.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Specific model identifiers are provided: 'Qwen/Qwen3-4B-Instruct-2507' and 'meta-llama/Llama-3.1-8B-Instruct', and the embedding model 'all-MiniLM-L6-v2'.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Appendix D provides complete prompt templates for all agent roles in CAMEL (Strategy, Code, Test, Summarizer) and AutoGen (Assistant, User Proxy) frameworks.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Table 3 provides comprehensive hyperparameter settings including learning rate (1e-5), LoRA configurations (r=16, alpha=32), batch sizes, clipping epsilon, and temperature settings.",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Appendix B.3 provides detailed descriptions of all four MAS frameworks including agent roles, topologies, and interaction patterns for AutoGen, MacNet, CAMEL, and DyLAN.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Appendix B.4.2 describes trajectory collection but does not document preprocessing steps such as trajectory formatting, tokenization, truncation, or filtering criteria.",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "The 40,580 training trajectories central to training the memory composer are not released or made available for independent verification.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Appendix B.4.2 describes how training data was collected: running AutoGen and MacNet on training splits of four in-domain datasets to produce 40,580 trajectories.",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "This paper uses standard benchmark evaluation with no human participants or recruitment.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "While trajectory collection is described, the complete pipeline from raw trajectories through experience bank storage, retrieval, and memory composer training is not documented with sufficient detail to replicate.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The training data cutoffs for Qwen3-4B-Instruct and Llama-3.1-8B are not stated, which is relevant since several benchmarks (TriviaQA 2017, StrategyQA 2021) predate model training.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of potential overlap between model pretraining data and evaluation benchmarks, particularly problematic for TriviaQA and PopQA.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "The paper does not address whether benchmark examples were available before the LLMs' training cutoffs; TriviaQA (2017) and StrategyQA (2021) are high-risk for contamination.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants in this study.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Figures 3 and 8 explicitly report both time cost (seconds) and token cost for LatentMem and all baselines across multiple benchmark-framework combinations.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "Total training compute budget (GPU-hours, hardware specs) is not reported; only inference-time costs on benchmarks are shown.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "LatentMem achieves performance gains of up to 19.36% over vanilla no-memory settings",
    379       "evidence": "Table 1 shows LatentMem on DyLAN+PopQA at 44.25% vs no-memory at 24.89%, a 19.36pp gain.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LatentMem consistently outperforms all memory baselines across four MAS frameworks and six benchmarks",
    384       "evidence": "Tables 1 and 4 show LatentMem achieving the best average performance across all four MAS frameworks with both Qwen3-4B and Llama-3.1-8B backbones.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "LatentMem uses approximately 50% fewer tokens than mainstream memory designs",
    389       "evidence": "Figure 3 shows LatentMem using 1.76M tokens on AutoGen+KodCode vs 3.50M for Generative (~50% fewer), though the ratio varies substantially across settings.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "LatentMem reduces inference time to approximately 2/3 of mainstream memory designs",
    394       "evidence": "Figure 3 shows LatentMem at 3.15e+04s for DyLAN+TriviaQA vs 6.84e+04s for OAgents (46% of OAgents time); the ratio varies across comparisons.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "LatentMem generalizes to out-of-domain benchmarks and unseen MAS frameworks",
    399       "evidence": "Table 1 shows 7.10% improvement on PDDL (out-of-domain) and 7.90% on CAMEL (unseen MAS) while most baselines decline on these held-out settings.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Role-aware memory customization is essential, especially for complex MAS architectures",
    404       "evidence": "Section 5.6 ablation shows removing role profiles causes 6.45% drop for MacNet but only 2.30% for simpler AutoGen.",
    405       "supported": "moderate"
    406     },
    407     {
    408       "claim": "LatentMem outperforms multi-agent fine-tuning baseline MARTI under the same computational budget",
    409       "evidence": "Table 2 shows LatentMem outperforming MARTI on all four benchmark-MAS combinations, with 11.73pp improvement on TriviaQA+AutoGen.",
    410       "supported": "strong"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "benchmark-eval"
    415   ],
    416   "key_findings": "LatentMem introduces a learnable multi-agent memory framework that addresses memory homogenization and information overload by generating fixed-length, role-aware latent representations from raw interaction trajectories using a trained memory composer. Evaluated across six benchmarks and four MAS frameworks with two LLM backbones, it achieves up to 19.36% improvement over no-memory baselines and consistently outperforms seven existing memory baselines while using fewer tokens and less inference time. The framework shows meaningful generalization to out-of-domain benchmarks and unseen MAS architectures, suggesting latent memory transfers better across contexts than text-based approaches. Ablation studies confirm both role-aware conditioning and real-time experience bank updates contribute meaningfully to performance.",
    417   "red_flags": [
    418     {
    419       "flag": "No statistical significance testing",
    420       "detail": "All comparative claims are based on point estimates with no confidence intervals, error bars, or significance tests across any of the seven baselines and six benchmarks, making it impossible to assess whether observed improvements are reliable."
    421     },
    422     {
    423       "flag": "No variance across runs",
    424       "detail": "Results are reported as single-run values with no standard deviation; MAS performance can vary significantly across runs due to temperature sampling and non-determinism."
    425     },
    426     {
    427       "flag": "No limitations section",
    428       "detail": "The paper has no dedicated limitations or threats-to-validity section; the impact statement discusses only societal implications without addressing methodological limitations."
    429     },
    430     {
    431       "flag": "Funding not disclosed",
    432       "detail": "No acknowledgements or funding disclosure appears anywhere in the paper, despite affiliations including Shanghai AI Laboratory (state-linked)."
    433     },
    434     {
    435       "flag": "Benchmark contamination unaddressed",
    436       "detail": "The paper evaluates Qwen3 and Llama models on TriviaQA (2017), StrategyQA (2021), and PopQA without discussing whether these benchmarks were in the models' pretraining data, which could inflate accuracy scores."
    437     },
    438     {
    439       "flag": "Training trajectories not released",
    440       "detail": "The 40,580 training trajectories central to training the memory composer are not released, preventing independent reproduction of the training step."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    446       "relevance": "Key MAS framework baseline with shared message pool memory design"
    447     },
    448     {
    449       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    450       "relevance": "Primary MAS framework used for LatentMem integration and evaluation"
    451     },
    452     {
    453       "title": "G-Memory: Tracing Hierarchical Memory for Multi-Agent Systems",
    454       "relevance": "Primary competing MAS-specific memory baseline with three-tier hierarchical design"
    455     },
    456     {
    457       "title": "CAMEL: Communicative Agents for Mind Exploration of Large Language Model Society",
    458       "relevance": "MAS framework used as unseen test environment for generalization evaluation"
    459     },
    460     {
    461       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    462       "relevance": "Single-agent memory baseline with reflective memory adapted for multi-agent comparison"
    463     },
    464     {
    465       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    466       "relevance": "Single-agent memory baseline with skill-based memory adapted for comparison"
    467     },
    468     {
    469       "title": "OAgents: An Empirical Study of Building Effective Agents",
    470       "relevance": "Multi-agent memory baseline with hierarchical short/long-term memory design"
    471     },
    472     {
    473       "title": "MARTI: A Framework for Multi-Agent LLM Systems Reinforced Training and Inference",
    474       "relevance": "Multi-agent fine-tuning baseline used for direct compute-controlled comparison"
    475     },
    476     {
    477       "title": "ChatDev: Communicative Agents for Software Development",
    478       "relevance": "MAS framework baseline representing inside-trial memory design"
    479     },
    480     {
    481       "title": "A Dynamic LLM-Powered Agent Network for Task-Oriented Agent Collaboration (DyLAN)",
    482       "relevance": "MAS framework used as unseen test environment with agent importance scoring"
    483     }
    484   ],
    485   "engagement_factors": {
    486     "practical_relevance": {
    487       "score": 2,
    488       "justification": "Provides concrete improvements on standard MAS benchmarks with a released GitHub repo, but requires training a memory composer with substantial infrastructure (DeepSpeed, vLLM)."
    489     },
    490     "surprise_contrarian": {
    491       "score": 2,
    492       "justification": "Using latent space representations instead of text-based memory for MAS is a genuinely novel direction that challenges the dominant paradigm of symbolic/textual memory systems."
    493     },
    494     "fear_safety": {
    495       "score": 0,
    496       "justification": "The impact statement mentions generic caution about more powerful MAS but raises no specific AI safety or risk concerns."
    497     },
    498     "drama_conflict": {
    499       "score": 0,
    500       "justification": "Standard incremental improvement paper with no controversy, no conflict with other researchers, and no surprising failures."
    501     },
    502     "demo_ability": {
    503       "score": 2,
    504       "justification": "GitHub code is provided and the system runs on standard public benchmarks, though reproducing requires significant infrastructure investment."
    505     },
    506     "brand_recognition": {
    507       "score": 1,
    508       "justification": "Authors affiliated with Shanghai AI Laboratory (notable Chinese state AI lab) and NUS/CUHK, but no affiliation with top-tier Western AI labs."
    509     }
    510   },
    511   "hn_data": {
    512     "threads": [],
    513     "top_points": 0,
    514     "total_points": 0,
    515     "total_comments": 0
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs