scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27651B)
      1 {
      2   "paper": {
      3     "title": "Agentic Memory: Learning Unified Long-Term and Short-Term Memory Management for Large Language Model Agents",
      4     "authors": [
      5       "Yi Yu",
      6       "Liuyi Yao",
      7       "Yuexiang Xie",
      8       "Qingquan Tan",
      9       "Jiaqi Feng",
     10       "Yaliang Li",
     11       "Libing Wu"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2601.01885"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No GitHub link or archive URL is provided for the AgeMem implementation. The paper references AgentScope (Gao et al., 2025a) and Trinity-RFT (Pan et al., 2025a) frameworks used to build the system, but no repository for AgeMem itself is given. No promise or URL appears in the paper."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "All five evaluation benchmarks (ALFWorld, SciWorld, PDDL, BabyAI, HotpotQA) are publicly available datasets. The paper does not collect new proprietary data."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper states experiments were run on '8 NVIDIA RTX 4090 GPUs with 48GB memory each' (Appendix C.4), and mentions AgentScope and Trinity-RFT frameworks, but no requirements file, Dockerfile, or detailed dependency list with library versions is provided."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "There are no step-by-step reproduction instructions. The appendix describes the algorithm and hyperparameters (K=8 rollouts, β=0.1, uniform reward weights), but there is no README or script-level guidance a researcher could follow to reproduce the experiments."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Table 2 reports only point estimates (e.g., AgeMem achieves 41.07% on ALFWorld with Qwen2.5-7B). No confidence intervals, error bars, or standard deviations are reported for any main result."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes comparative claims (e.g., 'AgeMem improves by 4.82 and 8.57 percentage points on average' over best baselines) with no statistical significance tests. No p-values, t-tests, or equivalent tests are reported."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper reports percentage point improvements with baseline context throughout (e.g., 'relative gains of 49.59% and 23.52% over no-memory' in Section 4.2, absolute differences like '+13.9%' in ablations with both baseline and new values). Per the schema, '12% improvement over baseline (from 45% to 57%)' provides enough context for YES — the paper does this consistently."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper does not justify dataset split sizes used for evaluation. For HotpotQA, ~90k training questions are mentioned but the test set size used for evaluation is not explicitly stated, and no power analysis or justification for evaluation set sizes is provided."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "All results in Table 2 and other tables are single-point estimates. No standard deviations, variance across seeds, or multiple-run spread measures are reported anywhere in the paper."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares against four baselines: LangMem, A-Mem, Mem0, and Mem0g, plus an AgeMem-noRL ablation. A no-memory baseline is also included. Baselines use their official open-source implementations (links provided in Appendix C.3)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "All baselines are from 2025 (LangMem, A-Mem arXiv:2502.12110, Mem0 arXiv:2504.19413), representing current state-of-the-art memory management systems for LLM agents."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Figure 4 and Figure 9 present systematic ablations isolating: (1) LTM tools alone (+LT), (2) LTM tools with RL (+LT/RL), and (3) full system with STM and RL (+LT/ST/RL). A reward function ablation (All-Returns vs Answer-Only) is also provided in Tables 4 and 5."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper uses multiple metrics: Success Rate (SR) for ALFWorld/SciWorld/BabyAI, Progress Rate (PR) for PDDL, LLM-as-a-Judge (J) for HotpotQA, Memory Quality (MQ) for stored memory, and token count for context efficiency."
     87       },
     88       "human_evaluation": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Human evaluation of system outputs is not relevant to this benchmark-evaluation paper. All metrics are automated (task completion metrics from environments, LLM-as-a-Judge for open-ended QA)."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Training is done only on the HotpotQA training set; evaluation is on the respective test/validation splits of each benchmark. The paper states 'fine-tuned with RL only on the HotpotQA training set and then evaluated directly on all datasets' (Section 4.1)."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Table 2 provides per-dataset breakdowns across all five benchmarks (ALFWorld, SciWorld, PDDL, BabyAI, HotpotQA) for both model backbones. Ablation figures also show per-dataset results."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The case studies in Appendix B compare the trained agent (successes) against a baseline that fails, but these are synthetic illustrative examples. No systematic error analysis of where AgeMem fails or analysis of failure modes on the benchmarks is provided."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The ablation shows that adding LTM alone to Qwen3-4B on SciWorld actually hurts performance (-4.4%, Figure 9b), and PDDL remains low even with full AgeMem. These are reported honestly without being hidden."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims AgeMem 'consistently outperforms strong memory-augmented baselines across multiple LLM backbones, achieving improved task performance, higher-quality long-term memory, and more efficient context usage.' Table 2 (task performance), Figure 2 (memory quality), and Figure 3 (token counts) support these claims."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The ablation study isolates individual components (LTM, STM, RL) through controlled single-variable manipulation, supporting causal claims like 'RL training contributes 8.53 percentage points improvement over AgeMem-noRL' (Section 4.2). The design adequately supports these component-level causal inferences."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper tests on two specific open-source model backbones (Qwen2.5-7B and Qwen3-4B) on five benchmarks, but the conclusion states results 'highlight the importance of unified, agent-centric memory policies' for 'LLM agents' generally. The title and claims do not adequately bound generalizability to the tested models and task types."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The limitations section mentions the 'fixed set of memory management tools' and limited benchmark coverage but does not discuss alternative explanations for the observed improvements—e.g., whether gains could be attributed to additional RL training compute rather than the unified memory architecture, or whether the benchmark selection favors the approach."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper specifies 'Qwen2.5-7B-Instruct' and 'Qwen3-4B-Instruct' as base models, and 'Qwen-Max' as the LLM evaluator. However, per the schema, marketing names without a snapshot date or API version do NOT count. No specific model checkpoint dates, snapshot identifiers, or API versions are provided for any of these models. 'Qwen-Max' is especially vague — it is a marketing name with no version identifier."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The full system prompt used in AgeMem is provided verbatim in Appendix A.1, including the tool-calling format and response structure. The summarization system prompt is also fully provided, as are the LLM-as-a-Judge and Memory Quality evaluation prompt templates in Appendix C.2."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Key hyperparameters are reported: K=8 rollouts, KL coefficient β=0.1, uniform reward weights (1/3 each), FILTER threshold θ=0.6, max context length 8,192 tokens, max response length 2,048 tokens. Penalty coefficients (P_rounds=-1, P_overflow=-0.5) are also specified."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The agentic scaffolding is described in detail: the three-stage trajectory structure, tool interface (6 tools with exact signatures in Figures 6 and 7), the step-wise GRPO mechanism, and the rollout algorithms (Algorithms 1-5 in Appendix A.3) are all fully specified."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper describes how HotpotQA supporting facts are used as Stage 1 contextual information and mentions a DISTRACTORGEN function for generating distractors, but does not detail the distractor generation process. The exact evaluation subsets (sizes, selection criteria) for each benchmark are not documented. The data pipeline from raw benchmarks to actual training/evaluation instances is underspecified."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "A dedicated 'Limitations' section is present after the conclusion, discussing the fixed tool set and limited benchmark coverage as areas for future extension."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The limitations section is generic: it notes that 'broader coverage of tasks and environments may further strengthen the empirical understanding' but does not identify specific threats to validity for the current results (e.g., benchmark contamination risks, LLM judge reliability, reward hacking possibilities)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. For example, it does not note that results are limited to two specific Qwen model families, that the RL training was done only on HotpotQA, or that the 5.1% token reduction may not be practically significant."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The underlying evaluation logs, model outputs, and raw trajectory data are not released. Only the aggregated performance numbers in tables and figures are available. Public benchmarks (HotpotQA, ALFWorld, etc.) are accessible but the model's specific outputs cannot be verified."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The experimental datasets are all established public benchmarks with well-documented collection procedures from their original papers. The paper references original dataset sources and describes their structure (Appendix C.1)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "This paper evaluates on standard benchmarks and has no human participants, so recruitment methods are not applicable."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "While the three-stage trajectory structure is described algorithmically, the DISTRACTORGEN function for generating distractors is mentioned but not fully specified. The exact evaluation subsets and how benchmark data is converted into the three-stage format are not fully documented. The data pipeline has unexplained steps."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "There is no acknowledgments section or funding disclosure in the paper. The affiliation includes Alibaba Group, but no funding sources are mentioned."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Yi Yu, Liuyi Yao, Yuexiang Xie, Yaliang Li are from Alibaba Group; others are from Wuhan University. The paper evaluates a system built using Alibaba's AgentScope framework."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Several authors are from Alibaba Group, and the system is built using Alibaba's AgentScope framework and evaluated using Qwen models (also from Alibaba). The organization developing the evaluated technology is directly represented among the authors, creating a non-independent relationship."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "There is no competing interests statement anywhere in the paper. The Alibaba affiliation is listed but no explicit declaration of financial interests or potential conflicts is made."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses Qwen2.5-7B-Instruct and Qwen3-4B-Instruct as base models but does not state their training data cutoff dates. This is relevant because HotpotQA (2018) and BabyAI (2018) are long-established benchmarks."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not discuss whether benchmark test examples (from HotpotQA, ALFWorld, etc.) appeared in Qwen model training data. Given that these are established public benchmarks predating Qwen model training, contamination risk exists but is not addressed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "HotpotQA (2018), ALFWorld (2020), SciWorld (2022), and BabyAI (2018) were all published well before the Qwen2.5 and Qwen3 models were released, so they plausibly fall within the models' (unstated) training data window. The paper does not discuss whether these benchmarks were present in the training data, which could inflate base model performance."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved in this study; it is a benchmark evaluation paper."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved; ethics approval is not applicable."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The paper discusses token efficiency (average tokens per episode in Figure 3, ~2,117-2,310 tokens) but does not report API costs, wall-clock inference time per example, or cost per episode. The use of Qwen-Max as LLM evaluator implies API costs that are not quantified."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper states '8 NVIDIA RTX 4090 GPUs with 48GB memory each' (Appendix C.4), specifying hardware. However, no training wall-clock time, total GPU-hours, or total compute budget is stated. Knowing the GPU model without knowing how long training took does not constitute a stated compute budget."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "AgeMem achieves the highest average performance on both Qwen2.5-7B-Instruct (41.96%) and Qwen3-4B-Instruct (54.31%), outperforming all baselines across five datasets with relative gains of 49.59% and 23.52% over no-memory baselines.",
    294       "evidence": "Table 2 in Section 4.2 shows per-dataset and average performance numbers for both model backbones across all methods. AgeMem outperforms all four baselines in average score on both models.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "RL training contributes 8.53 and 8.72 percentage point improvements over AgeMem-noRL on Qwen2.5-7B-Instruct and Qwen3-4B-Instruct respectively.",
    299       "evidence": "Derived from Table 2: AgeMem (41.96%) vs AgeMem-noRL (33.43%) for Qwen2.5-7B, and AgeMem (54.31%) vs AgeMem-noRL (45.59%) for Qwen3-4B.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "AgeMem achieves higher memory quality (MQ) than all baselines: 0.533 on Qwen2.5-7B and 0.605 on Qwen3-4B.",
    304       "evidence": "Figure 2 shows MQ scores for all methods. However, the MQ metric is measured using Qwen-Max as LLM judge, which introduces potential bias since the system was trained using the same model family.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "AgeMem reduces prompt token usage compared to RAG-based variants by 3.1% (Qwen2.5-7B) and 5.1% (Qwen3-4B) on HotpotQA.",
    309       "evidence": "Figure 3 shows average token counts. The absolute reduction is small (2,117 vs 2,186 for Qwen2.5-7B), and statistical significance is not tested.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "The multi-component reward function (All-Returns) leads to faster convergence and higher final performance than task-only reward (Answer-Only).",
    314       "evidence": "Figures 5 and 10 show convergence curves; Tables 4 and 5 show that All-Returns achieves higher J scores and much better MQ scores on both model backbones.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "benchmark-eval"
    320   ],
    321   "key_findings": "AgeMem proposes a unified framework for jointly managing long-term and short-term memory in LLM agents via six tool-based operations (ADD, UPDATE, DELETE, RETRIEVE, SUMMARY, FILTER) trained with a three-stage progressive reinforcement learning strategy and a step-wise GRPO algorithm. Evaluated on five long-horizon benchmarks (ALFWorld, SciWorld, PDDL, BabyAI, HotpotQA) with two Qwen model backbones, AgeMem outperforms four contemporary memory management baselines (LangMem, A-Mem, Mem0, Mem0g) by 4.82-8.57 percentage points on average. Ablation studies confirm that RL training and the combination of LTM and STM tools each contribute meaningfully to performance. Token efficiency improvements from learned STM management over RAG are modest (3-5% reduction in average prompt tokens).",
    322   "red_flags": [
    323     {
    324       "flag": "No uncertainty quantification",
    325       "detail": "All results in Table 2 are single point estimates with no standard deviations, confidence intervals, or error bars reported. Given that RL training involves stochastic rollouts (K=8 per task), variance across training seeds or evaluation runs could be substantial, and the reported average gains of 4.82 and 8.57 percentage points over the best baselines may not be statistically significant."
    326     },
    327     {
    328       "flag": "Alibaba-affiliated authors evaluating Alibaba tools",
    329       "detail": "Multiple authors are from Alibaba Group, the system is built on Alibaba's AgentScope framework, evaluated with Qwen models (Alibaba), and the LLM judge is Qwen-Max (also Alibaba). No competing interests or independence statement is provided. This creates a potential for confirmation bias in experimental design and metric selection."
    330     },
    331     {
    332       "flag": "LLM-as-judge circularity",
    333       "detail": "Memory Quality (MQ) is measured using Qwen-Max as the evaluator. The base models (Qwen2.5-7B, Qwen3-4B) are from the same family, and the training uses RL rewards partially computed by LLM judges. Using the same model family for both training rewards and evaluation metrics introduces circularity that could inflate AgeMem's apparent MQ advantage."
    334     },
    335     {
    336       "flag": "Benchmark contamination not addressed",
    337       "detail": "HotpotQA (2018), ALFWorld (2020), BabyAI (2018), and SciWorld (2022) are all long-established public benchmarks. The Qwen models' training data cutoffs are not stated, so it is possible the base models have seen these benchmarks, inflating the no-memory baseline and confounding the comparison."
    338     },
    339     {
    340       "flag": "No code release",
    341       "detail": "The AgeMem implementation is not released. Only the underlying frameworks (AgentScope, Trinity-RFT) are available, making reproduction of the specific AgeMem training procedure very difficult."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "A-Mem: Agentic Memory for LLM Agents",
    347       "authors": [
    348         "Wujiang Xu",
    349         "Zujie Liang",
    350         "Kai Mei",
    351         "Hang Gao",
    352         "Juntao Tan",
    353         "Yongfeng Zhang"
    354       ],
    355       "year": 2025,
    356       "arxiv_id": "2502.12110",
    357       "relevance": "Directly compared baseline for LLM agent long-term memory management using Zettelkasten-inspired design."
    358     },
    359     {
    360       "title": "Mem0: Building Production-Ready AI Agents with Scalable Long-Term Memory",
    361       "authors": [
    362         "Prateek Chhikara",
    363         "Dev Khant",
    364         "Saket Aryan",
    365         "Taranjeet Singh",
    366         "Deshraj Yadav"
    367       ],
    368       "year": 2025,
    369       "arxiv_id": "2504.19413",
    370       "relevance": "Directly compared baseline for scalable LTM management in AI agents."
    371     },
    372     {
    373       "title": "ReSum: Unlocking Long-Horizon Search Intelligence via Context Summarization",
    374       "authors": [
    375         "Xixi Wu",
    376         "Kuan Li",
    377         "Yida Zhao"
    378       ],
    379       "year": 2025,
    380       "arxiv_id": "2509.13313",
    381       "relevance": "Prior work on short-term memory management through context summarization, used as a comparison point."
    382     },
    383     {
    384       "title": "AgentBoard: An Analytical Evaluation Board of Multi-Turn LLM Agents",
    385       "authors": [
    386         "Chang Ma",
    387         "Junlei Zhang",
    388         "Zhihao Zhu",
    389         "Cheng Yang",
    390         "Yujiu Yang"
    391       ],
    392       "year": 2024,
    393       "relevance": "Benchmark used (PDDL) and citation for multi-turn LLM agent evaluation methodology."
    394     },
    395     {
    396       "title": "Memory-R1: Enhancing Large Language Model Agents to Manage and Utilize Memories via Reinforcement Learning",
    397       "authors": [
    398         "Sikuan Yan",
    399         "Xiufeng Yang",
    400         "Zuchao Huang"
    401       ],
    402       "year": 2025,
    403       "arxiv_id": "2508.19828",
    404       "relevance": "Related concurrent work on using RL for LLM agent memory management."
    405     },
    406     {
    407       "title": "Trinity-RFT: A General-Purpose and Unified Framework for Reinforcement Fine-Tuning of Large Language Models",
    408       "authors": [
    409         "Xuchen Pan",
    410         "Yanxi Chen",
    411         "Yushuo Chen"
    412       ],
    413       "year": 2025,
    414       "arxiv_id": "2505.17826",
    415       "relevance": "The RL training framework used in AgeMem experiments."
    416     },
    417     {
    418       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    419       "authors": [
    420         "Zhihong Shao",
    421         "Peiyi Wang",
    422         "Qihao Zhu"
    423       ],
    424       "year": 2024,
    425       "arxiv_id": "2402.03300",
    426       "relevance": "Source of the GRPO algorithm used for RL training in AgeMem."
    427     },
    428     {
    429       "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-Hop Question Answering",
    430       "authors": [
    431         "Zhilin Yang",
    432         "Peng Qi",
    433         "Saizheng Zhang",
    434         "Yoshua Bengio",
    435         "William Cohen"
    436       ],
    437       "year": 2018,
    438       "relevance": "Primary training and evaluation benchmark used in AgeMem experiments."
    439     },
    440     {
    441       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    442       "authors": [
    443         "Mohit Shridhar",
    444         "Xingdi Yuan",
    445         "Marc-Alexandre Cote"
    446       ],
    447       "year": 2020,
    448       "arxiv_id": "2010.03768",
    449       "relevance": "Key evaluation benchmark for embodied long-horizon agent tasks."
    450     },
    451     {
    452       "title": "How Memory Management Impacts LLM Agents: An Empirical Study of Experience-Following Behavior",
    453       "authors": [
    454         "Zidi Xiong",
    455         "Yuping Lin",
    456         "Wenya Xie"
    457       ],
    458       "year": 2025,
    459       "arxiv_id": "2505.16067",
    460       "relevance": "Empirical study on memory management impacts for LLM agents, directly relevant to survey scope."
    461     },
    462     {
    463       "title": "A Survey on the Memory Mechanism of Large Language Model-based Agents",
    464       "authors": [
    465         "Zeyu Zhang",
    466         "Quanyu Dai",
    467         "Xiaohe Bo"
    468       ],
    469       "year": 2025,
    470       "relevance": "Survey of memory mechanisms in LLM agents, highly relevant as a taxonomy reference."
    471     },
    472     {
    473       "title": "Zep: A Temporal Knowledge Graph Architecture for Agent Memory",
    474       "authors": [
    475         "Preston Rasmussen",
    476         "Pavlo Paliychuk",
    477         "Travis Beauvais"
    478       ],
    479       "year": 2025,
    480       "arxiv_id": "2501.13956",
    481       "relevance": "Competing approach to LLM agent memory using temporal knowledge graphs."
    482     },
    483     {
    484       "title": "AgentScope 1.0: A Developer-Centric Framework for Building Agentic Applications",
    485       "authors": [
    486         "Dawei Gao",
    487         "Zitao Li",
    488         "Yuexiang Xie"
    489       ],
    490       "year": 2025,
    491       "arxiv_id": "2508.16279",
    492       "relevance": "The agent scaffolding framework used to build AgeMem, relevant to agentic system infrastructure research."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs