scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24876B)
      1 {
      2   "paper": {
      3     "title": "Graph-based Agent Memory: Taxonomy, Techniques, and Applications",
      4     "authors": [
      5       "Chang Yang",
      6       "Chuang Zhou",
      7       "Yilin Xiao",
      8       "Su Dong",
      9       "Luyao Zhuang",
     10       "Yujing Zhang",
     11       "Zhu Wang",
     12       "Zijin Hong",
     13       "Zheng Yuan",
     14       "Zhishang Xiang",
     15       "Shengyuan Chen",
     16       "Huachi Zhou",
     17       "Qinggang Zhang",
     18       "Ninghao Liu",
     19       "Jinsong Su",
     20       "Xinrun Wang",
     21       "Yi Chang",
     22       "Xiao Huang"
     23     ],
     24     "year": 2026,
     25     "venue": "arXiv",
     26     "arxiv_id": "2602.05665"
     27   },
     28   "scan_version": 2,
     29   "active_modules": ["survey_methodology"],
     30   "checklist": {
     31     "artifacts": {
     32       "code_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper provides a GitHub link (https://github.com/DEEP-PolyU/Awesome-GraphMemory) described as collecting 'research papers, open-source data, and projects.' This is a curated resource list (awesome-list), not analysis code or scripts to reproduce the survey's methodology or findings."
     36       },
     37       "data_released": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The GitHub repository collects the surveyed papers, open-source data, and projects, making the survey's corpus publicly available as a curated resource."
     41       },
     42       "environment_specified": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No environment specifications are provided. For a survey, this would mean specifying tools used for literature search, analysis, or figure generation. None are described."
     46       },
     47       "reproduction_instructions": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No instructions are provided for reproducing the survey methodology — no description of how to replicate the paper search, selection, or categorization process."
     51       }
     52     },
     53     "statistical_methodology": {
     54       "confidence_intervals_or_error_bars": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "This is a survey/taxonomy paper with no experiments or quantitative analysis. No statistical results are reported."
     58       },
     59       "significance_tests": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No experiments or comparative statistical claims requiring significance tests. Pure literature survey."
     63       },
     64       "effect_sizes_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No experiments conducted. The paper is a literature survey with no quantitative comparisons of its own."
     68       },
     69       "sample_size_justified": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No experiments or samples. The paper is a qualitative literature review."
     73       },
     74       "variance_reported": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No experiments with multiple runs. The paper is a literature survey."
     78       }
     79     },
     80     "evaluation_design": {
     81       "baselines_included": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The survey does not systematically compare against prior surveys on agent memory. While it positions itself as 'comprehensive,' it does not benchmark its coverage, taxonomy, or methodology against existing surveys (e.g., CoALA, the AI Hippocampus survey, or other memory surveys cited in the references)."
     85       },
     86       "baselines_contemporary": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No baseline surveys are compared against, so there are no baselines whose recency could be assessed and the criterion is unmet. The paper does not compare its taxonomy or coverage with any prior survey."
     90       },
     91       "ablation_study": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "A survey paper has no system components to ablate."
     95       },
     96       "multiple_metrics": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "No experiments are conducted in this survey paper."
    100       },
    101       "human_evaluation": {
    102         "applies": false,
    103         "answer": false,
    104         "justification": "Human evaluation is not applicable to a literature survey paper."
    105       },
    106       "held_out_test_set": {
    107         "applies": false,
    108         "answer": false,
    109         "justification": "No experiments requiring train/test splits. This is a literature survey."
    110       },
    111       "per_category_breakdown": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper provides extensive per-category breakdowns: Figure 4 categorizes techniques by extraction/storage/retrieval/evolution, Figure 6 compares graph construction paradigms (KG, hierarchical, temporal, hypergraph, hybrid) with advantages and limitations, and Table I provides per-scenario benchmark breakdowns across 7 categories."
    115       },
    116       "failure_cases_discussed": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Figure 6 explicitly lists limitations for each graph type (e.g., KGs: 'High construction & maintenance cost,' 'Poor at dynamic updates'). Section X discusses limitations of current approaches including scalability bottlenecks, privacy vulnerabilities, and rigid schemas."
    120       },
    121       "negative_results_reported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper discusses shortcomings and limitations of each approach. For example, Section VI.A.1 lists clear limitations of similarity-based retrieval ('Similarity does not guarantee relevance,' 'Poor multi-hop reasoning,' 'Temporal awareness is often missing'). Section X identifies multiple unsolved challenges."
    125       }
    126     },
    127     "claims_and_evidence": {
    128       "abstract_claims_supported": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The abstract claims to present a 'comprehensive review,' propose a taxonomy, analyze key techniques, and summarize libraries and benchmarks. All four contributions listed in the abstract are substantiated by corresponding sections (III, IV-VII, VIII, X)."
    132       },
    133       "causal_claims_justified": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "The paper is a survey/taxonomy that summarizes existing work. It does not make its own causal claims from its own empirical data. Evaluative statements like 'graph-based memory demonstrated superior performance' are summaries of cited papers' findings, not the survey's own causal claims."
    137       },
    138       "generalization_bounded": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper's scope is clearly bounded to 'graph-based agent memory' for 'LLM-based agents,' as stated in the title, abstract, and throughout the paper. Claims are appropriately scoped to this domain."
    142       },
    143       "alternative_explanations_discussed": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "This is a pure survey/taxonomy paper that presents no empirical results of its own. Alternative explanations are not applicable."
    147       },
    148       "proxy_outcome_distinction": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "This is a survey paper with no measurements or proxy-outcome gaps to discuss."
    152       }
    153     },
    154     "setup_transparency": {
    155       "model_versions_specified": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No models are used in this survey paper."
    159       },
    160       "prompts_provided": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No prompting is used in this survey paper."
    164       },
    165       "hyperparameters_reported": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No experiments requiring hyperparameters. This is a literature survey."
    169       },
    170       "scaffolding_described": {
    171         "applies": false,
    172         "answer": false,
    173         "justification": "No agentic scaffolding is used in this survey paper."
    174       },
    175       "data_preprocessing_documented": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper provides no description of its literature search strategy, paper selection pipeline, databases queried, search keywords, time period covered, or inclusion/exclusion criteria. There is no PRISMA diagram or equivalent. The reader has no way to determine how papers were found, selected, or categorized."
    179       }
    180     },
    181     "limitations_and_scope": {
    182       "limitations_section_present": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section X is titled 'Limitations and Future Directions' and provides substantive discussion of multiple challenges."
    186       },
    187       "threats_to_validity_specific": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Section X discusses limitations of the FIELD (scalability, privacy, schema rigidity, etc.) but does not discuss limitations of the SURVEY ITSELF — e.g., potential selection bias in papers included, coverage gaps, limitations of the proposed taxonomy, or threats to the survey's own validity."
    191       },
    192       "scope_boundaries_stated": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper does not explicitly state what it does NOT cover. It claims to be 'comprehensive' but does not specify exclusion criteria (e.g., which agent types are out of scope, what time period was covered, whether non-English papers were excluded, etc.)."
    196       }
    197     },
    198     "data_integrity": {
    199       "raw_data_available": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "While the GitHub awesome-list provides curated references, the raw data of the survey — the full search results, selection decisions, categorization rationale for each paper — is not available for independent verification."
    203       },
    204       "data_collection_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No description is provided of how papers were collected for this survey. There is no mention of databases searched, search queries used, date ranges, or how the authors identified the papers to include."
    208       },
    209       "recruitment_methods_described": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "For a survey, 'recruitment' is the paper selection process. No description is provided of how papers were identified, screened, or selected for inclusion. The survey's corpus appears to be assembled ad-hoc."
    213       },
    214       "data_pipeline_documented": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No data pipeline is documented. There are no stages described from initial search to final inclusion, no counts at each filtering stage, and no systematic methodology for how papers were processed and categorized."
    218       }
    219     },
    220     "conflicts_of_interest": {
    221       "funding_disclosed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Neither a funding statement nor an acknowledgments section is present in the paper text."
    225       },
    226       "affiliations_disclosed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Author affiliations are clearly listed: The Hong Kong Polytechnic University, Xiamen University, Singapore Management University, and Jilin University."
    230       },
    231       "funder_independent_of_outcome": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No funding is disclosed, so independence of the funder from outcomes cannot be assessed. Absence of disclosure is not absence of conflict."
    235       },
    236       "financial_interests_declared": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No competing interests or financial interests statement is present in the paper."
    240       }
    241     },
    242     "contamination": {
    243       "training_cutoff_stated": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    247       },
    248       "train_test_overlap_discussed": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    252       },
    253       "benchmark_contamination_addressed": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    257       }
    258     },
    259     "human_studies": {
    260       "pre_registered": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this survey paper."
    264       },
    265       "irb_or_ethics_approval": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this survey paper."
    269       },
    270       "demographics_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this survey paper."
    274       },
    275       "inclusion_exclusion_criteria": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this survey paper."
    279       },
    280       "randomization_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this survey paper."
    284       },
    285       "blinding_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this survey paper."
    289       },
    290       "attrition_reported": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants in this survey paper."
    294       }
    295     },
    296     "cost_and_practicality": {
    297       "inference_cost_reported": {
    298         "applies": false,
    299         "answer": false,
    300         "justification": "This is a survey paper with no system or method requiring cost reporting."
    301       },
    302       "compute_budget_stated": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "This is a survey paper with no computational experiments."
    306       }
    307     },
    308     "survey_methodology": {
    309       "prisma_or_structured_protocol": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The paper does not follow PRISMA or any structured review protocol. There is no flow diagram, no protocol registration, no reproducible search strategy, and no documented inclusion/exclusion criteria. The paper collection appears ad-hoc."
    313       },
    314       "quality_assessment_of_sources": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The survey does not assess the methodological quality of any source paper. All cited works are treated equally regardless of their rigor, reproducibility, or evidence quality. Papers are categorized by what they propose, not by how well they validated their claims."
    318       },
    319       "publication_bias_discussed": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No discussion of publication bias. The survey does not consider whether the papers it reviews skew toward positive results, whether negative results in graph-based memory are underrepresented, or whether the surveyed literature has systematic biases."
    323       }
    324     }
    325   },
    326   "claims": [
    327     {
    328       "claim": "Graph-based agent memory provides a unified and general perspective that subsumes traditional memory paradigms as degenerate cases (e.g., linear buffer as a chain graph, vector memory as a fully-connected similarity-weighted graph).",
    329       "evidence": "Section III.E presents the theoretical argument with specific mappings: 'a linear buffer corresponds to a chain within a graph, and a vector memory can be interpreted as a fully-connected graph with similarity-weighted edges.'",
    330       "supported": "moderate"
    331     },
    332     {
    333       "claim": "Graph-based memory architectures have demonstrated superior performance in applications requiring multi-session coherence, personalized adaptation, complex task planning, and hallucination reduction.",
    334       "evidence": "Section III.E makes this claim citing various works (knowledge graphs, hypergraphs). No systematic comparison or meta-analysis is provided to quantify 'superior performance' — individual papers are cited anecdotally.",
    335       "supported": "weak"
    336     },
    337     {
    338       "claim": "The survey identifies and compares 11 open-source memory libraries across key functional dimensions.",
    339       "evidence": "Table IV in Appendix B provides a structured comparison matrix of 11 libraries (Cognee, LangMem, Mem0, LightMem, O-Mem, OpenMemory, Memori, MemMachine, Memary, Graphiti, Memvid) across dimensions including graph support, retrieval, lifecycle management, temporality, and agent integration.",
    340       "supported": "strong"
    341     },
    342     {
    343       "claim": "The survey provides a comprehensive benchmark landscape with 50+ benchmarks categorized into 7 scenarios for evaluating memory-augmented agents.",
    344       "evidence": "Table I lists over 50 benchmarks organized into Interaction, Personalization, Web, LongContext, Continual, Environments, and Tool/Gen scenarios with modality, features, environment type, and memory type annotations.",
    345       "supported": "strong"
    346     },
    347     {
    348       "claim": "Internal self-evolving mechanisms can maintain memory consistency through memory consolidation, graph reasoning, and graph reorganization without external input.",
    349       "evidence": "Section VII.A describes three mechanisms (consolidation via graph merging, latent link prediction for graph reasoning, significance-based pruning for reorganization) with references to specific systems (Mem0, FLEX, Reflexion, MemoryBank). The claims are backed by references, but the survey provides no independent validation of them.",
    350       "supported": "moderate"
    351     }
    352   ],
    353   "methodology_tags": ["meta-analysis"],
    354   "key_findings": "This survey provides a comprehensive taxonomy of graph-based agent memory organized around four lifecycle stages: extraction, storage, retrieval, and evolution. It categorizes memory storage into five graph types (knowledge graph, hierarchical, temporal, hypergraph, hybrid) with trade-off analysis, identifies six retrieval paradigms (similarity-based, rule-based, temporal, graph-based, RL-based, agent-based), and maps 50+ benchmarks across 7 application scenarios. The survey argues that graph-based memory subsumes traditional flat/vector memory as degenerate cases and is the frontier paradigm for 2025-2026 agent memory research.",
    355   "red_flags": [
    356     {
    357       "flag": "No systematic search methodology",
    358       "detail": "The survey provides no description of databases searched, search queries, time period, or inclusion/exclusion criteria. Papers appear to be collected ad-hoc rather than through a reproducible systematic search. This makes it impossible to assess completeness or bias in coverage."
    359     },
    360     {
    361       "flag": "No quality assessment of surveyed papers",
    362       "detail": "All surveyed papers are treated equally regardless of methodological quality. The survey summarizes what each paper proposes without evaluating whether the claims are validated, reproducible, or methodologically sound. This erases differences in evidence quality among its sources — a well-validated system and a paper with only anecdotal evidence receive equal treatment."
    363     },
    364     {
    365       "flag": "Overclaiming comprehensiveness",
    366       "detail": "The paper repeatedly claims to be 'comprehensive' (abstract, Section I contributions) but provides no evidence of completeness. Without a systematic search strategy, the claim of comprehensiveness cannot be verified or falsified."
    367     },
    368     {
    369       "flag": "Self-reflective limitations absent",
    370       "detail": "Section X discusses limitations of the field (scalability, privacy, etc.) but does not discuss limitations of the survey itself — e.g., potential selection bias, coverage gaps, limitations of the proposed taxonomy, or whether the categorization scheme may be incomplete or overlapping."
    371     }
    372   ],
    373   "cited_papers": [
    374     {
    375       "title": "SWE-agent: agent-computer interfaces enable automated software engineering",
    376       "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig", "K. Lieret", "S. Yao", "K. Narasimhan", "O. Press"],
    377       "year": 2024,
    378       "relevance": "Core agent system for software engineering benchmark evaluation, relevant to code agent memory and SWE-bench."
    379     },
    380     {
    381       "title": "Reflexion: language agents with verbal reinforcement learning",
    382       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"],
    383       "year": 2023,
    384       "relevance": "Foundational work on agent self-improvement through verbal memory and reflection, key technique for experience-based memory extraction and evolution."
    385     },
    386     {
    387       "title": "Voyager: An open-ended embodied agent with large language models",
    388       "authors": ["G. Wang", "Y. Xie", "Y. Jiang", "A. Mandlekar", "C. Xiao", "Y. Zhu", "L. Fan", "A. Anandkumar"],
    389       "year": 2024,
    390       "relevance": "Introduces lifelong learning for game agents with procedural memory as executable code libraries, influential for agent skill acquisition."
    391     },
    392     {
    393       "title": "Cognitive architectures for language agents",
    394       "authors": ["T. Sumers", "S. Yao", "K. R. Narasimhan", "T. L. Griffiths"],
    395       "year": 2024,
    396       "relevance": "Foundational framework (CoALA) for agent architecture including memory systems; defines the cognitive structure of agent memory."
    397     },
    398     {
    399       "title": "MemGPT: Towards LLMs as operating systems",
    400       "authors": ["C. Packer", "S. Wooders", "K. Lin", "V. Fang", "S. G. Patil", "I. Stoica", "J. E. Gonzalez"],
    401       "year": 2023,
    402       "arxiv_id": "2310.08560",
    403       "relevance": "Proposes virtual memory management for LLM agents, enabling context beyond the window limit — key work on agent memory architecture."
    404     },
    405     {
    406       "title": "Mem0: Building production-ready ai agents with scalable long-term memory",
    407       "authors": ["P. Chhikara", "D. Khant", "S. Aryan", "T. Singh", "D. Yadav"],
    408       "year": 2025,
    409       "arxiv_id": "2504.19413",
    410       "relevance": "Production-oriented graph-based memory system with KG extraction, update, and retrieval — central to the survey's analysis of open-source memory tools."
    411     },
    412     {
    413       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    414       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"],
    415       "year": 2024,
    416       "relevance": "Major benchmark for evaluating code agent capabilities, used to assess tool/gen memory requirements."
    417     },
    418     {
    419       "title": "GAIA: a benchmark for general AI assistants",
    420       "authors": ["G. Mialon", "C. Fourrier", "T. Wolf", "Y. LeCun", "T. Scialom"],
    421       "year": 2023,
    422       "relevance": "Benchmark for evaluating general agent capabilities including deep research tasks that require process memory."
    423     },
    424     {
    425       "title": "WebArena: A realistic web environment for building autonomous agents",
    426       "authors": ["S. Zhou", "F. F. Xu", "H. Zhu", "X. Zhou", "R. Lo", "A. Sridhar"],
    427       "year": 2024,
    428       "relevance": "Realistic web benchmark that tests experiential memory for caching page states and tracking environmental state across steps."
    429     },
    430     {
    431       "title": "Zep: a temporal knowledge graph architecture for agent memory",
    432       "authors": ["P. Rasmussen", "P. Paliychuk", "T. Beauvais", "J. Ryan", "D. Chalef"],
    433       "year": 2025,
    434       "arxiv_id": "2501.13956",
    435       "relevance": "Key temporal graph memory system implementing bi-temporal modeling for agent memory, central to the survey's storage and retrieval analysis."
    436     },
    437     {
    438       "title": "ExpeL: LLM agents are experiential learners",
    439       "authors": ["A. Zhao", "D. Huang", "Q. Xu", "M. Lin", "Y.-J. Liu", "G. Huang"],
    440       "year": 2024,
    441       "relevance": "Demonstrates experience-based memory evolution through success/failure differentiation, key work on external self-exploration in agent memory."
    442     },
    443     {
    444       "title": "The Virtual Lab of AI agents designs new SARS-CoV-2 nanobodies",
    445       "authors": ["K. Swanson", "W. Wu", "N. L. Bulaong", "J. E. Pak", "J. Zou"],
    446       "year": 2025,
    447       "relevance": "Demonstrates multi-agent scientific collaboration with memory, relevant to agentic AI capabilities and science agent applications."
    448     },
    449     {
    450       "title": "Optimus-1: Hybrid multimodal memory empowered agents excel in long-horizon tasks",
    451       "authors": ["Z. Li", "Y. Xie", "R. Shao", "G. Chen", "D. Jiang", "L. Nie"],
    452       "year": 2024,
    453       "relevance": "Proposes hybrid memory combining hierarchical knowledge graph with multimodal experience pool, key example of knowledge-experience decoupling in agent memory."
    454     }
    455   ]
    456 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs