scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24266B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dual Latent Memory for Visual Multi-agent System",
      6     "authors": [
      7       "Xinlei Yu",
      8       "Chengming Xu",
      9       "Zhangquan Chen",
     10       "Bo Yin",
     11       "Cheng Yang",
     12       "Yongbo He",
     13       "Yihao Hu",
     14       "Jiangning Zhang",
     15       "Cheng Tan",
     16       "Xiaobin Hu",
     17       "Shuicheng Yan"
     18     ],
     19     "year": 2026,
     20     "venue": "arXiv",
     21     "arxiv_id": "2602.00471",
     22     "doi": null
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Abstract claims about the scaling wall, 2.7-5.4% accuracy improvement, and 21.3-44.8% token reduction are all backed by Tables 1-3 and Figures 2, 6, and 11.",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Ablation study (Table 6) removes each component individually and the three-stage training recipe isolates each module, providing component-level causal evidence for the dual memory design.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Claims are anchored to VMAS on VQA-type tasks; the paper tests five backbones, four model sizes, six topologies, and includes an explicit cross-benchmark generalization study (Table 10).",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper does not discuss that the learnable compression modules and 230K-step PPO training add substantial learned capacity that could independently explain accuracy gains, separate from the dual-memory framing.",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Claims are framed in terms of benchmark accuracy and token counts, which match the actual measurements; no conflation of proxy metrics with broader capability claims.",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "There is no dedicated limitations section; the paper ends with a brief optimistic conclusion, and Appendix D.2 acknowledges three cells of slight drops without broader methodological caveats.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No threats to validity are discussed; potential confounds such as added trainable parameters, benchmark saturation, or PPO training instability are not addressed.",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The conclusion claims L2-VMAS offers 'a scalable path forward for more reliable and complicated VMAS' without stating what the results do not generalize to.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding acknowledgment or grant information appears anywhere in the paper.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Author affiliations are listed in the header: NUS, FDU, THU, DeepWisdom, ZJU, HNU, and Shanghai AI Lab.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No funding disclosed, so funder independence cannot be assessed.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests statement or financial disclosure is present in the paper.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Key terms including VMAS, scaling wall, dual latent memory, perception memory, thinking memory, and entropy-driven triggering are all explicitly defined or explained in context.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Three explicit contributions are stated: failure analysis of VMAS, dual latent memory synthesis, and proactive memory orchestration.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Appendix A situates L2-VMAS against VMAS paradigms and latent visual reasoning work, explicitly noting why prior latent-communication approaches (Zheng et al. 2025, Fu et al. 2025, Zou et al. 2025) are insufficient for the visual setting.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "GitHub URL provided in the abstract: https://github.com/YU-deep/L2-VMAS.",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "All benchmarks used (MMBench, MMStar, RealWorldQA, SimpleVQA, MuirBench, BLINK, MVBench, LVBench, GQA) are standard publicly available datasets.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper mentions 8 NVIDIA H200 141G GPUs and provides hyperparameters in Table 4, but no requirements.txt, Dockerfile, or software dependency specification is provided.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Implementation details are scattered across appendix subsections C.1-C.5 and D.1, but there are no step-by-step reproduction instructions that could be followed without significant guesswork.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "All results in Tables 1-10 are single point estimates; no confidence intervals, error bars, or standard deviations are reported.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No statistical significance tests are reported for any comparative accuracy claim across all tables.",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Effect sizes are reported as percentage improvements over baselines (e.g., 2.7-5.4% higher accuracy, 21.3-44.8% fewer tokens), with baseline absolute values provided for context.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The number of test examples per benchmark is not discussed, and no power analysis or justification of evaluation scale is provided.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No standard deviation or variance across runs is reported; all results appear to be single-run point estimates.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Two baselines are included: single-agent (no collaboration) and VMAS (text-based multi-agent collaboration with the same VLM backbones).",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Baselines use the same contemporary VLMs (Qwen3-VL-8B-Thinking, InternVL-3.5-8B, GLM-4.1V-9B-Thinking, LLaVA-OV-1.5-8B) as the proposed method.",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table 6 shows ablation results removing triggering, attribution, perception memory, and thinking memory individually on four benchmarks.",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Both task accuracy and total token consumption are reported throughout, providing a dual metric view of effectiveness and efficiency.",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Human evaluation is not applicable; all benchmarks use ground-truth labels for automated accuracy computation.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Training uses GQA; the paper explicitly states 'no exposure to the test benchmarks,' and the generalization study (Table 10) further validates on cross-benchmark held-out sets.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figure 5a breaks down results by task category (perception, thinking, mixed) on MMStar, and Figure 5b shows per-category memory triggering rates.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Appendix D.2 explicitly discusses three cells of slight accuracy drops (e.g., LLaVA-OV-1.5-8B on MMBench: -0.3%) and offers an explanation tied to benchmark saturation.",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Tables 1-2 report three cells with slight accuracy drops (-0.2% to -0.4%), which are acknowledged and discussed in Appendix D.2.",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Specific model versions are named throughout: GLM-4.1V-9B-Thinking, InternVL-3.5-8B, LLaVA-OV-1.5-8B, Qwen3-VL-8B-Instruct/Thinking, with arXiv references provided.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Only the GPT-5.1 extraction prompt used for transmission content categorization (Appendix B.1) is provided verbatim; the system prompts and task instructions for main agent evaluations are not disclosed.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Table 4 provides comprehensive hyperparameters: PPO clip range, max grad norm, target KL, gamma, GAE lambda, learning rates per stage, window length W=16, threshold λ=0.5, memory length L=8, granularity g=3, max capacity N=50, top-k=5.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The multi-agent scaffolding—memory synthesis and orchestration pipeline, four-stage orchestration workflow, three-stage PPO training, and six topology structures—is described in detail in Section 3 and Appendices C-D.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No preprocessing steps for GQA training data or the evaluation benchmarks are described; data is referenced but how it was filtered, formatted, or prepared is not documented.",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "All benchmarks (MMBench, MMStar, RealWorldQA, SimpleVQA, GQA, etc.) are standard publicly available datasets with independent access.",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The paper only names GQA as training data with 'no exposure to the test benchmarks'; no description of how data was prepared, filtered, or formatted is provided.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No human participant recruitment; the paper uses standard benchmarks only.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The full pipeline from raw benchmark data through preprocessing to evaluation metrics is not documented; only model architecture and training stages are described.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper does not state pre-training data cutoffs for any of the five base VLMs (the Qwen3-VL Instruct and Thinking variants, InternVL-3.5, GLM-4.1V, LLaVA-OV).",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The paper notes GQA fine-tuning had 'no exposure to test benchmarks' but does not discuss whether the base VLMs were pre-trained on any evaluation benchmarks (MMBench, MMStar, etc.).",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "Potential contamination of base VLMs with long-public benchmarks (MMBench has been available since 2023) during pre-training is not discussed.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Token usage is reported extensively as inference cost proxy throughout Tables 1-3, 7-9, showing 21.3-44.8% token reduction over VMAS.",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Hardware (8 NVIDIA H200 141G GPUs) is mentioned and training steps per stage are given (100k/80k/50k), but total GPU-hours or compute cost is not stated.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "Increasing agent turns in VMAS leads to performance falling below single-agent baseline by turn 6 (the 'scaling wall').",
    381       "evidence": "Figure 2 shows accuracy peaks at turn 3 (86.6%) then drops below the single-agent baseline (84.8%) starting at turn 6, reaching 82.2% at turn 10 on MMBench with Qwen3-VL-8B-Thinking. Replicated across four benchmarks in Figure 7.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Full-content text transmission is inferior to structured or conclusion-only transmission in later agent turns.",
    386       "evidence": "Figure 3 shows full-content transmission achieves +0.7% at turn 2 but -3.8% at turn 10, while conclusion-only consistently delivers +0.5-1.1% gains across all turns.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "L2-VMAS improves average accuracy by 2.7-5.4% across five VLM backbones over the text-based VMAS baseline.",
    391       "evidence": "Table 1 shows improvements from +2.7% (InternVL-3.5-8B) to +5.4% (Qwen3-VL-8B-Thinking) on the four-benchmark average.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "L2-VMAS reduces total token consumption by 21.3-44.8% compared to VMAS across five backbones.",
    396       "evidence": "Table 1 shows token reductions from -21.3% (LLaVA-OV-1.5-8B) to -44.8% (GLM-4.1V-9B-Thinking).",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Performance gains are consistent across 2B/4B/8B/32B model sizes and six multi-agent topologies.",
    401       "evidence": "Tables 2 and 3 show positive average improvements in all model size and topology combinations; no systematic failure modes except the three small negative cells noted in Appendix D.2.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Perception memory benefits perception tasks more than thinking tasks, and vice versa for thinking memory.",
    406       "evidence": "Figure 5a: perception memory boosts perception tasks by 5.5% vs 1.4% for thinking tasks; thinking memory boosts thinking tasks by 8.2% vs 1.8% for perception tasks on MMStar.",
    407       "supported": "strong"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval"
    412   ],
    413   "key_findings": "L2-VMAS demonstrates that the 'scaling wall' in Visual Multi-Agent Systems—where increasing agent turns degrades performance below single-agent baselines—can be overcome by replacing text-based inter-agent communication with dual latent memories decoupling perception and thinking. Across five VLM backbones on standard VQA benchmarks, the framework achieves 2.7-5.4% accuracy improvements and 21.3-44.8% token reductions over text-based VMAS. The gains are consistent across four model scales (2B-32B), six multi-agent topologies, and generalize to unseen benchmarks, though no error bars or significance tests are provided and the added learnable components represent an uncontrolled confound.",
    414   "red_flags": [
    415     {
    416       "flag": "No error bars or significance tests",
    417       "detail": "All results in Tables 1-10 are point estimates, apparently from single runs; no confidence intervals, variance across runs, or statistical significance tests are reported for any comparative claim."
    418     },
    419     {
    420       "flag": "Added parameters confound",
    421       "detail": "L2-VMAS adds learnable compression modules (C, Cmerge, Crefine), a gating router, and trains them with PPO over 230K steps; improvements may be partly attributable to these additional learned parameters rather than the dual-memory architecture per se, and this is never discussed."
    422     },
    423     {
    424       "flag": "No limitations section",
    425       "detail": "The paper has no dedicated limitations or threats-to-validity section; the conclusion is entirely optimistic about generalization."
    426     },
    427     {
    428       "flag": "Benchmark contamination unaddressed",
    429       "detail": "The pre-training contamination of the five base VLMs on evaluation benchmarks (MMBench has been public since 2023, MMStar since 2024) is not discussed, making it impossible to assess how much headroom exists."
    430     },
    431     {
    432       "flag": "No funding disclosure",
    433       "detail": "No funding source is disclosed anywhere in the paper."
    434     },
    435     {
    436       "flag": "Latent-communication baselines absent",
    437       "detail": "The paper compares only against single-agent and text-based VMAS, not against closely related latent-space communication methods (Zheng et al. NeurIPS 2025, Fu et al. 2025, Zou et al. 2025) described in its own related work as directly relevant."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Why do multi-agent LLM systems fail?",
    443       "relevance": "Analyzes failure modes of multi-agent LLM systems (Cemri et al. 2025), directly motivating the problem studied in this paper."
    444     },
    445     {
    446       "title": "Scaling large language model-based multi-agent collaboration",
    447       "relevance": "Demonstrates positive scaling in text-based multi-agent LLM systems (Qian et al., ICLR 2025), contrasting with the visual scaling wall found here."
    448     },
    449     {
    450       "title": "Thought communication in multiagent collaboration",
    451       "relevance": "Latent-space thought communication in multi-agent LLMs (Zheng et al., NeurIPS 2025); closely related but not compared against directly."
    452     },
    453     {
    454       "title": "Cache-to-cache: Direct semantic communication between large language models",
    455       "relevance": "Direct latent communication between LLMs (Fu et al. 2025); cited as related but insufficient for visual settings."
    456     },
    457     {
    458       "title": "Latent collaboration in multi-agent systems",
    459       "relevance": "Another latent communication approach (Zou et al. 2025) cited as related but not directly evaluated against."
    460     },
    461     {
    462       "title": "Large language models miss the multi-agent mark",
    463       "relevance": "Critical perspective on multi-agent LLM systems (Malfa et al., NeurIPS 2025); supports the paper's framing that current paradigms are insufficient."
    464     },
    465     {
    466       "title": "Towards a science of scaling agent systems",
    467       "relevance": "Examines scaling behavior of agent systems (Kim et al. 2025), directly relevant to the scaling wall phenomenon studied here."
    468     },
    469     {
    470       "title": "MMBench: Is your multi-modal model an all-around player?",
    471       "relevance": "Primary evaluation benchmark used throughout all main experiments."
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "Token reductions of 21.3-44.8% are directly actionable for deploying multi-agent VLMs in production at lower cost."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "The core finding that more agent turns actively hurts performance challenges the prevailing assumption that more collaboration is always better."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "The paper focuses purely on efficiency and accuracy; no safety or risk implications are discussed."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Positions itself against the dominant text-centric multi-agent paradigm, creating mild tension with a large body of existing work."
    490     },
    491     "demo_ability": {
    492       "score": 2,
    493       "justification": "Code released on GitHub and builds on publicly accessible VLMs (Qwen3-VL), enabling practitioners to reproduce and try the method."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "Authors from NUS, FDU, THU, and Shanghai AI Lab are moderately recognized; no tier-1 industrial lab affiliation."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [],
    502     "top_points": 0,
    503     "total_points": 0,
    504     "total_comments": 0
    505   }
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs