scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24759B)
      1 {
      2   "paper": {
      3     "title": "Dual Latent Memory for Visual Multi-agent System",
      4     "authors": ["Xinlei Yu", "Chengming Xu", "Zhangquan Chen", "Bo Yin", "Cheng Yang", "Yongbo He", "Yihao Hu", "Jiangning Zhang", "Cheng Tan", "Xiaobin Hu", "Shuicheng Yan"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.00471"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "The paper identifies a 'scaling wall' in visual multi-agent systems where increasing agent turns degrades performance while exponentially inflating token costs. The proposed L2-VMAS framework with dual latent memories (perception and thinking) improves average accuracy by 2.7-5.4% while reducing token usage by 21.3-44.8% across five VLM backbones. The method demonstrates consistent gains across four model sizes (2B-32B) and six multi-agent topologies, and shows strong scalability up to 10 agent turns where baselines degrade below single-agent performance.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The abstract states 'Codes: https://github.com/YU-deep/L2-VMAS' providing a GitHub repository URL."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available benchmarks (MMBench, MMStar, RealWorldQA, SimpleVQA, MuirBench, BLINK, MVBench, LVBench) and trains on the public GQA dataset. No proprietary data was collected."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions '8 NVIDIA H200 141G GPUs' but does not provide requirements.txt, Dockerfile, or detailed library versions for environment recreation."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-level reproduction guidance."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results tables (Tables 1-3, 5-10) report only point estimates with no confidence intervals, error bars, or ± notation."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims L2-VMAS outperforms baselines across multiple settings but provides no statistical significance tests — comparisons are based solely on point estimate differences."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '2.7-5.4% accuracy improvement' and '21.3-44.8% token reduction', with absolute values in tables showing both baseline and proposed method performance."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper gives no justification for the choice of benchmarks or their sizes, and provides no power analysis or discussion of whether the benchmark sizes are sufficient to detect the claimed improvements."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares against both single-agent and multi-agent (VMAS with text-based communication) baselines across all experiments."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines use contemporary VLMs including Qwen3-VL (2025), GLM-4.1V-Thinking (2025), InternVL-3.5-8B (2025), and LLaVA-OV-1.5-8B (2025)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 6 provides ablation removing individual components: triggering, attribution, perception memory, and thinking memory, showing each component's contribution."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports both accuracy and total token usage across all experiments, evaluating effectiveness and efficiency jointly."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant here — the paper evaluates on established automated visual QA benchmarks with ground-truth answers."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Training uses the GQA dataset with 'no exposure to the test benchmarks' (Section 4.1). Table 10 also shows generalization to unseen benchmarks."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per benchmark (Tables 1-3), per model size (Table 2), per multi-agent structure (Table 3), and per task type (perception/thinking/mixed in Figure 5)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Appendix D.2 discusses three cells where L2-VMAS slightly underperforms VMAS (LLaVA-OV on MMBench, Qwen3-VL-2B on MMBench and RealWorldQA), attributing this to performance ceiling saturation."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports cases where L2-VMAS slightly degrades performance (Table 1: LLaVA-OV-1.5-8B on MMBench drops 0.3%, Table 2: Qwen3-VL-2B drops on two benchmarks) and discusses why."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims of '2.7-5.4% accuracy improvement' and '21.3-44.8% token reduction' are directly supported by Table 1 results across five backbones."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims (e.g., 'dual latent memory improves performance') are supported by controlled ablation studies (Table 6) that remove individual components and measure the impact, constituting adequate single-variable manipulation."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims applicability to 'Visual Multi-agent System' generally, but results are limited to visual QA tasks on specific benchmarks. The paper does not test on open-ended generation, visual dialogue, or other VMAS application domains (e.g., embodied agents, robotics). The generalization claims in the title are broader than the evidence."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative explanations for the improvements. For example, the gains could partly be due to the additional learnable parameters in the memory modules rather than the dual memory design per se. No threats-to-validity or alternative explanations section exists."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper directly measures accuracy on QA benchmarks and token usage, and frames claims in terms of these specific metrics rather than broader unsubstantiated constructs."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model names with sizes are provided: GLM-4.1V-9B-Thinking, InternVL-3.5-8B, LLaVA-OV-1.5-8B, Qwen3-VL-8B-Thinking/Instruct, and specific size variants (2B/4B/8B/32B)."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The full prompt used for perception/thinking extraction via GPT-5.1 is provided in Appendix B.1. For the multi-agent setup, the system prompts are given by reference to the multi-agent settings of prior work."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 4 provides comprehensive three-stage training hyperparameters including learning rates, clip range, batch sizes, PPO parameters. Section 4.1 states W=16, λ=0.5, L=8, N=50."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The multi-agent structures (six topologies) are described in detail in Section 4.1 and Appendix D.1 with illustrations in Figure 9. The memory synthesis and orchestration pipeline is thoroughly described in Sections 3.2-3.3."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper describes how visual inputs are processed through hierarchical downsampling (Equations 4-5) and how thinking trajectories are chunked at entropy boundaries (Equation 6), and states that the training data comes from the GQA dataset."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 5) does not discuss limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed anywhere in the paper."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. It does not bound its claims to the specific QA benchmark setting."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (per-example predictions, logs) is made available. Only aggregated results are reported in tables."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Training uses the public GQA dataset. Evaluation benchmarks are all publicly available with citations provided (MMBench, MMStar, RealWorldQA, SimpleVQA, etc.)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data sources are standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The three-stage training pipeline is documented in detail (Table 4, Appendix C.5), with clear progression from memory synthesis to orchestration to joint training."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: NUS, FDU, THU, DeepWisdom, ZJU, HNU, Shanghai AI Lab."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed. Authors are from both academic institutions and DeepWisdom/Shanghai AI Lab (industry)."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses pre-trained VLMs (Qwen3-VL, GLM-4.1V, InternVL, LLaVA) without stating their training data cutoff dates."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the pre-trained VLM backbones may have seen the benchmark data during their pre-training."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Benchmarks like MMBench (2024) and others are publicly available and could be in the training data of the VLMs used. This contamination risk is not discussed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Token usage is reported for all experiments as a primary metric, showing both absolute token counts and percentage reductions (Tables 1-3, 7-9)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 4.1 states '8 NVIDIA H200 141G GPUs'. Table 4 provides training steps (100k/80k/50k) across three stages. However, total GPU hours are not explicitly stated."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multi-seed experiments reported. All results appear to be from single runs without seed sensitivity analysis."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is never explicitly stated. Results are presented without indicating how many runs produced them."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Sensitivity analyses (Tables 7-9) explore hyperparameter values but no search budget is reported. It is unclear how the final values (W=16, λ=0.5, L=8) were selected beyond what is shown."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Tables 7-9 show sensitivity analyses across multiple values for W, λ, and L, and the selected values correspond to the best or near-best accuracy-efficiency tradeoff shown in these tables."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Many comparisons are made across 5 backbones × 4 benchmarks × multiple structures but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors compare their L2-VMAS against their own implementation of the VMAS baseline without acknowledging potential bias in baseline implementation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Token usage (a compute proxy) is reported alongside accuracy for every experiment, and the paper explicitly frames the accuracy-cost tradeoff. Tables 7-9 show how different parameter settings affect both metrics."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper uses multiple benchmarks but does not discuss whether these benchmarks adequately measure the claimed 'comprehensive visual abilities' or whether improvements on QA translate to real VMAS applications."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Table 3 evaluates across six different multi-agent structures (linear, layered, centralized, random, complete, dynamic) using the same backbone, and comparisons use the same structure for VMAS vs L2-VMAS."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether the VLM backbones were trained on data that could include the benchmark problems."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup provides information not available in real usage scenarios."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of independence between training data (GQA) and test benchmarks, or between the VLM pre-training data and test sets."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention methods are applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "L2-VMAS improves average accuracy by 2.7-5.4% over text-based VMAS across five VLM backbones while reducing token usage by 21.3-44.8%.",
    364       "evidence": "Table 1 shows results across GLM-4.1V-9B-Thinking (+4.3%, -44.8%), InternVL-3.5-8B (+2.7%, -25.4%), LLaVA-OV-1.5-8B (+3.2%, -21.3%), Qwen3-VL-8B-Instruct (+3.7%, -23.8%), and Qwen3-VL-8B-Thinking (+5.4%, -43.6%).",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "Existing VMAS suffer from a 'scaling wall' where performance degrades after 3 agent turns and drops below single-agent baseline by turn 6.",
    369       "evidence": "Figure 2 and Section 2.1 show that accuracy peaks at 86.6 at turn 3 and then falls below the single-agent baseline (84.8) by turn 6, on MMBench with Qwen3-VL-8B-Thinking.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "L2-VMAS achieves consistent improvements across all six multi-agent topologies (2.7-6.3% gains).",
    374       "evidence": "Table 3 shows gains across linear (+2.7/3.7%), layered (+2.8/3.8%), centralized (+4.5/6.3%), random (+3.1/5.1%), complete (+3.1/5.5%), and dynamic (+3.7/5.4%) structures.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "L2-VMAS scales effectively to 10 agent turns with 13.6-15.1% improvement over single-agent, while baselines degrade.",
    379       "evidence": "Figure 6 and Section 4.3 show continued improvement at 10 turns for L2-VMAS while VMAS declines below single-agent level on RealWorldQA.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Each component (triggering, attribution, perception memory, thinking memory) contributes to the overall performance.",
    384       "evidence": "Table 6 ablation study shows removing attribution causes the largest drop (-2.3 to -2.8%) and removing thinking memory causes -1.5 to -3.2% drops.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No variance or uncertainty reporting",
    391       "detail": "All experiments report single-run point estimates without standard deviations, confidence intervals, or multi-seed results. Given the stochastic nature of both RL training and VLM inference, the claimed improvements (often 2-3%) could be within noise."
    392     },
    393     {
    394       "flag": "No statistical significance tests",
    395       "detail": "Dozens of comparisons are made across backbones, model sizes, structures, and benchmarks, but no statistical tests are performed. Small differences (e.g., 0.5% for InternVL on MMBench in Table 1) are presented as meaningful without any significance verification."
    396     },
    397     {
    398       "flag": "No limitations section",
    399       "detail": "The paper has no limitations, threats-to-validity, or broader impact section, despite being a systems paper with many design choices that could affect generalization."
    400     },
    401     {
    402       "flag": "Contamination risk unaddressed",
    403       "detail": "Pre-trained VLMs may have seen benchmark data during pre-training. Since the approach builds on these VLMs and claims to improve their performance on specific benchmarks, contamination could differentially affect results."
    404     },
    405     {
    406       "flag": "Additional parameters not controlled for",
    407       "detail": "L2-VMAS introduces learnable compression modules, gating networks, and refinement compressors — additional parameters that could explain improvements independent of the dual memory design. The ablation removes entire memory types but does not control for parameter count."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Why do multi-agent LLM systems fail?",
    413       "authors": ["Mert Cemri"],
    414       "year": 2025,
    415       "arxiv_id": "2503.13657",
    416       "relevance": "Analyzes failure modes of multi-agent LLM systems, directly relevant to understanding agentic workflow reliability."
    417     },
    418     {
    419       "title": "Scaling large language model-based multi-agent collaboration",
    420       "authors": ["Chen Qian"],
    421       "year": 2025,
    422       "relevance": "Studies scaling properties of LLM multi-agent collaboration, relevant to understanding agent coordination challenges."
    423     },
    424     {
    425       "title": "Large language models miss the multi-agent mark",
    426       "authors": ["Emanuele La Malfa"],
    427       "year": 2025,
    428       "relevance": "Evaluates multi-agent LLM performance, relevant to understanding limitations of LLM-based multi-agent systems."
    429     },
    430     {
    431       "title": "Thought communication in multiagent collaboration",
    432       "authors": ["Yubo Zheng"],
    433       "year": 2025,
    434       "relevance": "Explores latent thought communication between agents, directly relevant to agentic collaboration mechanisms."
    435     },
    436     {
    437       "title": "Cache-to-cache: Direct semantic communication between large language models",
    438       "authors": ["Tao Fu"],
    439       "year": 2025,
    440       "arxiv_id": "2510.03215",
    441       "relevance": "Proposes direct latent communication between LLMs, relevant to inter-agent communication efficiency."
    442     },
    443     {
    444       "title": "Latent collaboration in multi-agent systems",
    445       "authors": ["Jiaming Zou"],
    446       "year": 2025,
    447       "arxiv_id": "2511.20639",
    448       "relevance": "Studies latent-space collaboration between agents, directly relevant to non-textual agent communication."
    449     },
    450     {
    451       "title": "Cowpox: Towards the immunity of VLM-based multi-agent systems",
    452       "authors": ["Yixuan WU"],
    453       "year": 2025,
    454       "relevance": "Addresses security and robustness of VLM-based multi-agent systems, relevant to AI safety in agentic workflows."
    455     },
    456     {
    457       "title": "ChatDev: Communicative agents for software development",
    458       "authors": ["Chen Qian"],
    459       "year": 2023,
    460       "relevance": "Foundational work on multi-agent LLM collaboration for software engineering tasks."
    461     },
    462     {
    463       "title": "Improving factuality and reasoning in language models through multiagent debate",
    464       "authors": ["Yilun Du"],
    465       "year": 2023,
    466       "relevance": "Proposes multi-agent debate for improving LLM reasoning, relevant to multi-agent collaboration paradigms."
    467     },
    468     {
    469       "title": "Towards a science of scaling agent systems",
    470       "authors": ["Yoonsang Kim"],
    471       "year": 2025,
    472       "arxiv_id": "2512.08296",
    473       "relevance": "Investigates scaling laws for agent systems, relevant to understanding how multi-agent systems scale."
    474     }
    475   ]
    476 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs