ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (30039B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dual Latent Memory for Visual Multi-agent System",
      6     "authors": [
      7       "Xinlei Yu",
      8       "Chengming Xu",
      9       "Zhangquan Chen",
     10       "Bo Yin",
     11       "Cheng Yang",
     12       "Yongbo He",
     13       "Yihao Hu",
     14       "Jiangning Zhang",
     15       "Cheng Tan",
     16       "Xiaobin Hu",
     17       "Shuicheng Yan"
     18     ],
     19     "year": 2026,
     20     "venue": "arXiv",
     21     "arxiv_id": "2602.00471",
     22     "doi": null
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Abstract claims of '2.7-5.4% accuracy improvement' and '21.3-44.8% token reduction' are directly supported by Table 1 results across five backbones.",
     30         "source": "opus"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Causal claims (e.g., 'dual latent memory improves performance') are supported by controlled ablation studies (Table 6) that remove individual components and measure the impact, constituting adequate single-variable manipulation.",
     36         "source": "opus"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The title claims applicability to 'Visual Multi-agent System' generally, but results are limited to visual QA tasks on specific benchmarks. The paper does not test on open-ended generation, visual dialogue, or other VMAS application domains (e.g., embodied agents, robotics). The generalization claims in the title are broader than the evidence.",
     42         "source": "opus"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper does not discuss alternative explanations for the improvements. For example, the gains could partly be due to the additional learnable parameters in the memory modules rather than the dual memory design per se. No threats-to-validity or alternative explanations section exists.",
     48         "source": "opus"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper directly measures accuracy on QA benchmarks and token usage, and frames claims in terms of these specific metrics rather than broader unsubstantiated constructs.",
     54         "source": "opus"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 5) does not discuss limitations.",
     62         "source": "opus"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No threats to validity are discussed anywhere in the paper.",
     68         "source": "opus"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. It does not bound its claims to the specific QA benchmark setting.",
     74         "source": "opus"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding sources or acknowledgments section is present in the paper.",
     82         "source": "opus"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Author affiliations are listed: NUS, FDU, THU, DeepWisdom, ZJU, HNU, Shanghai AI Lab.",
     88         "source": "opus"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No funding information is disclosed, so independence cannot be assessed. Authors are from both academic institutions and DeepWisdom/Shanghai AI Lab (industry).",
     94         "source": "opus"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests or financial interests statement is present in the paper.",
    100         "source": "opus"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "VMAS is defined with formal graph notation (SA, SE), 'scaling wall' is defined empirically with quantitative thresholds, and the dual latent memory components (perception memory MP vs. thinking memory MT) are precisely described with formal equations.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Three explicit contributions are stated in Section 1: failure analysis of VMAS, dual latent memory synthesis, and proactive memory orchestration, each with clear technical scope and motivation.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Appendix A provides a detailed related work section explicitly situating L2-VMAS against concurrent latent-space communication works (Zheng et al., Fu et al., Zou et al.) and explaining why those cannot be directly applied to VMAS due to visual input challenges.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The abstract states 'Codes: https://github.com/YU-deep/L2-VMAS' providing a GitHub repository URL.",
    131           "source": "opus"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper uses publicly available benchmarks (MMBench, MMStar, RealWorldQA, SimpleVQA, MuirBench, BLINK, MVBench, LVBench) and trains on the public GQA dataset. No proprietary data was collected.",
    137           "source": "opus"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper mentions '8 NVIDIA H200 141G GPUs' but does not provide requirements.txt, Dockerfile, or detailed library versions for environment recreation.",
    143           "source": "opus"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-level reproduction guidance.",
    149           "source": "opus"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "All results tables (Tables 1-3, 5-10) report only point estimates with no confidence intervals, error bars, or ± notation.",
    157           "source": "opus"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper claims L2-VMAS outperforms baselines across multiple settings but provides no statistical significance tests — comparisons are based solely on point estimate differences.",
    163           "source": "opus"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '2.7-5.4% accuracy improvement' and '21.3-44.8% token reduction', with absolute values in tables showing both baseline and proposed method performance.",
    169           "source": "opus"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No justification for the choice of benchmarks or their sizes. No power analysis or discussion of whether the benchmark sizes are sufficient to detect the claimed improvements.",
    175           "source": "opus"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers.",
    181           "source": "opus"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The paper compares against both single-agent and multi-agent (VMAS with text-based communication) baselines across all experiments.",
    189           "source": "opus"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Baselines use contemporary VLMs including Qwen3-VL (2025), GLM-4.1V-Thinking (2025), InternVL-3.5-8B (2025), and LLaVA-OV-1.5-8B (2025).",
    195           "source": "opus"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table 6 provides ablation removing individual components: triggering, attribution, perception memory, and thinking memory, showing each component's contribution.",
    201           "source": "opus"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "The paper reports both accuracy and total token usage across all experiments, evaluating effectiveness and efficiency jointly.",
    207           "source": "opus"
    208         },
    209         "human_evaluation": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Human evaluation is not relevant here — the paper evaluates on established automated visual QA benchmarks with ground-truth answers.",
    213           "source": "opus"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Training uses the GQA dataset with 'no exposure to the test benchmarks' (Section 4.1). Table 10 also shows generalization to unseen benchmarks.",
    219           "source": "opus"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Results are broken down per benchmark (Tables 1-3), per model size (Table 2), per multi-agent structure (Table 3), and per task type (perception/thinking/mixed in Figure 5).",
    225           "source": "opus"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Appendix D.2 discusses three cells where L2-VMAS slightly underperforms VMAS (LLaVA-OV on MMBench, Qwen3-VL-2B on MMBench and RealWorldQA), attributing this to performance ceiling saturation.",
    231           "source": "opus"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "The paper reports cases where L2-VMAS slightly degrades performance (Table 1: LLaVA-OV-1.5-8B on MMBench drops 0.3%, Table 2: Qwen3-VL-2B drops on two benchmarks) and discusses why.",
    237           "source": "opus"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Specific model names with sizes are provided: GLM-4.1V-9B-Thinking, InternVL-3.5-8B, LLaVA-OV-1.5-8B, Qwen3-VL-8B-Thinking/Instruct, and specific size variants (2B/4B/8B/32B).",
    245           "source": "opus"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "The full prompt used for perception/thinking extraction via GPT-5.1 is provided in Appendix B.1. System prompts for the multi-agent setup are provided by reference to the multi-agent settings of prior work.",
    251           "source": "opus"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Table 4 provides comprehensive three-stage training hyperparameters including learning rates, clip range, batch sizes, PPO parameters. Section 4.1 states W=16, λ=0.5, L=8, N=50.",
    257           "source": "opus"
    258         },
    259         "scaffolding_described": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The multi-agent structures (six topologies) are described in detail in Section 4.1 and Appendix D.1 with illustrations in Figure 9. The memory synthesis and orchestration pipeline is thoroughly described in Sections 3.2-3.3.",
    263           "source": "opus"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The paper describes how visual inputs are processed through hierarchical downsampling (Equations 4-5) and how thinking trajectories are chunked at entropy boundaries (Equation 6), and states that the training data comes from the GQA dataset.",
    269           "source": "opus"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "No raw experimental data (per-example predictions, logs) is made available. Only aggregated results are reported in tables.",
    277           "source": "opus"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Training uses the public GQA dataset. Evaluation benchmarks are all publicly available with citations provided (MMBench, MMStar, RealWorldQA, SimpleVQA, etc.).",
    283           "source": "opus"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No human participants. Data sources are standard public benchmarks.",
    289           "source": "opus"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "The three-stage training pipeline is documented in detail (Table 4, Appendix C.5), with clear progression from memory synthesis to orchestration to joint training.",
    295           "source": "opus"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper uses pre-trained VLMs (Qwen3-VL, GLM-4.1V, InternVL, LLaVA) without stating their training data cutoff dates.",
    303           "source": "opus"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of whether the pre-trained VLM backbones may have seen the benchmark data during their pre-training.",
    309           "source": "opus"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "Benchmarks like MMBench (2024) and others are publicly available and could be in the training data of the VLMs used. This contamination risk is not discussed.",
    315           "source": "opus"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants in this study.",
    359           "source": "opus"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Token usage is reported for all experiments as a primary metric, showing both absolute token counts and percentage reductions (Tables 1-3, 7-9).",
    367           "source": "opus"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": true,
    372           "justification": "Section 4.1 states '8 NVIDIA H200 141G GPUs'. Table 4 provides training steps (100k/80k/50k) across three stages. However, total GPU hours are not explicitly stated.",
    373           "source": "opus"
    374         }
    375       },
    376       "experimental_rigor": {
    377         "seed_sensitivity_reported": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "No multi-seed experiments reported. All results appear to be from single runs without seed sensitivity analysis.",
    381           "source": "opus"
    382         },
    383         "number_of_runs_stated": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "The number of experimental runs is never explicitly stated. Results are presented without indicating how many runs produced them.",
    387           "source": "opus"
    388         },
    389         "hyperparameter_search_budget": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "Sensitivity analyses (Tables 7-9) explore hyperparameter values but no search budget is reported. It is unclear how the final values (W=16, λ=0.5, L=8) were selected beyond what is shown.",
    393           "source": "opus"
    394         },
    395         "best_config_selection_justified": {
    396           "applies": true,
    397           "answer": true,
    398           "justification": "Tables 7-9 show sensitivity analyses across multiple values for W, λ, and L, and the selected values correspond to the best or near-best accuracy-efficiency tradeoff shown in these tables.",
    399           "source": "opus"
    400         },
    401         "multiple_comparison_correction": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "Many comparisons are made across 5 backbones × 4 benchmarks × multiple structures but no statistical tests are performed at all, let alone corrections for multiple comparisons.",
    405           "source": "opus"
    406         },
    407         "self_comparison_bias_addressed": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "The authors compare their L2-VMAS against their own implementation of the VMAS baseline without acknowledging potential bias in baseline implementation.",
    411           "source": "opus"
    412         },
    413         "compute_budget_vs_performance": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "Token usage (a compute proxy) is reported alongside accuracy for every experiment, and the paper explicitly frames the accuracy-cost tradeoff. Tables 7-9 show how different parameter settings affect both metrics.",
    417           "source": "opus"
    418         },
    419         "benchmark_construct_validity": {
    420           "applies": true,
    421           "answer": false,
    422           "justification": "The paper uses multiple benchmarks but does not discuss whether these benchmarks adequately measure the claimed 'comprehensive visual abilities' or whether improvements on QA translate to real VMAS applications.",
    423           "source": "opus"
    424         },
    425         "scaffold_confound_addressed": {
    426           "applies": true,
    427           "answer": true,
    428           "justification": "Table 3 evaluates across six different multi-agent structures (linear, layered, centralized, random, complete, dynamic) using the same backbone, and comparisons use the same structure for VMAS vs L2-VMAS.",
    429           "source": "opus"
    430         }
    431       },
    432       "data_leakage": {
    433         "temporal_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the VLM backbones were trained on data that could include the benchmark problems.",
    437           "source": "opus"
    438         },
    439         "feature_leakage_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether the evaluation setup provides information not available in real usage scenarios.",
    443           "source": "opus"
    444         },
    445         "non_independence_addressed": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No discussion of independence between training data (GQA) and test benchmarks, or between the VLM pre-training data and test sets.",
    449           "source": "opus"
    450         },
    451         "leakage_detection_method": {
    452           "applies": true,
    453           "answer": false,
    454           "justification": "No leakage detection or prevention methods are applied.",
    455           "source": "opus"
    456         }
    457       }
    458     }
    459   },
    460   "claims": [
    461     {
    462       "claim": "Visual Multi-Agent Systems exhibit a 'scaling wall': accuracy peaks at agent turn 3 then degrades continuously, falling 2.6% below the single-agent baseline by turn 10, while token consumption grows 30x.",
    463       "evidence": "Figure 2 on MMBench with Qwen3-VL-8B-Thinking: accuracy 84.8 (single) → 86.6 (turn 3) → 82.2 (turn 10); tokens 557 (single) → 16,840 (turn 10). Replicated across 4 benchmarks in Figure 7.",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Text-centric inter-agent communication with conflated perception and thinking is the primary cause of VMAS performance degradation.",
    468       "evidence": "Figure 3: Full-content transmission causes -3.8% accuracy by turn 10; Conclusion-only achieves consistent +0.5-1.1% gains; perception-only and thinking-only outperform full-content at later turns.",
    469       "supported": "moderate"
    470     },
    471     {
    472       "claim": "L2-VMAS improves average accuracy by 2.7-5.4% and reduces total token usage by 21.3-44.8% compared to text-based VMAS across 5 VLM backbones.",
    473       "evidence": "Table 1 confirms all five backbones: InternVL-3.5 +2.7%/-25.4%, LLaVA-OV +3.2%/-21.3%, Qwen3-VL-Instruct +3.7%/-23.8%, GLM-4.1V +4.3%/-44.8%, Qwen3-VL-Thinking +5.4%/-43.6%.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "L2-VMAS maintains consistent gains across model sizes (2B-32B) and all 6 multi-agent topologies tested.",
    478       "evidence": "Table 2 shows Qwen3-VL improvements from +1.3% (2B Instruct) to +5.4% (8B Thinking); Table 3 shows stable gains across linear, layered, centralized, random, complete, and dynamic structures (2.7-6.3%).",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "Decoupled perception and thinking memories specialize in their respective task types: perception memory improves perception tasks by 5.5% and thinking memory improves reasoning tasks by 8.2%.",
    483       "evidence": "Figure 5a on MMStar benchmark subsets shows per-memory-type accuracy gains; combined memory achieves highest mixed-task gain of 7.9%.",
    484       "supported": "moderate"
    485     },
    486     {
    487       "claim": "L2-VMAS enables positive scaling with more agent turns, achieving 13.6-15.1% improvement over single-agent at 10 turns versus VMAS which falls below single-agent.",
    488       "evidence": "Figures 6 and 11 show L2-VMAS accuracy curves increasing monotonically with turns across 4 benchmarks while VMAS curves peak at turn 3 and decline.",
    489       "supported": "strong"
    490     }
    491   ],
    492   "methodology_tags": [
    493     "benchmark-eval"
    494   ],
    495   "key_findings": "This paper identifies and quantifies a 'scaling wall' in Visual Multi-Agent Systems where accuracy peaks at 3 agent turns then degrades while token costs grow 30x, and traces the cause to the lossy serialization of visual perception and cognitive reasoning into text during inter-agent communication. The proposed L2-VMAS framework uses separate latent perception and thinking memories with entropy-driven proactive retrieval, consistently improving accuracy by 2.7-5.4% and reducing token usage by 21.3-44.8% across 5 VLM backbones, 4 model sizes, and 6 topologies. The decoupled memory design shows task-type specialization: perception memory disproportionately benefits visual tasks while thinking memory benefits reasoning tasks. The method's scalability is demonstrated through positive scaling with agent turns where baseline VMAS degrades, achieving 13.6-19.2% gains over single-agent at 10 turns.",
    496   "red_flags": [
    497     {
    498       "flag": "No variance across runs",
    499       "detail": "All results across 10+ tables are single-run point estimates; no standard deviations, confidence intervals, or multi-run stability metrics are reported, making reliability assessment impossible."
    500     },
    501     {
    502       "flag": "No statistical significance tests",
    503       "detail": "Despite extensive comparative claims across backbones, sizes, topologies, and benchmarks, no statistical significance tests are applied to any comparison."
    504     },
    505     {
    506       "flag": "No limitations section",
    507       "detail": "The paper lacks a dedicated limitations or threats-to-validity section; the scope restriction to VQA benchmarks is never acknowledged despite broad VMAS framing."
    508     },
    509     {
    510       "flag": "No funding disclosure",
    511       "detail": "The paper has 11 authors from 7+ institutions, including the commercial entity DeepWisdom, but no funding acknowledgment or competing-interests declaration appears anywhere in the paper."
    512     },
    513     {
    514       "flag": "Token cost metric incomplete",
    515       "detail": "Token count is used as a proxy for computational cost, but the added overhead of the compression transformer modules (three separate compressors), memory storage, and retrieval in L2-VMAS is never quantified, making efficiency comparisons potentially misleading."
    516     },
    517     {
    518       "flag": "Base VLM contamination unaddressed",
    519       "detail": "Frozen base VLMs (Qwen3-VL, InternVL-3.5, GLM-4.1V) may have been trained on evaluation benchmark data; training cutoffs are not stated and benchmark contamination for pre-trained models is not discussed."
    520     }
    521   ],
    522   "cited_papers": [
    523     {
    524       "title": "Scaling large language model-based multi-agent collaboration",
    525       "relevance": "ICLR 2025 paper showing positive scaling in text-based LLM multi-agent systems, contrasted with the paper's finding that VMAS scaling fails"
    526     },
    527     {
    528       "title": "Why do multi-agent llm systems fail?",
    529       "relevance": "Systematic failure analysis of multi-agent LLM systems, directly relevant to the paper's empirical characterization of the VMAS scaling wall"
    530     },
    531     {
    532       "title": "Thought communication in multiagent collaboration",
    533       "relevance": "NeurIPS 2025 paper on latent-space communication for MAS, prior work on the same bottleneck L2-VMAS addresses for the visual domain"
    534     },
    535     {
    536       "title": "Cache-to-cache: Direct semantic communication between large language models",
    537       "relevance": "Prior work on bypassing text bottlenecks via KV-cache sharing, related approach that doesn't address visual VMAS challenges"
    538     },
    539     {
    540       "title": "Latent collaboration in multi-agent systems",
    541       "relevance": "Concurrent work on latent space collaboration in MAS, explained as inapplicable to VMAS due to visual input complexity"
    542     },
    543     {
    544       "title": "Large language models miss the multi-agent mark",
    545       "relevance": "NeurIPS 2025 critique of multi-agent LLM effectiveness, provides theoretical grounding for the paper's empirical failure analysis"
    546     },
    547     {
    548       "title": "G-designer: Architecting multi-agent communication topologies via graph neural networks",
    549       "relevance": "Source of the dynamic multi-agent topology used as one of the six structures evaluated in the multi-agent structure experiments (Table 3)"
    550     },
    551     {
    552       "title": "Towards a science of scaling agent systems",
    553       "relevance": "Work on scaling laws for agent systems, relevant contrast to the paper's finding that naive VMAS scaling fails"
    554     }
    555   ],
    556   "engagement_factors": {
    557     "practical_relevance": {
    558       "score": 2,
    559       "justification": "Multi-agent visual pipelines are increasingly deployed in practice, but L2-VMAS requires training custom compression transformers via 3-stage PPO on H200 GPUs, making immediate adoption non-trivial."
    560     },
    561     "surprise_contrarian": {
    562       "score": 3,
    563       "justification": "The 'scaling wall' finding that more agent collaboration turns lead to worse performance directly contradicts the prevailing assumption that multi-agent systems compound benefits; the degradation is accompanied by a 30x growth in token cost."
    564     },
    565     "fear_safety": {
    566       "score": 0,
    567       "justification": "No AI safety or risk concerns raised; the paper is purely about improving multi-agent system efficiency and accuracy on VQA benchmarks."
    568     },
    569     "drama_conflict": {
    570       "score": 1,
    571       "justification": "Challenges the trend of scaling multi-agent systems but is framed constructively as a problem-solution paper rather than a critique."
    572     },
    573     "demo_ability": {
    574       "score": 1,
    575       "justification": "Code is released on GitHub but three-stage PPO training requires substantial compute (H200 GPUs, 230k total training steps), preventing casual reproduction."
    576     },
    577     "brand_recognition": {
    578       "score": 1,
    579       "justification": "Authors from NUS, THU, FDU, and Shanghai AI Lab have moderate regional recognition; no involvement from major Western AI labs (OpenAI, Google, Meta, Anthropic)."
    580     }
    581   },
    582   "hn_data": {
    583     "threads": [],
    584     "top_points": 0,
    585     "total_points": 0,
    586     "total_comments": 0
    587   }
    588 }

Impressum · Datenschutz