ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20544B)


      1 {
      2   "paper": {
      3     "title": "Cross-Modal Memory Compression for Efficient Multi-Agent Debate",
      4     "authors": ["Jing Wu", "Yue Sun", "Tianpei Xie", "Suiyao Chen", "Jingyuan Bao", "Yaopengxiao Xu", "Gaoyuan Du", "Inseok Heo", "Alexander Gutfraind", "Xin Wang"],
      5     "year": 2025,
      6     "venue": "ICML 2025 (submission)"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL or code archive is provided in the paper. Appendix B describes implementation details but provides no link to source code."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper uses publicly available benchmarks: GSM8K, MATH, and GPQA. No proprietary data was collected."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Appendix B mentions AdamW optimizer, learning rate 1e-4, batch size 64, and adapter dimensions, but no requirements.txt, Dockerfile, or library versions are provided. Hardware is mentioned only as 'single NVIDIA A100 GPU' for inference timing."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. Appendix B and C describe configuration details but not how to reproduce results end-to-end."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "All results in Tables 1-3 are reported as point estimates with no confidence intervals, error bars, or ± notation."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims DebateOCR outperforms baselines across models and datasets but provides no statistical significance tests."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper reports percentage improvements with baseline context, e.g., '92.4% and 75.0% reductions' in tokens, '2.25× and 2.38× speedups', and accuracy differences between methods in Table 1."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification for why these three benchmarks or their standard test set sizes are sufficient. No power analysis."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No standard deviation, variance, or multi-run results are reported. All results appear to be single-run numbers."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Two baselines are compared: T-MAD (text-based multi-agent debate) and TS-MAD (text-based with summarization), described in Section 5.1.2."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The baselines represent current standard approaches to multi-agent debate (Du et al. 2023, Khan et al. 2024, Chen et al. 2024a, Liu et al. 2025). These are contemporary methods."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 5.3 provides ablation studies: image resolution vs. accuracy/tokens (Table 2) and comparison of vision encoders (Table 3). Appendix D provides scaling analysis across agent counts and debate rounds."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Three metrics are reported: accuracy, token consumption, and inference time (Section 5.1.3, Table 1)."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "The paper evaluates automated reasoning benchmarks with exact-match accuracy. Human evaluation is not relevant to these claims."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Standard test splits are used: GSM8K test set (1,319 problems), MATH test set (5,000 problems), GPQA (448 questions). These are separate from training data described in Appendix B."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down per model (4 models) and per dataset (3 datasets) in Table 1, and per resolution in Table 2."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper notes that on GPQA with InternVL-8B, summarization slightly outperforms DebateOCR (23.3% vs 22.8%) and discusses why: 'extractive summaries better retain domain-specific terminology for graduate-level questions.'"
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The GPQA/InternVL-8B case where DebateOCR underperforms is reported. Table 2 shows lower resolutions degrade accuracy (224x224 yields 71.2% vs 76.3% at 1024x1024)."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims >92% token reduction and substantially lower compute cost — Table 1 shows a 92.4% reduction on InternVL-8B/GSM8K and consistent reductions across all settings. Accuracy claims are supported by Table 1 results."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper backs its causal claims with ablation studies that remove or vary components (resolution in Table 2, encoder choice in Table 3). The discussion in Section 5.2 explains accuracy gains through artifact removal, supported by the theoretical framework."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title says 'Multi-Agent Debate' generally but results are only on mathematical/scientific reasoning with 4 open-source VLMs in the 7-12B range. No evaluation on non-STEM tasks, closed-source models, or larger models. The conclusion claims 'visual representations offer a practical and efficient alternative to text-based communication in multi-agent systems' without bounding to the tested domain."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper offers one explanation for accuracy gains (artifact removal via information bottleneck theory) but does not consider alternatives such as: the vision encoder acting as a regularizer, the adapter's training data distribution effects, or whether majority voting dynamics change with compressed inputs."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "Models are listed as 'Qwen2.5-VL-7B-Instruct', 'Llama-3.2-11B-Vision', 'InternVL-8B', 'Pixtral-12B'. While some include size, none specify exact checkpoint versions, snapshot dates, or commit hashes."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Appendix C provides full prompt templates for all three prompt strategies (vision-augmented, pure text, text+summary) with actual template text and mechanism descriptions."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Appendix B reports learning rate (1e-4), batch size (64), adapter dimensions (dh=d=4096), rendering resolution (1024x1024), font size (12), line spacing (1.2). Section 5.1 reports K=3 agents, R=5 rounds, max 1024 tokens per response."
    142       },
    143       "scaffolding_described": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The multi-agent debate framework is described in detail: Sections 3.3-3.4 cover the SAM-CLIP-adapter pipeline, text-to-image rendering, vision encoding, and context injection. The debate protocol (majority voting, round structure) is specified."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix B documents training data sources (~85,000 samples from GSM8K, MATH, MathQA, MMLU-STEM, SQuAD, NaturalQuestions) and rendering configuration. Test set sizes and evaluation procedures are specified in Section 5.1."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No threats to validity are discussed anywhere in the paper."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. The conclusion generalizes broadly to 'multi-agent systems' without bounding its claims to mathematical/scientific reasoning or the tested model sizes."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw experimental data (per-example predictions, debate traces, intermediate outputs) is made available."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Training data sources are listed in Appendix B. Evaluation uses standard public benchmarks with specified test splits (Section 5.1.1)."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. All data comes from standard benchmarks."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The pipeline from text rendering to vision encoding to context injection is documented in Sections 3.3-3.4 and Appendix B. Training procedure and evaluation pipeline are described."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "Author affiliations are not listed in the paper text (likely anonymized for review)."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information disclosed, so independence cannot be assessed."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial disclosure statement is present."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No training data cutoff dates are stated for any of the four models used. The adapter is trained on public datasets but the base models' training data provenance is not discussed."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No discussion of whether GSM8K, MATH, or GPQA test examples could have been in the base models' training data."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "GSM8K (2021) and MATH (2021) are widely known benchmarks that could be in training data of models released in 2024-2025. This contamination risk is not addressed."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Table 1 reports inference time per sample in seconds for all methods. Token consumption is reported in thousands. Section 5.1.3 defines how these are measured."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "Training compute for the adapter is not quantified (no GPU hours, training time, or total cost). Only inference is measured on 'a single NVIDIA A100 GPU.'"
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "DebateOCR cuts input tokens by more than 92% compared to text-based multi-agent debate.",
    285       "evidence": "Table 1 shows token reduction from 59.2K to 4.5K on InternVL-8B/GSM8K (92.4% reduction). Similar reductions across all model-dataset combinations.",
    286       "supported": "strong"
    287     },
    288     {
    289       "claim": "DebateOCR achieves the best or competitive accuracy across most settings while dramatically reducing tokens.",
    290       "evidence": "Table 1 shows DebateOCR achieves highest accuracy in 11 of 12 model-dataset combinations. One exception: GPQA with InternVL-8B where TS-MAD achieves 23.3% vs 22.8%.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "Visual compression achieves 2.25× inference speedup over text-based debate.",
    295       "evidence": "Table 1 shows inference time reductions, e.g., InternVL-8B/GSM8K: 64.7s vs 145.7s (2.25×). Consistent across models.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Multi-agent aggregation enables compressed histories to approach the information bottleneck with exponentially high probability.",
    300       "evidence": "Theorem 4.2 with full proof in Appendix A. The theoretical result relies on assumptions (A.3-A.5) that are discussed but not rigorously validated empirically.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "The method generalizes effectively across diverse MLLMs and scales linearly with debate rounds and agent count.",
    305       "evidence": "Table 1 shows results across 4 models. Appendix D shows scaling analysis across 2-8 agents and 1-8 rounds. However, all models are open-source VLMs in the 7-12B range.",
    306       "supported": "moderate"
    307     }
    308   ],
    309   "methodology_tags": ["benchmark-eval"],
    310   "key_findings": "DebateOCR replaces textual debate histories with compact image representations in multi-agent debate systems, achieving >92% token reduction with competitive or improved accuracy across GSM8K, MATH, and GPQA benchmarks using four open-source VLMs (7-12B parameters). The method provides 2.25× inference speedup over text-based debate. A theoretical analysis based on information bottleneck theory explains why compression can maintain accuracy through multi-agent diversity. Ablation studies show 1024×1024 resolution offers the best accuracy-efficiency tradeoff.",
    311   "red_flags": [
    312     {
    313       "flag": "No error bars or multi-run variance",
    314       "detail": "All results appear to be single-run numbers with no standard deviation, confidence intervals, or repeated trials. Multi-agent debate outcomes can vary with sampling, so this is a significant omission."
    315     },
    316     {
    317       "flag": "No limitations section",
    318       "detail": "The paper has no dedicated limitations or threats-to-validity section. Known limitations such as the restriction to mathematical/scientific reasoning, small open-source models, and potential contamination are not discussed."
    319     },
    320     {
    321       "flag": "Benchmark contamination risk unaddressed",
    322       "detail": "GSM8K and MATH were published in 2021 and are widely used. Models released in 2024-2025 may have trained on these benchmarks. This risk is not discussed, and it could affect all methods equally or differentially depending on how compression interacts with memorized solutions."
    323     },
    324     {
    325       "flag": "Overbroad generalization claims",
    326       "detail": "The conclusion claims the approach works for 'multi-agent systems' generally, but evaluation is limited to mathematical/scientific reasoning with 7-12B open-source VLMs. No evaluation on coding, creative, or language tasks."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "Improving factuality and reasoning in language models through multiagent debate",
    332       "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"],
    333       "year": 2023,
    334       "relevance": "Foundational multi-agent debate paper that DebateOCR builds upon and compares against."
    335     },
    336     {
    337       "title": "Debating with more persuasive LLMs leads to more truthful answers",
    338       "authors": ["A. Khan", "J. Hughes", "D. Valentine", "L. Ruis", "K. Sachan"],
    339       "year": 2024,
    340       "arxiv_id": "2402.06782",
    341       "relevance": "Key multi-agent debate paper used as baseline approach (T-MAD)."
    342     },
    343     {
    344       "title": "Why do multi-agent LLM systems fail?",
    345       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    346       "year": 2025,
    347       "arxiv_id": "2503.13657",
    348       "relevance": "Analysis of failure modes in multi-agent LLM systems, directly relevant to survey scope."
    349     },
    350     {
    351       "title": "ChatEval: Towards better LLM-based evaluators through multi-agent debate",
    352       "authors": ["C.-M. Chan", "W. Chen", "Y. Su"],
    353       "year": 2023,
    354       "arxiv_id": "2308.07201",
    355       "relevance": "Multi-agent debate applied to LLM evaluation, relevant to agentic workflows."
    356     },
    357     {
    358       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    359       "authors": ["S. Hong", "X. Zheng", "J. Chen"],
    360       "year": 2023,
    361       "arxiv_id": "2308.00352",
    362       "relevance": "Multi-agent collaboration framework relevant to agentic AI systems."
    363     },
    364     {
    365       "title": "Encouraging divergent thinking in large language models through multi-agent debate",
    366       "authors": ["T. Liang", "Z. He", "W. Jiao"],
    367       "year": 2023,
    368       "arxiv_id": "2305.19118",
    369       "relevance": "Multi-agent debate methodology for improving LLM reasoning diversity."
    370     },
    371     {
    372       "title": "Reconcile: Round-table conference improves reasoning via consensus among diverse LLMs",
    373       "authors": ["J. Chen", "S. Saha", "M. Bansal"],
    374       "year": 2024,
    375       "relevance": "Multi-agent consensus approach for improving LLM reasoning, a baseline methodology."
    376     },
    377     {
    378       "title": "Let models speak ciphers: Multiagent debate through embeddings",
    379       "authors": ["C. Pham", "B. Liu", "Y. Yang"],
    380       "year": 2023,
    381       "arxiv_id": "2310.06272",
    382       "relevance": "Alternative approach to multi-agent debate communication using embeddings rather than text."
    383     },
    384     {
    385       "title": "Multiagent finetuning: Self improvement with diverse reasoning chains",
    386       "authors": ["V. Subramaniam", "Y. Du", "J. B. Tenenbaum"],
    387       "year": 2025,
    388       "arxiv_id": "2501.05707",
    389       "relevance": "Multi-agent approach to LLM self-improvement through diverse reasoning."
    390     },
    391     {
    392       "title": "Examining inter-consistency of large language models collaboration: An in-depth analysis via debate",
    393       "authors": ["K. Xiong", "X. Ding", "Y. Cao"],
    394       "year": 2023,
    395       "arxiv_id": "2305.11595",
    396       "relevance": "Analysis of LLM collaboration consistency in debate settings."
    397     }
    398   ]
    399 }

Impressum · Datenschutz