scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20784B)
      1 {
      2   "paper": {
      3     "title": "Heterogeneous Multi-Agent Reinforcement Learning for Zero-Shot Scalable Collaboration",
      4     "authors": ["Xudong Guo", "Daming Shi", "Junjie Yu", "Wenhui Fan"],
      5     "year": 2024,
      6     "venue": "Neurocomputing",
      7     "arxiv_id": "2404.03869",
      8     "doi": "10.48550/arXiv.2404.03869"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SHPPO integrates heterogeneity into parameter-shared MARL via a latent network that generates agent-specific heterogeneous layer parameters. The method achieves competitive or superior performance to HAPPO and MAPPO on SMAC and GRF tasks, and demonstrates substantially better zero-shot scalability when transferred to unseen scenarios with different agent populations. Ablation studies confirm the contributions of latent learning, the inference network, and the entropy/diversity losses.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code release is mentioned anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available environments (SMAC and GRF), both standard public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions training on NVIDIA V100 GPUs but does not provide a requirements file, library versions, or detailed environment setup."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Tables I, II, and III report ± values (e.g., '71.2±6.5'), and figures show confidence intervals calculated over 5 seeds."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims SHPPO 'outperforms' baselines based on comparing mean values with standard deviations, but no statistical significance tests (p-values, t-tests, etc.) are reported."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results are reported as win/score rates with absolute values and baselines (e.g., '85.5±5.8 vs 81.0±10.5'), providing context for the magnitude of improvement."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Five seeds are used but no justification is given for why 5 seeds is sufficient. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations across 5 seeds are reported in all results tables (e.g., '71.2±6.5')."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against HAPPO, HAPPO (share), MAPPO (share), and HATRPO (share) across all tasks."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include HAPPO (2021), MAPPO (2022), and HATRPO (2021), which are contemporary and competitive methods in cooperative MARL."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section V-E provides ablation studies on LatentNet inputs (masking with zeros, removing entirely) and losses (removing Lv, Le, Ld individually), shown in Fig. 7."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports both win/score rate and training reward as evaluation metrics (Table I)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant for MARL benchmark performance evaluation on game environments."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The scalability tests use unseen tasks with different population sizes that the models were never trained on, serving as held-out evaluation scenarios."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per task (MMM2, 8m_vs_9m, 3_vs_1_with_keeper, counterattack_easy) and per scalability scenario in Tables II and III."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses cases where SHPPO is not optimal (e.g., tasks 821_831 and 731_831 where MAPPO performs better due to increased homogeneity) and explains why."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Ablation studies show performance degradation when removing components. The paper also notes SHPPO is not best on every scalability task (e.g., 821_831, 731_831)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims superior performance on SMAC and GRF and enhanced zero-shot scalability, both supported by Tables I-III and Figures 4-6."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims about component contributions (e.g., 'the InferenceNet is able to assess the value of latent variables') are supported by controlled ablation studies in Section V-E that remove individual components."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Zero-Shot Scalable Collaboration' broadly, but results are only on SMAC and GRF. The conclusion acknowledges future work but the title/abstract overgeneralize beyond the tested environments."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, the parameter generation mechanism could be acting as regularization rather than heterogeneity per se, but this is not considered."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper directly measures win rate and score rate on game environments, which are the actual outcomes of interest for MARL collaboration. No proxy gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "This paper trains its own RL models from scratch; it does not use pre-trained LLMs or APIs where version specification would be relevant."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting; it trains neural network policies via reinforcement learning."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix Tables IV and V provide detailed network configurations (MLP dim, RNN hidden dim, latent variable dim, activation, optimizer) and hyperparameters (learning rates, loss weights, discount factor, clip, GAE lambda, etc.)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used; this is a standard MARL training and evaluation setup."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A documents the environment modifications (limiting observation range to closest N enemies/allies) used to create scalable task variants."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The 'Note to Practitioners' mentions the sim-to-real gap briefly, but there is no substantive limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The brief mention of 'disparity between virtual training environments and real-world scenarios' is generic."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what it does NOT show. It does not bound its claims to the specific environments tested, the types of multi-agent tasks covered, or the scalability range evaluated."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (training logs, per-seed results) is made available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data collection is inherent to the RL training process, which is well-described: environments (SMAC, GRF), training steps (20M for SMAC, 25M for GRF), evaluation intervals, and evaluation episodes (40 per evaluation)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; data comes from standard RL benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Algorithm 2 documents the full training pipeline: trajectory collection, buffer storage, sequential agent updates, and network parameter updates."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are disclosed as being with the Department of Automation, Tsinghua University, with email addresses provided."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper trains its own RL models from scratch on game environments; it does not evaluate a pre-trained model's capability on benchmarks."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Models are trained from scratch on game environments, so benchmark contamination from pre-training is not applicable."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated, so there is no pre-training data that could overlap with the benchmarks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or latency is reported for the method."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table VI reports training time per task (e.g., MMM2: 25.5±0.2 hours) on NVIDIA V100 GPUs."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Results are reported across 5 random seeds with standard deviations (e.g., '71.2±6.5', 'confidence interval is calculated over 5 seeds')."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper states '5 seeds' and 'evaluate times: 40' (40 evaluation episodes per evaluation point)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Hyperparameters are reported in Tables IV-V but no information is given about how they were selected or how many configurations were tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No discussion of how the reported hyperparameter configuration was selected."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so no correction for multiple comparisons is applied despite many pairwise comparisons across tasks and methods."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement SHPPO and compare against their own implementations of baselines without acknowledging potential bias from re-implementation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No comparison of compute budgets across methods. SHPPO has additional networks (LatentNet, InferenceNet) but the parameter/compute overhead relative to baselines is not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether SMAC and GRF adequately measure the claimed 'zero-shot scalable collaboration' capability or whether these benchmarks have known limitations."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved; all methods are trained and evaluated in the same environments with the same RL pipeline."
    337       }
    338     }
    339   },
    340   "claims": [
    341     {
    342       "claim": "SHPPO achieves superior or competitive performance compared to HAPPO and MAPPO on original SMAC and GRF tasks.",
    343       "evidence": "Table I shows SHPPO win/score rates: MMM2 71.2±6.5 (vs HAPPO 76.3±5.1), 8m_vs_9m 85.5±5.8 (vs HAPPO 81.0±10.5), 3_vs_1_with_keeper 94.2±2.1 (vs HAPPO 90.8±2.4), counterattack_easy 91.2±3.0 (vs HAPPO 92.0±4.1).",
    344       "supported": "strong"
    345     },
    346     {
    347       "claim": "SHPPO demonstrates better zero-shot scalability than baselines when transferred to unseen tasks with varied agent populations.",
    348       "evidence": "Tables II and III show SHPPO performs best on most unseen tasks. On SMAC 8m_vs_9m transfers: 6m_vs_7m 17.5±10.4 (vs HAPPO 2.5±1.2), 10m_vs_11m 70.2±20.1 (vs HAPPO 0.0±0.0). HAPPO fails badly on transfers.",
    349       "supported": "strong"
    350     },
    351     {
    352       "claim": "The latent variables learn meaningful heterogeneous strategy patterns that adapt during tasks.",
    353       "evidence": "Figures 5 and 6 visualize latent variables clustered by strategy type (attack, cover, attract fire) at different task stages, showing temporal adaptation.",
    354       "supported": "moderate"
    355     },
    356     {
    357       "claim": "Each component of the latent learning (InferenceNet Lv, entropy Le, distance Ld) contributes to performance.",
    358       "evidence": "Fig. 7 ablation studies on MMM2 and 8m_vs_9m show performance drops when removing each loss term or the LatentNet inputs.",
    359       "supported": "strong"
    360     }
    361   ],
    362   "red_flags": [
    363     {
    364       "flag": "No statistical significance tests",
    365       "detail": "All comparative claims ('outperforms', 'superior') rest on comparing means and standard deviations, with no formal statistical tests. Many comparisons have overlapping error bars (e.g., SHPPO 71.2±6.5 vs HAPPO 76.3±5.1 on MMM2)."
    366     },
    367     {
    368       "flag": "No limitations section",
    369       "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries despite making broad claims about scalable collaboration."
    370     },
    371     {
    372       "flag": "Selective transfer protocol for HAPPO",
    373       "detail": "When transferring HAPPO to new tasks, the authors 'randomly select a model of the same type for the added agent.' This disadvantages HAPPO in the comparison — a more sophisticated transfer strategy might perform better."
    374     }
    375   ],
    376   "cited_papers": [
    377     {
    378       "title": "The surprising effectiveness of ppo in cooperative multi-agent games",
    379       "authors": ["C. Yu", "A. Velu", "E. Vinitsky", "J. Gao", "Y. Wang", "A. Bayen", "Y. Wu"],
    380       "year": 2022,
    381       "relevance": "Key MARL baseline (MAPPO) demonstrating effectiveness of PPO in multi-agent settings."
    382     },
    383     {
    384       "title": "Trust region policy optimisation in multi-agent reinforcement learning",
    385       "authors": ["J. G. Kuba", "R. Chen", "M. Wen", "Y. Wen", "F. Sun", "J. Wang", "Y. Yang"],
    386       "year": 2021,
    387       "arxiv_id": "2109.11251",
    388       "relevance": "HAPPO/HATRPO — heterogeneous MARL baseline with monotonic improvement guarantees."
    389     },
    390     {
    391       "title": "Scaling multi-agent reinforcement learning with selective parameter sharing",
    392       "authors": ["F. Christianos", "G. Papoudakis", "M. A. Rahman", "S. V. Albrecht"],
    393       "year": 2021,
    394       "relevance": "SePS method for scaling MARL via selective parameter sharing, directly related to scalability challenge."
    395     },
    396     {
    397       "title": "Generative agents: Interactive simulacra of human behavior",
    398       "authors": ["J. S. Park", "J. O'Brien", "C. J. Cai", "M. R. Morris", "P. Liang", "M. S. Bernstein"],
    399       "year": 2023,
    400       "relevance": "Generative multi-agent systems using LLMs, relevant to agentic AI and multi-agent collaboration."
    401     },
    402     {
    403       "title": "Grandmaster level in StarCraft II using multi-agent reinforcement learning",
    404       "authors": ["O. Vinyals"],
    405       "year": 2019,
    406       "relevance": "Landmark MARL achievement demonstrating multi-agent RL in complex game environments."
    407     },
    408     {
    409       "title": "Multi-agent actor-critic for mixed cooperative-competitive environments",
    410       "authors": ["R. Lowe", "Y. I. Wu", "A. Tamar", "J. Harb", "P. Abbeel", "I. Mordatch"],
    411       "year": 2017,
    412       "relevance": "MADDPG — foundational CTDE framework for multi-agent reinforcement learning."
    413     },
    414     {
    415       "title": "Multi-agent reinforcement learning is a sequence modeling problem",
    416       "authors": ["M. Wen", "J. Kuba", "R. Lin", "W. Zhang", "Y. Wen", "J. Wang", "Y. Yang"],
    417       "year": 2022,
    418       "relevance": "Transformer-based MARL approach relevant to scaling and heterogeneous agent coordination."
    419     },
    420     {
    421       "title": "Celebrating diversity in shared multi-agent reinforcement learning",
    422       "authors": ["C. Li", "T. Wang", "C. Wu", "Q. Zhao", "J. Yang", "C. Zhang"],
    423       "year": 2021,
    424       "relevance": "CDS method for encouraging diversity in shared MARL, directly related to heterogeneity problem."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs