scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24404B)
      1 {
      2   "paper": {
      3     "title": "BenchMARL: Benchmarking Multi-Agent Reinforcement Learning",
      4     "authors": [
      5       "Matteo Bettini",
      6       "Amanda Prorok",
      7       "Vincent Moens"
      8     ],
      9     "year": 2023,
     10     "venue": "arXiv preprint (under review)",
     11     "arxiv_id": "2312.01472"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The abstract explicitly provides the GitHub URL: https://github.com/facebookresearch/BenchMARL. The library is described as open-sourced."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The experiments use publicly available environments (VMAS, SMACv2, MPE, SISL, MeltingPot). Interactive benchmark results are published on Wandb, and fine-tuned configurations are in the repository's conf folder. No proprietary data is involved."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper does not provide a requirements.txt, Dockerfile, or detailed dependency/version listing. It mentions TorchRL, Hydra, and PyTorch as dependencies but does not specify library versions or provide an environment specification file."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 3.2 describes how to run benchmarks from one-line command inputs using Hydra. The paper states fine-tuned hyperparameters are in the fine_tuned/vmas folder in the repository, and references examples and notebooks. The execution diagram (Fig. 2) shows the workflow."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Fig. 4 and Fig. 5 explicitly report '95% stratified bootstrap confidence intervals over 3 random seeds for each experiment.' Shaded regions are visible in the sample efficiency curves."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "While the paper uses stratified bootstrap confidence intervals, it does not perform formal significance tests (e.g., p-values, t-tests) when comparing algorithms. Claims like 'MA versions obtain the best overall performance' are made based on visual inspection of plots and aggregate scores without statistical tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper reports aggregate scores (median, IQM, mean, optimality gap) with confidence intervals in Fig. 4c, but does not report effect sizes (e.g., Cohen's d) for pairwise algorithm comparisons. Differences between algorithms are discussed qualitatively rather than quantified."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Experiments use 3 random seeds per algorithm-task combination. No justification is given for why 3 seeds is sufficient, and no power analysis is discussed. Three seeds is a small number for reliable statistical inference in RL."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports 95% stratified bootstrap confidence intervals over 3 random seeds, which provides a measure of spread. The IQM with confidence intervals across seeds is shown in all experimental plots (Figs. 4 and 5)."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares 9 different MARL algorithms against each other (MAPPO, IPPO, MADDPG, IDDPG, MASAC, ISAC, QMIX, VDN, IQL), serving as mutual baselines. This is appropriate for a benchmarking paper."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The algorithms include contemporary methods (MAPPO 2021, MASAC/ISAC as novel implementations) alongside established baselines (MADDPG 2017, QMIX 2018, VDN 2017, IQL 1993). The mix of recent and classic algorithms is appropriate for establishing a benchmark."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is provided. The paper does not ablate library components (e.g., parameter sharing, vectorized training, different model architectures). The experiments only compare algorithms, not the design choices of BenchMARL itself."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Fig. 4 reports multiple aggregate metrics: median, IQM (inter-quartile mean), mean, and optimality gap. Performance profiles and sample efficiency curves are also provided."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant for a MARL benchmarking library. The outputs are algorithmic performance metrics in simulation environments, which are fully automated."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is a reinforcement learning benchmarking paper, not a supervised learning paper. There is no train/test split concept in the traditional sense -- agents are trained and evaluated in simulation environments."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Fig. 5 provides individual task results (Sampling, Navigation, Balance) alongside the aggregate results in Fig. 4. This allows readers to see how algorithms perform on each task separately."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4 discusses why Q-Learning algorithms (IQL, VDN, QMIX) 'perform suboptimally compared to the actor-critic ones,' attributing this to the discretization of actions in continuous control tasks. This is a substantive discussion of where certain approaches fail."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that Q-Learning algorithms underperform actor-critic methods on the VMAS tasks, and discusses the likely reason (discretized actions in continuous control). This is a negative result for those algorithm classes on these tasks."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims BenchMARL is 'the first MARL training library created to enable standardized benchmarking across different algorithms, models, and environments.' The paper supports this by describing the library architecture and demonstrating it experimentally. The abstract does not make performance claims beyond the library's capabilities."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper's main causal claim is that MA versions (centralized critics) outperform independent versions due to conditioning on global state. This is supported by the controlled comparison where the only difference is centralization (e.g., MAPPO vs IPPO, MADDPG vs IDDPG, MASAC vs ISAC), which constitutes a reasonable ablation-like design."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The experiments only cover 3 VMAS tasks (Navigation, Sampling, Balance) out of 27 available in VMAS and many more across other environments. The paper does not explicitly bound its conclusions to these 3 tasks. Statements like 'MA versions of all algorithms obtain the best overall performance' could be read as general, though they are presented in the context of the VMAS benchmark."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper offers one explanation for Q-Learning underperformance (discrete actions in continuous domains) but does not consider alternative explanations such as hyperparameter tuning differences, training budget differences, or other algorithmic factors."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "This paper does not use LLMs or pre-trained language models. It trains RL agents from scratch using neural networks (MLP, GNN, CNN, LSTM, GRU). Model architecture details are in Table 3."
    137       },
    138       "prompts_provided": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "This paper does not use prompting. It is a reinforcement learning paper where agents learn through environment interaction, not language model prompting."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper states 'The experiment hyperparameters are available in the fine_tuned/vmas folder' of the repository. Default configurations are available in the conf folder. While not printed in the paper itself, they are accessible in the open-source repository."
    147       },
    148       "scaffolding_described": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No agentic scaffolding (in the LLM sense) is used. The paper describes a MARL training pipeline, not an LLM agent system."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3.1 (Tasks) describes how environments are processed: action masking, custom transforms for visual observations (storing images as integers, casting to float for neural networks), and the agent grouping mechanism. Section 4 specifies that default BenchMARL configurations were used."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "There is no dedicated limitations section. The paper has Introduction, Related Work, BenchMARL (system description), Experiments, and Conclusion. No limitations or threats to validity are discussed."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No threats to validity are discussed. The paper does not address potential issues such as hyperparameter sensitivity, limited task diversity (only 3 tasks tested), small number of seeds (3), or potential performance differences across environments."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper does not explicitly state what the benchmark results do NOT show. For instance, it does not acknowledge that results on 3 VMAS tasks may not generalize to other environments (SMACv2, MeltingPot, etc.) or to tasks with different characteristics."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper states 'An interactive version of these results is available here' (referring to Wandb), and fine-tuned configurations and results are in the public repository. This allows independent verification of the experimental results."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 4 describes the experimental setup: 9 algorithms on 3 VMAS tasks, using default BenchMARL configurations with fine-tuned hyperparameters from the conf folder. The results collection process (training runs with 3 seeds, evaluation metrics) is described."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants are involved. The data comes from automated RL experiments in simulation environments."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The execution diagram (Fig. 2) shows the full pipeline from configuration to results. Section 3.2 (Reporting) describes how experiment data is output in JSON format for the marl-eval reporting library, with integration to Wandb and other loggers."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The Acknowledgments section states: 'The majority of the work was done during an internship at PyTorch, Meta. This work was supported in part by European Research Council (ERC) Project 949940 (gAIa) and ARL DCIST CRA W911NF-17-2-0181.'"
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Author affiliations are clearly listed: Matteo Bettini (University of Cambridge, with footnote 'Work done during an internship at PyTorch, Meta'), Amanda Prorok (University of Cambridge), Vincent Moens (PyTorch, Meta)."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Meta/PyTorch is the developer of TorchRL, which BenchMARL is built on, so there is an affiliation interest. However, the paper primarily benchmarks MARL algorithms (not Meta products vs. competitors), and the ERC and ARL grants are independent academic funders. The funder does not have a direct financial interest in which MARL algorithm performs best."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests or financial interests statement is provided in the paper. While affiliations are disclosed, there is no explicit declaration of financial interests or conflicts."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "This paper trains RL agents from scratch in simulation environments. There is no pre-trained model with a training data cutoff to consider."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "RL agents interact with simulation environments in real-time; there is no pre-existing training corpus that could overlap with test data."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Benchmark contamination (in the LLM sense of training data containing test examples) is not applicable to RL agents trained from scratch in procedurally generated simulation environments."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants are involved in this study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper does not report training time, wall-clock time, or inference latency for the benchmark experiments. Given that the paper promotes the library for practical benchmarking, reporting cost per experiment would be relevant."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No GPU hours, hardware specifications, or total compute budget are reported for the experiments. The paper mentions SLURM and HPC cluster support but does not state what hardware was used for the reported results."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "BenchMARL is the first MARL training library created to enable standardized benchmarking across different algorithms, models, and environments.",
    290       "evidence": "The paper describes the library architecture (Sec. 3) with standardized components, configuration via Hydra, and integrated reporting tools. Comparison with related work (Sec. 2) argues other libraries lack this standardization.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "Multi-Agent (MA) versions of all algorithms (MASAC, MADDPG, MAPPO) obtain the best overall performance on VMAS tasks.",
    295       "evidence": "Fig. 4 shows aggregate scores with 95% confidence intervals. MA versions consistently rank higher in IQM, median, and mean metrics across 3 VMAS tasks with 3 seeds per experiment (Sec. 4).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Q-Learning algorithms (IQL, VDN, QMIX) perform suboptimally compared to actor-critic algorithms, potentially due to action discretization in continuous control tasks.",
    300       "evidence": "Figs. 4 and 5 show Q-Learning algorithms ranking lower. The explanation that discrete action spaces impede performance on continuous control tasks is plausible but not directly tested (Sec. 4).",
    301       "supported": "weak"
    302     },
    303     {
    304       "claim": "BenchMARL results match those reported by Bou et al. (2024) and Bettini et al. (2022).",
    305       "evidence": "Stated in Sec. 4: 'The obtained results match the ones reported by Bou et al. [2024] and Bettini et al. [2022].' No detailed comparison is provided.",
    306       "supported": "weak"
    307     }
    308   ],
    309   "methodology_tags": [
    310     "benchmark-eval"
    311   ],
    312   "key_findings": "BenchMARL provides a standardized MARL benchmarking framework built on TorchRL, supporting 9 algorithms, 5 environments (101 tasks), and 6 model architectures. Initial benchmarks on 3 VMAS tasks with 9 algorithms show that multi-agent versions with centralized critics (MAPPO, MADDPG, MASAC) consistently outperform their independent counterparts, while Q-Learning methods underperform on continuous control tasks. The library integrates statistically rigorous reporting tools from Gorsane et al. (2022) and outputs 95% stratified bootstrap confidence intervals.",
    313   "red_flags": [
    314     {
    315       "flag": "Very limited experimental scope",
    316       "detail": "Only 3 out of 101 available tasks across 5 environments are benchmarked. The paper supports environments like SMACv2, MPE, SISL, and MeltingPot but provides no experimental results for them, limiting the generalizability of the benchmark findings."
    317     },
    318     {
    319       "flag": "Only 3 random seeds",
    320       "detail": "All experiments use only 3 random seeds, which is small for reliable statistical inference in RL, where variance across seeds is known to be high. No justification is provided for this choice."
    321     },
    322     {
    323       "flag": "No limitations section",
    324       "detail": "The paper has no limitations section, no discussion of threats to validity, and no discussion of what the results do not show. This is a notable omission for a paper about benchmarking methodology and standardization."
    325     },
    326     {
    327       "flag": "No compute cost reported",
    328       "detail": "For a benchmarking library, the computational cost of running benchmarks is directly relevant to practitioners. No training time, hardware, or GPU hours are reported."
    329     },
    330     {
    331       "flag": "Potential conflict of interest",
    332       "detail": "BenchMARL is built on TorchRL (Meta/PyTorch), and one author is from Meta. The paper advocates for TorchRL as the backend, which could be seen as promoting Meta's ecosystem. However, the algorithmic comparisons themselves appear fair."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Torchrl: A data-driven decision-making library for pytorch",
    338       "authors": ["Albert Bou", "Matteo Bettini", "Sebastian Dittert", "Vikash Kumar", "Shagun Sodhani", "Xiaomeng Yang", "Gianni De Fabritiis", "Vincent Moens"],
    339       "year": 2024,
    340       "relevance": "Core backend library for BenchMARL; relevant as an example of standardized RL tooling and benchmarking infrastructure."
    341     },
    342     {
    343       "title": "Towards a standardised performance evaluation protocol for cooperative marl",
    344       "authors": ["Rihab Gorsane", "Omayma Mahjoub", "Ruan John de Kock", "Roland Dubb", "Siddarth Singh", "Arnu Pretorius"],
    345       "year": 2022,
    346       "relevance": "Proposes the reporting tools and statistical evaluation protocol that BenchMARL integrates, addressing the MARL reproducibility crisis."
    347     },
    348     {
    349       "title": "Deep reinforcement learning at the edge of the statistical precipice",
    350       "authors": ["Rishabh Agarwal", "Max Schwarzer", "Pablo Samuel Castro", "Aaron Courville", "Marc G Bellemare"],
    351       "year": 2021,
    352       "relevance": "NeurIPS Outstanding Paper proposing statistically rigorous evaluation methods for RL, directly used in BenchMARL's reporting."
    353     },
    354     {
    355       "title": "Benchmarking multi-agent deep reinforcement learning algorithms in cooperative tasks",
    356       "authors": ["Georgios Papoudakis", "Filippos Christianos", "Lukas Schäfer", "Stefano V. Albrecht"],
    357       "year": 2021,
    358       "relevance": "Prior MARL benchmarking effort (EPyMARL) that BenchMARL aims to improve upon and supersede."
    359     },
    360     {
    361       "title": "Marllib: A scalable and efficient multi-agent reinforcement learning library",
    362       "authors": ["Siyi Hu", "Yifan Zhong", "Minquan Gao", "Weixun Wang", "Hao Dong", "Xiaodan Liang", "Zhihui Li", "Xiaojun Chang", "Yaodong Yang"],
    363       "year": 2023,
    364       "relevance": "Competing MARL library built on RLlib; relevant for comparing approaches to standardized MARL benchmarking."
    365     },
    366     {
    367       "title": "Jaxmarl: Multi-agent rl environments and algorithms in jax",
    368       "authors": ["Alexander Rutherford", "Benjamin Ellis", "Matteo Gallici"],
    369       "year": 2024,
    370       "relevance": "Concurrent Jax-based MARL library, representing a complementary ecosystem for MARL benchmarking."
    371     },
    372     {
    373       "title": "The surprising effectiveness of ppo in cooperative, multi-agent games",
    374       "authors": ["Chao Yu", "Akash Velu", "Eugene Vinitsky", "Yu Wang", "Alexandre Bayen", "Yi Wu"],
    375       "year": 2021,
    376       "arxiv_id": "2103.01955",
    377       "relevance": "Introduces MAPPO, one of the key algorithms benchmarked, and provides evidence for PPO's effectiveness in MARL."
    378     },
    379     {
    380       "title": "Multi-agent actor-critic for mixed cooperative-competitive environments",
    381       "authors": ["Ryan Lowe", "Yi I Wu", "Aviv Tamar", "Jean Harb", "Pieter Abbeel", "Igor Mordatch"],
    382       "year": 2017,
    383       "relevance": "Introduces MADDPG, a foundational multi-agent algorithm benchmarked in BenchMARL."
    384     },
    385     {
    386       "title": "Smacv2: An improved benchmark for cooperative multi-agent reinforcement learning",
    387       "authors": ["Benjamin Ellis", "Skander Moalla", "Mikayel Samvelyan", "Mingfei Sun", "Anuj Mahajan", "Jakob N. Foerster", "Shimon Whiteson"],
    388       "year": 2022,
    389       "arxiv_id": "2212.07489",
    390       "relevance": "Key MARL benchmark environment supported by BenchMARL, representing discrete-action cooperative scenarios."
    391     },
    392     {
    393       "title": "Scalable evaluation of multi-agent reinforcement learning with melting pot",
    394       "authors": ["Joel Z. Leibo", "Edgar Duénez Guzmán", "Alexander Sasha Vezhnevets"],
    395       "year": 2021,
    396       "relevance": "Large-scale MARL evaluation environment (49 tasks) integrated into BenchMARL, testing cooperative and competitive scenarios."
    397     }
    398   ]
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs