scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22676B)
      1 {
      2   "paper": {
      3     "title": "Scaling Laws for a Multi-Agent Reinforcement Learning Model",
      4     "authors": ["Oren Neumann", "Claudius Gros"],
      5     "year": 2022,
      6     "venue": "International Conference on Learning Representations",
      7     "arxiv_id": "2210.00849",
      8     "doi": "10.48550/arXiv.2210.00849"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Player strength in AlphaZero scales as a power law with neural network parameter count (exponent ~0.88) and with compute (exponent ~0.55) on Connect Four and Pentago. The scaling exponents are nearly identical across both games, suggesting possible universality classes. The optimal model size scales as a power law with compute, with an exponent derivable from the individual scaling laws. Larger models are more sample-efficient, outperforming smaller models given the same training data.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'All code and data used in our experiments are available online' with a GitHub link (https://github.com/OrenNeumann/AlphaZero-scaling-laws) in Section 1."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states all code and data are available online at the GitHub repository."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is described in the paper. They mention using OpenSpiel but do not specify library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. Hyperparameters are listed in Appendix A but there are no explicit instructions for reproducing experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The plots in Figures 2-5 show data spread across multiple training seeds. Elo scores are shown with visible variance across the 6 seeds per configuration."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal statistical significance tests are reported. Claims of scaling laws are based on visual log-linear fits without p-values or goodness-of-fit statistics."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Scaling exponents are reported precisely (Table 1: αN=0.88/0.87, αC=0.55/0.55, αopt_C=0.62/0.63) providing quantitative effect sizes for the scaling relationships."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 6 seeds per configuration were chosen, nor why specific numbers of model sizes were tested. The total pools of 1326 and 714 agents are stated but not justified."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Each training configuration is repeated 6 times with different random seeds (Section 4), and variance across seeds is visible in the figures. Appendix D shows averaged compute scaling."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Agents are benchmarked against a perfect game solver for Connect Four (Pons, 2015) and a near-optimal benchmark player for Pentago using a 4TB solver dataset (Irving, 2014)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Perfect/near-perfect game solvers are the strongest possible baselines. The paper also compares against prior scaling law results (Jones 2021, Kaplan et al. 2020)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is performed. The paper varies model size and compute but does not ablate components of the AlphaZero algorithm."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports Elo scores, playing strength (Bradley-Terry γ), expected game scores, and also examines test-set loss for both value and policy heads (Appendix F)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to this paper; it studies scaling laws of automated game-playing agents measured by Elo ratings."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Elo scores are computed from match outcomes between agents, which is inherently a held-out evaluation (agents play against each other at test time, not on training data). Appendix F uses a separate solver-generated test set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by game (Connect Four, Pentago, Oware in appendix), by scaling type (size, compute, optimal), and by training duration (Fig. 3)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Appendix E discusses Oware where large models fail to converge, with analysis of possible causes (integer vs boolean input format). Section E.4 discusses training issues."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Oware's large model failures are reported (Appendix E). Appendix F shows that test-set loss is NOT a good predictor of performance scaling, a negative result."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "All abstract claims (power-law scaling with parameters, compute, optimal size, sample efficiency) are supported by corresponding results in Sections 5.1-5.5 with figures and numerical exponents."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims about scaling relationships. These are supported by controlled experiments varying individual factors (model size with fixed compute, compute with optimal sizing), with 6 seeds per configuration."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is careful to bound claims to Connect Four and Pentago, noting Oware shows different behavior (Appendix E) and that extrapolation to Go/Chess/Shogi is speculative (Section 5.4.1). The title says 'a Multi-Agent Reinforcement Learning Model', not 'all MARL'."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5.3 discusses why Hex exponents differ (game types, architecture choices). Section 5.4.1 discusses why AlphaGo/AlphaZero fall below the curve (different architectures, game complexity). Section 6 notes hyperparameter tuning could affect exponents."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly relates Elo rating to playing strength via the Bradley-Terry model (Section 3.2), discusses when Elo fails (non-transitive dynamics; last paragraph of Appendix F), and notes that test loss is a poor proxy for performance."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The architecture is fully specified: MLP with 2 fully-connected hidden layers as torso, policy and value heads each with 1 hidden layer, width varied 4-256. Uses OpenSpiel's AlphaZero implementation (Lanctot et al., 2019)."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "This paper does not use language model prompting; it trains RL agents."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 3 in Appendix A provides a comprehensive list of all hyperparameters with values and descriptions, including MCTS parameters, learning rate, batch size, replay buffer, temperature, etc."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a standard AlphaZero training pipeline."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The data generation process is described: self-play with MCTS generates training data, replay buffer management is specified (size 2^16, reuse factor 10). Compute is defined as C = S·T·F·D (Section 5.3)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The Discussion (Section 6) mentions hyperparameter tuning as future work, and other limitations are scattered across the text rather than collected in one place."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats are discussed: hyperparameter tuning could change exponents (Section 6), Oware's training issues with non-boolean inputs (Section E.4), Elo's failure with non-transitive dynamics (Appendix F), and the architectural difference between MLPs and ResNets (Section 5.4.1)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5.4.1 explicitly states the scaling law 'might not accurately describe AlphaGo Zero and AlphaZero trained on Go, chess and shogi due to the differences between the games.' Section 6 notes that the significance of the results lies in the 'evidence of power-law scaling', not in the exact exponent values."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper states all code and data are available at the GitHub repository."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection is self-play: agents generate their own training data through MCTS-guided games. The process is described in Section 4 with all relevant parameters."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is generated through self-play simulations."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: self-play generates games → states stored in replay buffer → optimization steps → Elo calculation via BayesElo. Compute definition is explicit (Section 5.3)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is mentioned. The acknowledgments section thanks individuals for discussions but does not mention any funding."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Authors are from Institute for Theoretical Physics, Goethe University Frankfurt. Affiliations are clearly stated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funder is disclosed; appears to be unfunded academic research."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper trains RL agents from scratch via self-play; it does not evaluate a pre-trained model's capability on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — agents generate their own training data through self-play and are evaluated via matches against each other and solvers."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — no pre-trained model is evaluated on an existing benchmark. Agents are trained from scratch."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Appendix B discusses inference-time compute scaling in detail, examining performance under inference compute constraints with varying MCTS steps."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Training compute is defined explicitly as C = S·T·F·D (Section 5.3) and measured in FLOPs. Results are plotted as a function of compute budget in Fig. 4."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Each configuration is trained 6 times with different random seeds (Section 4), and the variance across seeds is visible in all figures."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4: 'repeating each training six times with different random seeds to ensure reproducibility (Henderson et al., 2018).'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "The paper explicitly avoids hyperparameter tuning: 'we specifically avoid hyperparameter tuning whenever possible, fixing most hyperparameters to the ones suggested in OpenSpiel's AlphaZero example code' (Section 4). Only temperature drop is varied."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper uses default OpenSpiel hyperparameters rather than selecting best configurations. For compute scaling, the Pareto front approach is well-defined. Appendix D confirms scaling holds for averaged (not cherry-picked) results."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors train and evaluate their own AlphaZero agents without discussing self-evaluation bias. However, the use of perfect game solvers as external benchmarks partially mitigates this."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "This is a central result of the paper. Figure 4 shows performance as a function of compute, and the optimal model size vs compute relationship is the main contribution (Fig. 1)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper discusses Elo's construct validity: Section 3.2 derives the relationship between Elo and playing strength, Appendix C tests for measurement bias, and Appendix F's last paragraph acknowledges Elo fails for non-transitive dynamics."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. All agents use the same AlphaZero algorithm with MCTS."
    337       }
    338     }
    339   },
    340   "claims": [
    341     {
    342       "claim": "Player strength scales as a power law with neural network parameter count (αN ≈ 0.88 for Connect Four, 0.87 for Pentago)",
    343       "evidence": "Figure 2 shows log-linear Elo scaling with model size for both games. Table 1 reports exponents. 1326 Connect Four agents and 714 Pentago agents trained with 6 seeds each.",
    344       "supported": "strong"
    345     },
    346     {
    347       "claim": "Player strength scales as a power law with compute when training optimally sized agents (αC ≈ 0.55 for both games)",
    348       "evidence": "Figure 4 shows log-linear Pareto front scaling with compute for both games. Table 1 reports identical exponents. Appendix D confirms with averaged (not best-of) results.",
    349       "supported": "strong"
    350     },
    351     {
    352       "claim": "Optimal model size scales as a power law with compute, with an exponent derivable from the other two scaling laws (αopt_C = αC/αN ≈ 0.62-0.63)",
    353       "evidence": "Figure 1 shows the predicted and observed scaling. The analytically derived exponent from Eq. 8 matches empirical data for both games.",
    354       "supported": "strong"
    355     },
    356     {
    357       "claim": "AlphaGo Zero and AlphaZero were substantially undersized relative to their training compute",
    358       "evidence": "Figure 1 right shows these models fall 'well below the optimal trend.' However, this extrapolates from Connect Four/Pentago MLPs to Go/Chess ResNets, which the authors acknowledge may not apply.",
    359       "supported": "weak"
    360     },
    361     {
    362       "claim": "Larger AlphaZero models are more sample efficient",
    363       "evidence": "Figure 5 shows larger models outperform smaller ones given the same training data for both games.",
    364       "supported": "strong"
    365     }
    366   ],
    367   "red_flags": [
    368     {
    369       "flag": "No formal goodness-of-fit statistics",
    370       "detail": "Scaling law fits are presented visually with log-linear trends but no R² values, residual analysis, or formal statistical tests for the power-law hypothesis vs alternatives."
    371     },
    372     {
    373       "flag": "Extrapolation to state-of-the-art models is weakly supported",
    374       "detail": "The claim that AlphaGo Zero and AlphaZero were undersized extrapolates from small games with MLPs to large games with ResNets. The authors acknowledge this but still present it prominently in Figure 1."
    375     }
    376   ],
    377   "cited_papers": [
    378     {
    379       "title": "Scaling Laws for Neural Language Models",
    380       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B Brown", "Benjamin Chess", "Rewon Child", "Scott Gray", "Alec Radford", "Jeffrey Wu", "Dario Amodei"],
    381       "year": 2020,
    382       "arxiv_id": "2001.08361",
    383       "relevance": "Foundational scaling laws paper for language models that this work extends to MARL."
    384     },
    385     {
    386       "title": "Training Compute-Optimal Large Language Models",
    387       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    388       "year": 2022,
    389       "arxiv_id": "2203.15556",
    390       "relevance": "Chinchilla paper recalculating optimal compute allocation; its scaling exponents are directly compared with those found here."
    391     },
    392     {
    393       "title": "Deep Reinforcement Learning that Matters",
    394       "authors": ["Peter Henderson", "Riashat Islam", "Philip Bachman", "Joelle Pineau", "Doina Precup", "David Meger"],
    395       "year": 2018,
    396       "relevance": "Seminal paper on reproducibility in deep RL; motivated the 6-seed experimental design used here."
    397     },
    398     {
    399       "title": "Scaling Scaling Laws with Board Games",
    400       "authors": ["Andy L Jones"],
    401       "year": 2021,
    402       "arxiv_id": "2104.03113",
    403       "relevance": "Closest prior work showing compute scaling in AlphaZero on Hex; directly compared and extended."
    404     },
    405     {
    406       "title": "A Generalist Agent",
    407       "authors": ["Scott Reed", "Konrad Zolna", "Emilio Parisotto"],
    408       "year": 2022,
    409       "arxiv_id": "2205.06175",
    410       "relevance": "Gato multi-task agent showing performance scaling with model size across diverse tasks."
    411     },
    412     {
    413       "title": "Multi-Game Decision Transformers",
    414       "authors": ["Kuang-Huei Lee", "Ofir Nachum", "Mengjiao Yang"],
    415       "year": 2022,
    416       "arxiv_id": "2205.15241",
    417       "relevance": "Applies Transformer architecture to Atari games with model-size scaling analysis."
    418     },
    419     {
    420       "title": "Language Models are Few-Shot Learners",
    421       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    422       "year": 2020,
    423       "relevance": "GPT-3 paper demonstrating the practical impact of scaling laws in creating large models."
    424     },
    425     {
    426       "title": "Deep Learning Scaling is Predictable, Empirically",
    427       "authors": ["Joel Hestness", "Sharan Narang", "Newsha Ardalani"],
    428       "year": 2017,
    429       "arxiv_id": "1712.00409",
    430       "relevance": "Early observation of power-law scaling across ML architectures."
    431     },
    432     {
    433       "title": "Energy and Policy Considerations for Deep Learning in NLP",
    434       "authors": ["Emma Strubell", "Ananya Ganesh", "Andrew McCallum"],
    435       "year": 2019,
    436       "arxiv_id": "1906.02243",
    437       "relevance": "Discusses environmental and access implications of scaling trends; cited re: small-lab concerns."
    438     }
    439   ]
    440 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs