scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28835B)
      1 {
      2   "paper": {
      3     "title": "Towards a Science of Scaling Agent Systems",
      4     "authors": ["Yubin Kim", "Ken Gu", "Chanwoo Park", "Chunjong Park", "Samuel Schmidgall", "A. Ali Heydari", "Yao Yan", "Zhihan Zhang", "Yuchen Zhuang", "Yun Liu", "Mark Malhotra", "Paul Pu Liang", "Hae Won Park", "Yuzhe Yang", "Xuhai Xu", "Yilun Du", "Shwetak Patel", "Tim Althoff", "Daniel McDuff", "Xin Liu"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.08296",
      8     "doi": "10.48550/arXiv.2512.08296"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Multi-agent system performance is highly task-dependent, ranging from +80.9% improvement on parallelizable financial reasoning tasks to -70% degradation on sequential planning tasks. A predictive mixed-effects model (R²=0.524) using coordination metrics achieves 87% accuracy in selecting optimal architectures for held-out configurations. Three dominant effects identified: tool-coordination trade-off, capability saturation above ~45% single-agent baseline, and architecture-dependent error amplification (17.2× for independent vs 4.4× for centralized). Out-of-sample validation on GPT-5.2 confirms four of five scaling principles generalize.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks: Finance-Agent, BrowseComp-Plus, PlanCraft, and WorkBench, all cited with references and publicly accessible."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using LiteLLM and LangChain (Appendix E) but does not provide requirements.txt, Dockerfile, or detailed dependency versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Appendix E describes the setup conceptually but not with executable instructions."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "95% confidence intervals are reported for regression coefficients (Table 4), bootstrap SEs, and error amplification factors (e.g., '95% CI: [14.3, 20.1]')."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "p-values reported throughout for regression coefficients (Table 4), t-tests for turn count differences (e.g., t(178)=16.8, p<0.001), Shapiro-Wilk and Breusch-Pagan tests for residual diagnostics."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Standardized regression coefficients reported (e.g., β̂=-0.267), Cohen's d referenced in Figure 6 caption (d>1.2, d≈0.35), and percentage improvements with baselines throughout (e.g., '+80.8% mean 0.631 vs. SAS 0.349')."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "N=180 configurations is stated but no justification for why this number is sufficient. Sample sizes per benchmark (50-100 instances) are stated in Appendix E.4 as 'balancing computational cost with statistical significance' but no power analysis is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations reported for turn counts (e.g., '7.2±2.1'), redundancy rates, cross-validation R² (±0.033 SD), MAE (±0.011), and coefficient of variation across families (CV<0.02)."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Single-Agent System (SAS) serves as the baseline throughout, with all MAS variants compared against it. Prior claims ('More agents is all you need') are directly contrasted."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines use frontier models (GPT-5, Gemini 2.5 Pro, Claude Sonnet 4.5) and recent benchmarks (BrowseComp-Plus 2025, Finance-Agent 2025). All are contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The five architectures (SAS, Independent, Centralized, Decentralized, Hybrid) form a structured ablation over orchestrator presence and peer communication (Section 4.1). Agent heterogeneity experiments in Figure 4 also serve as ablations."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics used: task success/accuracy, coordination efficiency (Ec), error amplification (Ae), message density, redundancy, overhead, success per 1K tokens, information gain, and cost per 1% gain."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of system outputs. All evaluation is performed by automated domain-specific validators; inter-rater reliability (Cohen's κ) is reported for them, but the raters are automated, not human."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Five-fold cross-validation with experiment-level holdout is used (Section 4.3). Out-of-sample validation on GPT-5.2 (released after the study) provides a truly held-out test (Appendix B)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by benchmark (Figure 2), by architecture, by LLM family (Figures 3, 4, 6), and by error category (Section 4.4). Table 4 shows per-predictor breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive failure analysis: PlanCraft degradation (-39% to -70%) with execution trace examples, error taxonomy (Section 4.4) with four error categories, MAST failure taxonomy applied, and architecture-specific failure modes documented."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Major negative results reported: PlanCraft universal degradation, independent MAS underperformance, capability saturation, overall mean MAS improvement of -3.5%. The paper's central thesis is that MAS often hurts performance."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about R²=0.524, 87% architecture prediction accuracy, error amplification ratios (17.2× vs 4.4×), +80.8% Finance Agent improvement, -39% to -70% PlanCraft degradation, and GPT-5.2 validation (MAE=0.071) are all supported with detailed results in Sections 4.2-4.4 and Appendix B."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly designs controlled experiments to enable causal attribution, controlling for prompts, tools, and computational budgets while varying only coordination structure and model capability (Section 4.1). The ablation-like structure of the five architectures supports causal claims about coordination mechanisms."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5 (Limitations) explicitly bounds generalization: four benchmarks may not capture full spectrum, scaling limited to 9 agents, all agents share base architectures, prompts not optimized per model. The title says 'Towards' a science, appropriately hedging."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses multiple alternative explanations: whether performance gains come from compute vs coordination (matched budgets address this), whether prompt sensitivity could affect results (Limitation v), and considers model-specific mechanisms for family differences (Section 4.2)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper is careful about measurement: task success metrics are domain-specific and explicitly defined (Section 4.1 Metrics), and the paper distinguishes between coordination metrics (proxies) and actual performance. The R²=0.524 is presented honestly as explaining 'more than half of performance variance.'"
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are referenced by marketing names only: 'GPT-5', 'GPT-5-mini', 'GPT-5-nano', 'Gemini 2.5 Pro', 'Claude Sonnet 4.5', etc. No API versions, snapshot dates, or specific model IDs (e.g., gpt-5-0613) are provided."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Section E.3 describes a prompt compilation system with YAML templates and variable interpolation, but no actual prompt text is provided. The paper says 'prompts are defined in YAML files' but does not include them."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix E.2 reports architecture parameters: max 10 iterations for SAS, 3 agents for MAS, 5 orchestration rounds, 3 debate rounds, 3 iterations per round. Section E.5 reports temperature τ=0.7 and K=10 for information gain computation. Token budget mean 4,800 stated in Section 4.4."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.1 formally defines each architecture with communication topology, orchestration policy, and complexity analysis. Table 2 provides detailed complexity metrics. Section E describes tools, API integration, and agent configuration in detail."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix D describes each dataset with instance counts. Section E.4 describes sample sizes and selection criteria. Metrics are defined with normalization procedures (Section 4.1). Token overlap analysis methodology described with BERTScore threshold of 0.3."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 'Limitations and Future Works' provides a dedicated section with six numbered limitation categories (i-vi)."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: (i) scaling limited to 9 agents, (ii) agents share same base architecture, (iii) tool-heavy environments as failure modes, (iv) prompts not optimized per model, (v) four benchmarks may not cover full spectrum, (vi) economic viability concerns."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5 explicitly states boundaries: does not include embodied agents, multi-user interaction, or long-horizon temporal dependencies. Agent scaling explored only up to 9. Only four benchmarks tested. Prompts not tuned per model family."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (execution traces, per-instance results, agent logs) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix D describes each benchmark dataset and its collection. Section 4.1 describes the experimental setup in detail: 180 configurations, 5 architectures × 9 models × 4 benchmarks."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All experiments are automated benchmark evaluations."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: benchmarks → agent configurations → API calls via LiteLLM/LangChain → domain-specific validators → coordination metrics computation → mixed-effects modeling. Section E provides implementation details."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding statement or acknowledgments section mentioning funding sources. The paper is marked '© 2025 Google. All rights reserved' but no explicit funding disclosure."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly listed: Google Research, Google DeepMind, and MIT. The superscripts map each author to their affiliation."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Most authors are from Google Research/DeepMind, and the paper evaluates Google's Gemini models alongside competitors. Google has a direct financial interest in the relative performance of its models and in the conclusions about multi-agent architectures."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure section. Given Google affiliation and evaluation of Google products, this is a notable omission."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the nine models evaluated. The paper discusses benchmark contamination conceptually (referencing Kapoor et al. 2025) but does not state cutoffs."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether any benchmark data could appear in the training sets of the evaluated models. Some benchmarks (PlanCraft, WorkBench) may predate the models' training cutoffs."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Despite referencing Kapoor et al. (2025) on benchmark evaluation pitfalls and discussing contamination conceptually for non-agentic tasks, the paper does not address contamination risk for its own four benchmarks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Cost analysis in Section 4.4: dollar costs per trial by model family (OpenAI ~$0.008, Anthropic ~$0.024, Google ~$0.012 per 1% gain), success per 1K tokens (SAS 67.7, Centralized 21.5, Hybrid 13.6), and Figure 3 shows cost-performance trade-offs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Token budgets stated: mean 4,800 tokens per trial matched across architectures (Section 4.4, Table 5). Total of 15,750 instance runs across 180 configurations."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be single-run per configuration, with variance coming from across-configuration comparisons rather than multiple seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "N=180 configurations stated, with 15,750 total instance runs. Per-benchmark instance counts given in Appendix E.4 (50-100 instances each). K=10 Monte Carlo samples stated for information gain."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported. Architecture parameters (iterations, rounds, agent counts) appear fixed without justification for specific values chosen."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "All 180 configurations are reported, not just the best. Table 4 shows all coefficients including non-significant ones. Cross-validation used for model selection (Table 3 compares model specifications)."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Table 4 reports 20 p-values from the regression model without correction for multiple comparisons. No Bonferroni, Holm, or other family-wise error rate correction mentioned."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implemented all five architectures and evaluate them. No discussion of self-comparison bias or acknowledgment that their implementations of baselines might not be optimal (per Lucic et al. 2018, which they don't cite)."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Token budgets explicitly matched across architectures (Section 4.1, 4.4). Figure 3 shows cost vs performance. Efficiency metric Ec normalizes success by computational cost. The paper explicitly discusses how fixed budgets affect per-agent reasoning capacity."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 3.2 extensively discusses what makes tasks 'agentic' vs non-agentic, formally defining agentic evaluation criteria. The paper questions whether non-agentic benchmarks give misleading guidance and chooses benchmarks satisfying three necessary conditions for agentic evaluation."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The paper's central design principle is controlling for scaffold confounds: 'identical task prompts, tools, and computational budgets across all configurations, while systematically varying only coordination structure and model capability' (Section 1, 4.1)."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether benchmark tasks existed before model training cutoffs. BrowseComp-Plus (2025) and Finance-Agent (2025) are recent, but PlanCraft (2024) and WorkBench (2024) could have been in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation setups leak information. For agentic tasks with tool use, this is less of a concern, but the paper does not address it."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training data and benchmark instances share structural similarities or overlap."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods applied despite evaluating pre-trained models on benchmarks."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Multi-agent systems show highly heterogeneous performance, ranging from +80.9% improvement (Finance Agent, Centralized) to -70.0% degradation (PlanCraft, Independent).",
    365       "evidence": "Section 4.2, Figure 2: detailed per-benchmark, per-architecture results with percentage changes vs SAS baseline. N=180 configurations across 4 benchmarks.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "A predictive mixed-effects model achieves cross-validated R²=0.524 and 87% accuracy in predicting optimal architectures for held-out configurations.",
    370       "evidence": "Section 4.3, Table 3: five-fold cross-validation with experiment-level holdout. R²_CV=0.524 (±0.033 SD), MAE=0.089 (±0.011). Model comparison shows 20% improvement over categorical architecture labels.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "There is a capability saturation effect: coordination yields diminishing or negative returns (β̂=-0.404, p<0.001) once single-agent baselines exceed ~45%.",
    375       "evidence": "Section 4.3, Table 4: P_SA × log(1+n_a) interaction coefficient β̂=-0.404, 95% CI [-0.557, -0.252], p<0.001. Decision boundary derived at raw performance ≈0.45.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Independent agents amplify errors 17.2× while centralized coordination contains this to 4.4×.",
    380       "evidence": "Table 5: error amplification factors across architectures. Section 4.4 provides 95% CIs for independent [14.3, 20.1] and centralized [3.8, 5.0].",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Out-of-sample validation on GPT-5.2 achieves MAE=0.071 and confirms four of five scaling principles.",
    385       "evidence": "Appendix B, Tables 7-9: GPT-5.2 (Intelligence Index 75) tested on BrowseComp-Plus. SAS over-predicted (+49.5%), but MAS predictions well-calibrated (average error 7.4%). Kendall's τ=0.200 is weak for ranking.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Coordination scaling follows model-agnostic principles with maximum inter-family difference Δmax=0.023.",
    390       "evidence": "Section 4.1: cross-family consistency with CV<0.02 across families. However, Section 4.2 and Figure 4 show notable family-specific differences in architecture preferences.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating its own products",
    397       "detail": "Most authors are Google Research/DeepMind employees evaluating Gemini models alongside competitors. While results don't obviously favor Google (Gemini is not always best), the conflict is not disclosed. No competing interests statement."
    398     },
    399     {
    400       "flag": "No seed sensitivity analysis",
    401       "detail": "For a paper establishing 'scaling principles,' the absence of multi-seed evaluation is notable. Henderson et al. (2018), cited in this scan's experimental_rigor module description, showed RL results can vary 2x across seeds. The 180 configurations provide cross-configuration variance but not within-configuration stability."
    402     },
    403     {
    404       "flag": "Weak out-of-sample ranking validation",
    405       "detail": "The GPT-5.2 validation (Appendix B) reports Kendall's τ=0.200 for architecture ranking, described as 'weak.' SAS is over-predicted by 49.5%. The '4/5 findings validated' framing obscures that the model fails at the primary task of predicting optimal architecture for this held-out model."
    406     },
    407     {
    408       "flag": "No contamination analysis despite evaluating pre-trained models",
    409       "detail": "The paper cites Kapoor et al. (2025) on benchmark evaluation pitfalls but does not address contamination risk for its own four benchmarks, especially PlanCraft (2024) and WorkBench (2024), which predate some model training periods."
    410     },
    411     {
    412       "flag": "No code or data release",
    413       "detail": "For a paper proposing quantitative scaling principles as an engineering tool, the absence of code, prompts, and raw experimental data limits reproducibility and independent verification of the 180-configuration evaluation."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "AI agents that matter",
    419       "authors": ["S. Kapoor", "B. Stroebl", "Z. S. Siegel", "N. Nadgir", "A. Narayanan"],
    420       "year": 2025,
    421       "relevance": "Key prior work on agentic evaluation pitfalls, benchmark rigor, and cost metrics for agent systems."
    422     },
    423     {
    424       "title": "More agents is all you need",
    425       "authors": ["J. Li", "Q. Zhang", "Y. Yu", "Q. Fu", "D. Ye"],
    426       "year": 2024,
    427       "relevance": "Claim that scaling agent count improves performance, directly challenged by this paper's findings."
    428     },
    429     {
    430       "title": "Why do multi-agent LLM systems fail?",
    431       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    432       "year": 2025,
    433       "arxiv_id": "2503.13657",
    434       "relevance": "Multi-Agent System Failure Taxonomy (MAST) used in this paper's error analysis."
    435     },
    436     {
    437       "title": "Establishing best practices in building rigorous agentic benchmarks",
    438       "authors": ["Y. Zhu", "T. Jin", "Y. Pruksachatkun"],
    439       "year": 2025,
    440       "relevance": "Agentic Benchmark Checklist (ABC) extended by this paper for evaluation rigor."
    441     },
    442     {
    443       "title": "SWE-bench: Can language models resolve real-world github issues?",
    444       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    445       "year": 2024,
    446       "relevance": "Major agentic benchmark for software engineering, referenced as exemplar of agentic evaluation."
    447     },
    448     {
    449       "title": "Scaling large language model-based multi-agent collaboration",
    450       "authors": ["C. Qian", "Z. Xie", "Y. Wang"],
    451       "year": 2025,
    452       "relevance": "Proposes collaborative scaling laws for MAS, which this paper finds show no universal pattern."
    453     },
    454     {
    455       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    456       "authors": ["S. Hong", "M. Zhuge", "J. Chen"],
    457       "year": 2024,
    458       "relevance": "Meta-programming workflows for MAS that mitigate hallucination cascades."
    459     },
    460     {
    461       "title": "Single-agent or multi-agent systems? Why not both?",
    462       "authors": ["M. Gao", "Y. Li", "B. Liu"],
    463       "year": 2025,
    464       "arxiv_id": "2505.18286",
    465       "relevance": "Shows benefits of MAS diminish as base models improve, supporting capability saturation finding."
    466     },
    467     {
    468       "title": "Multi-agent architecture search via agentic supernet",
    469       "authors": ["G. Zhang", "L. Niu", "J. Fang"],
    470       "year": 2025,
    471       "relevance": "Achieves comparable MAS performance at 6-45% cost through dynamic architecture search."
    472     },
    473     {
    474       "title": "Scaling laws for neural language models",
    475       "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan"],
    476       "year": 2020,
    477       "arxiv_id": "2001.08361",
    478       "relevance": "Foundational neural scaling laws that this paper contrasts with agent coordination scaling."
    479     },
    480     {
    481       "title": "Improving factuality and reasoning in language models through multiagent debate",
    482       "authors": ["Y. Du", "S. Li", "A. Torralba"],
    483       "year": 2023,
    484       "relevance": "Demonstrates peer-to-peer debate effectiveness depends on task decomposability."
    485     },
    486     {
    487       "title": "Should we be going MAD? A look at multi-agent debate strategies for LLMs",
    488       "authors": ["A. Smit", "P. Duckworth", "N. Grinsztajn"],
    489       "year": 2023,
    490       "arxiv_id": "2311.17371",
    491       "relevance": "Shows multi-agent debate does not reliably outperform single-agent self-consistency."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs