scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26719B)
      1 {
      2   "paper": {
      3     "title": "SC-MAS: Constructing Cost-Efficient Multi-Agent Systems with Edge-Level Heterogeneous Collaboration",
      4     "authors": ["Di Zhao", "Longhui Ma", "Siwei Wang", "Miao Wang", "Yi Kong"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.09434",
      8     "doi": "10.48550/arXiv.2601.09434"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SC-MAS models multi-agent systems as directed acyclic graphs with edge-level heterogeneous collaboration strategies, achieving 1.46-3.53% accuracy gains with 11-16% cost reductions over MasRouter across five benchmarks (MMLU, GSM8K, MATH, HumanEval, MBPP). Ablation shows the edge optimizer and LLM router are the most impactful components. The framework can generalize to unseen LLMs (Deepseek-v3) without retraining and implicitly filters unhelpful agents during selection.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All five benchmarks used (MMLU, GSM8K, MATH, HumanEval, MBPP) are publicly available standard datasets. The paper references their original publications."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section C.2 mentions '8G GPU for training' but provides no requirements.txt, Dockerfile, library versions, or detailed environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithm description (Algorithm 2) describes the method but is not a reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Despite stating 'We run the experiment three times on each dataset and report the average,' Tables 1, 2, and 4 report only point estimates with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used. All claims of improvement (e.g., '3.35% improvement on MMLU') are based on comparing point estimates without any test of statistical significance."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '3.35% on MMLU' (84.25 → 87.60), '3.53% on MBPP' (84.00 → 87.53), '12.13% cost reduction on MBPP.' Both absolute and relative differences are provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for dataset sizes or the 1:4 validation/test split ratio. The MATH subset of 519 problems is mentioned as following prior work protocol but not independently justified."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper states experiments are 'run three times' and averages are reported, but no standard deviation, variance, or any spread measure appears in any table or figure."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Extensive baselines are compared in Table 1: dynamic MAS methods (GPTSwarm, AgentPrune, AFlow), single-agent routing (PromptLLM, RouteLLM, FrugalGPT, RouterDC), and multi-agent routing (MasRouter), plus vanilla single-model results."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include MasRouter (2025), AFlow (2025), RouteLLM (2025), and other 2024-2025 methods, representing the current state of the art in MAS routing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 presents a thorough ablation replacing each component (Fθv, Fθe, Fθl) with randomized selection, measuring both accuracy and cost impact across all five datasets."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports both accuracy and inference cost (in dollars) across all benchmarks, providing two complementary evaluation dimensions."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is performed. All evaluation is automated via benchmark test suites (accuracy on MMLU, GSM8K, MATH, HumanEval, MBPP)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 5.1 states 'we partition each dataset into validation and test subsets using a 1:4 ratio,' with validation used for training and test for evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down across 5 benchmarks (Table 1), per-component ablation (Table 2), role selection distributions (Figure 3), LLM allocation distributions (Figure 4), and case studies per dataset (Appendix D)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No failure analysis or error cases are presented. Section 5.4 shows filtering of deliberately noisy agents but does not analyze cases where SC-MAS fails on actual tasks."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Every experiment shows SC-MAS outperforming all baselines. The sensitivity analysis (Section 5.6) shows expected cost-performance tradeoffs but no surprising negative findings or approaches that were tried and abandoned."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 3.35% accuracy gain on MMLU (84.25→87.60 in Table 1) and 3.53% gain on MBPP (84.00→87.53) are confirmed by Table 1. Cost reduction claims are supported by Figure 2 and Table 4."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Table 2) uses controlled single-variable manipulation (replacing each component with random selection), which is an adequate design for claims like 'removing Fθe resulted in a performance drop.' This supports the causal role of each component."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Constructing Cost-Efficient Multi-Agent Systems' generally, but results are limited to 5 specific benchmarks (knowledge QA, math, coding). No qualification is provided about task types not covered (e.g., open-ended generation, creative tasks, long-form reasoning)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for SC-MAS's improvements. For example, whether the gains come specifically from heterogeneous collaboration or simply from using more diverse LLM combinations, or from the training procedure itself, is not explored."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures accuracy on standard benchmarks and reports it as accuracy, and measures token cost and reports it as cost. The claims match the granularity of measurements without inflated framing."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Only 'gpt-4o-mini-0718' has a specific version identifier. 'claude-3.5-haiku', 'gemini-1.5-flash', and 'llama-3.1-70b' are named without specific API versions or snapshot dates. Model behavior can change across versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The collaboration strategies (Chain, Debate, Criticism, CoT, Reflection) are named and cited but actual prompt text is never provided. The 26 agent role definitions are also not shown."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.1 reports learning rate α=0.01, temperature τ=1, cost penalty λ=5 (from {0,5,10,15,20}), and states LLM temperature parameters 'remain at default settings.'"
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The SC-MAS framework is described in detail: Node Selector (Section 4.1), Edge Optimizer (Section 4.2), LLM Router (Section 4.3), graph execution (Algorithm 1), and the full training workflow (Algorithm 2). Collaboration strategies are defined and the DAG constraint is formalized."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.1 documents the 1:4 validation/test split and the stratified sampling procedure for MATH (519 problems spanning difficulty levels), following the MasRouter protocol."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Limitation' is a dedicated section discussing two specific limitations: structural DAG constraints and limited interpretability."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations are specific to SC-MAS: (1) the DAG constraint 'precludes cyclical communication patterns and feedback loops' needed for iterative refinement, (2) the 'rationale behind selecting specific agents, collaboration strategies, and large language models remains non-trivial to interpret.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The limitations section discusses design constraints but does not explicitly state what the results do NOT show — e.g., no statement about untested task types, scale limitations, or settings where SC-MAS would not apply."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (per-run results, agent selections, generated graphs) is released. Only aggregated results in tables and figures."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes the benchmark datasets used, their sources (with citations), and the evaluation protocol including validation/test splits and stratified sampling."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All evaluation uses standard benchmark datasets."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from query input through node selection, edge optimization, LLM assignment, graph execution, and reward computation is fully documented in Algorithms 1 and 2. Dataset splitting is described in Section 5.1."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources, grants, or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: College of Computer Science and Technology, National University of Defense Technology, and Academy of Military Sciences."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence of funder cannot be assessed. The authors are at military institutions which could have institutional interest in cost-efficient AI systems, but no funding statement addresses this."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the four LLM backbones (GPT-4o-mini, Claude 3.5 Haiku, Gemini 1.5 Flash, Llama 3.1 70B)."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether benchmark examples (MMLU, GSM8K, MATH, HumanEval, MBPP — all published 2021) appeared in the training data of models trained through 2024."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "All five benchmarks were published in 2021, well before the training cutoffs of models used (2024 models). No contamination analysis is performed despite this obvious risk."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Inference costs in dollars are reported in Table 2, Table 4, and Figure 2, comparing SC-MAS against baselines across all five benchmarks."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section C.2 states '8G GPU for training' and Table 5 provides detailed training token counts (prompt tokens, completion tokens, total cost) for SC-MAS and baselines on MATH and MMLU."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper runs experiments three times but does not report per-seed results or any measure of seed sensitivity. Only averaged results appear in the tables."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 5.1 explicitly states 'We run the experiment three times on each dataset and report the average as the final result.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The values α=0.01, τ=1 are stated without explaining how they were selected or how many configurations were tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The λ parameter is varied across {0, 5, 10, 15, 20} with full results shown in Figure 5. The choice of λ=5 for main results is transparent, and the sensitivity analysis reveals the tradeoff curve."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No multiple comparison correction is applied. The paper compares SC-MAS against 8+ baselines across 5 datasets without any family-wise error rate control, and no significance tests are used at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
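    321         "justification": "The authors compare their SC-MAS implementation against their own implementations of baselines without acknowledging potential author-evaluation bias, i.e., the risk that a proposed method is tuned more carefully than the baselines it is compared against (the issue raised by Lucic et al., 2018)."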
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 2 directly compares inference costs between SC-MAS and MasRouter. Table 4 provides detailed cost-performance data. Table 5 compares training costs across methods. The λ sensitivity analysis (Figure 5) shows performance as a function of cost penalty."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether MMLU, GSM8K, MATH, HumanEval, or MBPP actually measure the capabilities claimed. The benchmarks are used without questioning construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "SC-MAS constructs different MAS configurations (scaffolds) per query. While the LLM pool is held constant across routing methods, different MAS structures represent different scaffolding. The confound between scaffold design and LLM capability is not discussed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Not addressed. All five benchmarks were published in 2021, and the LLMs used were trained on data through 2024, creating a 3-year window of potential temporal leakage."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. No analysis of whether the evaluation setup leaks information beyond what would be available in real usage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. No verification of independence between training data of the LLMs and test examples."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used (no canary strings, membership inference, decontamination, or temporal splits)."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SC-MAS improves accuracy by 3.35% on MMLU while reducing inference cost by 15.38% compared to MasRouter.",
    365       "evidence": "Table 1 shows SC-MAS 87.60% vs MasRouter 84.25% on MMLU. Cost comparison in Figure 2. Point estimates only, no error bars despite 3 runs.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "SC-MAS achieves a 3.53% accuracy gain with a 12.13% cost reduction on MBPP compared to MasRouter.",
    370       "evidence": "Table 1 shows SC-MAS 87.53% vs MasRouter 84.00% on MBPP. Table 4 shows costs: SC-MAS $0.913 vs MasRouter $1.039.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Edge-level heterogeneous collaboration is more effective than homogeneous graph-level collaboration.",
    375       "evidence": "Ablation in Table 2: removing Fθe drops accuracy 1.17-3.83% and increases cost 16.67-41.32%. Comparison with MasRouter (homogeneous) in Table 1.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "SC-MAS can generalize to unseen LLMs without retraining (inductive ability).",
    380       "evidence": "Section 5.5 and Figure 4: adding Deepseek-v3 without retraining improves MMLU from 87.60% to 88.14% and MATH from 76.75% to 77.84%. Only one new LLM tested.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "SC-MAS implicitly filters unhelpful agents by assigning low selection probabilities.",
    385       "evidence": "Section 5.4 and Figure 3: two deliberately noisy agents received only 7.63% and 4.58% selection rates on HumanEval. Single experiment with artificially constructed noisy agents.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "SC-MAS reduces token consumption by 11.17% to 16.35% while improving accuracy by 1.46% to 3.34% over state-of-the-art methods.",
    390       "evidence": "Section 1 contributions and Section 5.2 performance analysis. Table 1 for accuracy, Figure 2 and Table 4 for costs. Range spans multiple benchmarks.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No error bars despite multiple runs",
    397       "detail": "The paper states experiments are run 3 times and averaged, but no standard deviation, confidence interval, or any spread measure is reported in any table. This makes it impossible to assess whether the claimed improvements (e.g., 3.35% on MMLU) are within noise."
    398     },
    399     {
    400       "flag": "No significance tests for claimed improvements",
    401       "detail": "All comparative claims are based on comparing point estimates. With only 3 runs and no significance tests, claimed improvements of 1-3% could easily be within random variation."
    402     },
    403     {
    404       "flag": "Benchmark contamination risk unaddressed",
    405       "detail": "All five benchmarks (MMLU, GSM8K, MATH, HumanEval, MBPP) were published in 2021, well before the training cutoffs of the models used (2024). No contamination analysis is performed, despite this being a known issue for these benchmarks."
    406     },
    407     {
    408       "flag": "Uniformly positive results across all benchmarks",
    409       "detail": "SC-MAS outperforms all baselines on every single benchmark and metric. No failure cases or negative results are reported, which is unusual for a method tested across diverse task types."
    410     },
    411     {
    412       "flag": "No code release for reproducibility",
    413       "detail": "No source code, repository, or reproducibility artifacts are provided. Combined with missing prompts and incomplete model version specifications, independent reproduction is not possible."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    419       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    420       "year": 2024,
    421       "relevance": "Key multi-agent framework for software development that SC-MAS builds upon and compares against conceptually."
    422     },
    423     {
    424       "title": "MasRouter: Learning to route LLMs for multi-agent systems",
    425       "authors": ["Yanwei Yue", "Guibin Zhang", "Boyang Liu"],
    426       "year": 2025,
    427       "relevance": "Primary baseline and predecessor for multi-agent routing; SC-MAS directly extends its homogeneous collaboration to heterogeneous edge-level strategies."
    428     },
    429     {
    430       "title": "GPTSwarm: language agents as optimizable graphs",
    431       "authors": ["Mingchen Zhuge", "Wenyi Wang", "Louis Kirsch"],
    432       "year": 2024,
    433       "relevance": "Graph-based MAS optimization baseline that models agents as optimizable graphs, compared against in Table 1."
    434     },
    435     {
    436       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    437       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    438       "year": 2024,
    439       "relevance": "Foundational work on LLM cost reduction through routing strategies, evaluated as a single-agent routing baseline."
    440     },
    441     {
    442       "title": "RouteLLM: Learning to route LLMs from preference data",
    443       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    444       "year": 2025,
    445       "relevance": "Single-agent LLM routing method using preference data, compared as baseline for routing quality."
    446     },
    447     {
    448       "title": "AFlow: Automating agentic workflow generation",
    449       "authors": ["Jiayi Zhang", "Jinyu Xiang", "Zhaoyang Yu"],
    450       "year": 2025,
    451       "relevance": "Automated agentic workflow generation method compared as dynamic MAS baseline."
    452     },
    453     {
    454       "title": "Cut the crap: An economical communication pipeline for LLM-based multi-agent systems",
    455       "authors": ["Guibin Zhang", "Yanwei Yue"],
    456       "year": 2025,
    457       "relevance": "AgentPrune method for reducing communication overhead in MAS, directly compared as dynamic MAS baseline."
    458     },
    459     {
    460       "title": "ChatDev: Communicative agents for software development",
    461       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    462       "year": 2024,
    463       "relevance": "Multi-agent software development system using chain collaboration patterns referenced in SC-MAS's edge strategy design."
    464     },
    465     {
    466       "title": "GraphRouter: A graph-based router for LLM selections",
    467       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    468       "year": 2025,
    469       "relevance": "Graph-based LLM routing method used as single-agent routing baseline (PromptLLM)."
    470     },
    471     {
    472       "title": "More agents is all you need",
    473       "authors": ["Junyou Li", "Qin Zhang", "Yangbin Yu"],
    474       "year": 2024,
    475       "relevance": "Studies scaling properties of multi-agent systems, relevant to understanding when more agents help vs. hurt."
    476     },
    477     {
    478       "title": "Mixture-of-agents enhances large language model capabilities",
    479       "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun"],
    480       "year": 2025,
    481       "relevance": "Mixture-of-agents approach for combining LLMs, related to the heterogeneous LLM assignment in SC-MAS."
    482     },
    483     {
    484       "title": "Automated design of agentic systems",
    485       "authors": ["Shengran Hu", "Cong Lu", "Jeff Clune"],
    486       "year": 2025,
    487       "relevance": "Automated agent architecture search via Meta Agent Search, in which a meta agent iteratively programs new agents in code; related to SC-MAS's search over MAS configurations."
    488     }
    489   ]
    490 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs