scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29151B)
      1 {
      2   "paper": {
      3     "title": "The Single-Multi Evolution Loop for Self-Improving Model Collaboration Systems",
      4     "authors": [
      5       "Shangbin Feng",
      6       "Kishan Panaganti",
      7       "Yulia Tsvetkov",
      8       "Wenhao Yu"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.05182"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "The single-multi evolution loop iteratively alternates between multi-LM collaboration and knowledge distillation, improving individual models by 8.0% and collaboration systems by 14.9% on average across 15 tasks. The approach outperforms existing evolution strategies (multi-agent debate, multi-agent fine-tuning, SPARTA alignment) by 7.7% on average. Multi-student and on-policy distillation yield further gains over supervised KD. Results hold across 3 model pools, 7 collaboration strategies, and models of varying sizes (1.5B-7B).",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository URL provided in footnote 2: https://github.com/BunsenFeng/moco_distill. The paper states 'Code and data are available at' this URL."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper states 'Code and data are available at' the GitHub link. All 15 evaluation datasets are publicly available standard benchmarks (AGIEval, ARC, MMLU-redux, GSM8k, MATH, etc.) listed in Table 7."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup is described. Model names and some hyperparameters are listed but not the software environment needed to reproduce."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but no README contents or reproduction guide is described."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Table 1 reports avg ± std for single model results across all settings, e.g., '51.10 (0.72)'. Standard deviations are consistently provided for individual model performance."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Appendix B states: 'For every dataset and model pool, there is at least one (distillation, collaboration algorithm) setting where the improvements to single models and multi-model systems are statistically significant with p < 0.05 using one-tailed z-test.'"
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports percentage improvements with baseline context: '8.0% on average' for individual models, '14.9% on average' for collaboration systems, '7.7% on average' over existing methods. Tables provide before/after values enabling effect size computation."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper states 'We by default sample 1k data points for both dev and test sets for large datasets' (Section 3) but provides no justification for why 1k was chosen and no power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Standard deviations are reported in parentheses throughout Table 1 for single model results, e.g., '51.10 (0.72)'. However, the std is across models in the pool, not across random seeds or independent runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines included: initial unevolved models/systems (t=0) and three existing evolution methods (multi-agent debate, multi-agent fine-tuning, SPARTA alignment) in Figure 2."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include recent methods: SPARTA alignment (Jiang et al., 2025), multi-agent fine-tuning (Subramaniam et al., 2025), LLM Blender (2023), and GraphRouter (Feng et al., 2025c). All are contemporary."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Systematic ablation across collaboration strategies (4 in main + 3 additional), distillation methods (supervised, multi-student, on-policy KD in Table 2), model pools (3+1 settings), and model sizes (Figure 4). Each component's contribution is measured."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "15 evaluation datasets across 6 domains (QA, reasoning, knowledge, safety, science, instruction following). Each dataset has its own metric, providing diverse evaluation."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is included. All evaluation is automated benchmark scoring. Human evaluation would have been relevant for safety (CocoNot) and instruction following (human interest) tasks."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 3 explicitly describes dev/test separation: 'We by default sample 1k data points for both dev and test sets' and 'retain the best model/collaboration system based on the dev set.' Table 7 lists separate dev and test sizes."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 1 provides per-dataset breakdowns across all 15 datasets for every experimental setting. Section 4 also breaks down gains by domain: '16.84% (reasoning), 12.67% (knowledge), and 7.52% (QA).'"
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 3 analyzes the skill dynamics including 'single no, multi no' cases (unsolvable problems). The paper discusses logit fusion's poor performance. Table 6 shows qualitative examples where pre-evolution models fail with incorrect reasoning."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Logit fusion consistently underperforms other strategies (42.86 vs 50+ in pool #1). Some multi-KD logit settings show regressions. The paper acknowledges '91.9% settings' improve, implying 8.1% do not. Negative human interest scores appear in Table 1."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of '8.0% on average' and '14.9% on average' are directly supported by Table 1 results. The claim of outperforming existing methods by 7.7% is supported by Figure 2. The claim about 66.5% of previously unsolvable problems is supported by Figure 3 analysis."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims ('improves', 'benefits from') but the experimental design is adequate: controlled ablations varying one factor at a time (collaboration strategy, distillation method, model pool), with before/after comparisons (t=0 vs t=n) across multiple settings."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The conclusion claims 'a new paradigm of evolutionary AI systems' but experiments use only small models (1.5B-8B parameters) on English-language benchmarks. The broad paradigmatic claim extends well beyond the tested settings. No testing with larger models, multilingual tasks, or production deployment scenarios."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are discussed. The paper does not consider whether improvements come simply from additional training data rather than the collaboration pattern, nor whether the distillation data quality rather than the iterative loop drives gains."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures benchmark accuracy scores and reports improvements in those same scores. Claims generally match the measurement granularity ('8.0% improvement on 15 tasks') without overclaiming broader capabilities beyond what was measured."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model identifiers are provided: 'QWEN/QWEN2.5-7B-INSTRUCT', 'META-LLAMA/LLAMA-3.1-8B-INSTRUCT', 'DEEPSEEK-AI/DEEPSEEK-R1-DISTILL-QWEN-7B' (Section 3). These are HuggingFace-style identifiers sufficient for reproduction."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Prompt 1 in Section 2.2 provides the full prompt text used for multi-agent debate collaboration, including the exact template with input slots clearly marked."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3 reports: '512 max new tokens, temperature τ = 0.7, and top-p p = 0.9', '1e-5 learning rate for 5 epochs' for router, 'α : β : γ = 1:1:1' for multi-student KD. Appendix B provides additional hyperparameters for all 7 collaboration strategies."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The paper evaluates model collaboration strategies (routing, debate, logit fusion, merging) which are described in the methodology, but these are not agentic scaffolding involving tool use, planning, or memory management."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3 describes dataset sampling ('1k data points for both dev and test sets'), pool #1 creation (finetuned on three domains of Tulu-v3 SFT data), and the distillation data pipeline is formalized in Algorithm 1. Table 7 provides complete dataset statistics."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix A contains a dedicated 'Limitations' section with three substantive paragraphs discussing computational budget constraints, limited collaboration algorithm coverage, and restricted model pool settings."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Limitations are specific to this study: bounded to 3 iterations due to computational budgets, only 7 collaboration algorithms tested out of many possible, only 4 model pool settings explored. These are concrete rather than generic."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The limitations explicitly state what was not tested: 'more models, more diverse models, more evolution iterations', 'general-specialized collaboration, the collaboration of post-trained aligned models and pretrained base models', and more collaboration algorithms."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "While code and data are claimed available at the GitHub link, the paper does not explicitly state that raw experimental outputs (model generations, per-example scores) are available for independent verification of the reported numbers."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data collection is described: 15 standard benchmarks are listed with sources and sizes (Table 7). The distillation data generation process is formalized in Algorithm 1. Pool #1 creation from Tulu-v3 SFT data is described."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. All data comes from standard public benchmarks."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Algorithm 1 formally documents the complete pipeline: collaboration → dataset generation → distillation → iteration. Section 3 describes sampling procedures and evaluation protocols."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding statement, acknowledgments section, or grant information is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: University of Washington (Feng, Tsvetkov) and Tencent AI Seattle Lab (Panaganti, Yu)."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, making independence impossible to assess. Tencent is a corporate lab with potential commercial interest in model collaboration methods, but no funding statement addresses this."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial interest disclosure is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any of the base models (Qwen2.5-7B, Llama-3.1-8B, DeepSeek-R1-Distill-Qwen-7B), despite these being pre-trained models evaluated on public benchmarks."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of potential train/test overlap. The base models could have seen benchmark data (e.g., MMLU, ARC, GSM8k) during pre-training, but this is not addressed."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Many benchmarks predate the models: ARC (2018), MMLU (2021), GSM8k (2021), MATH (2021). These were almost certainly in the pre-training data of 2024-2025 models. No contamination analysis is performed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or per-example cost is reported. This is a significant omission for a paper whose central motivation is improving efficiency of multi-LM systems."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No GPU hours, training time, total API spend, or hardware specifications are reported. The paper involves fine-tuning multiple 7B models across 3 iterations with multiple settings, but computational cost is never quantified."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The ± std values in Table 1 represent variance across the 3 models in the pool, not across random seeds. No seed sensitivity analysis is performed."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of evolution iterations (k=3) is stated, but whether experiments were repeated across independent runs is not stated. The reported std is across models in the pool, not across experimental repetitions."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Hyperparameters are reported but no search budget is stated. No description of how many configurations were tried or how hyperparameters were selected."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Section 3 states 'retain the best model/collaboration system based on the dev set', explicitly using the dev set (not test set) for model selection."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper runs significance tests across 15 datasets × multiple settings but applies no correction for multiple comparisons. P-values are reported at p < 0.05 without Bonferroni or other family-wise corrections."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement their own system and baselines (using their MoCo framework) without acknowledging author-evaluation bias. Re-implementations of competing methods (SPARTA, multi-agent fine-tuning) may systematically underperform."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No performance-vs-compute analysis despite the paper's motivation being efficiency. The multi-step collaboration involves multiple model calls, and distillation requires training, but compute costs are never compared against baseline methods."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "15 benchmarks are used without any discussion of construct validity. No analysis of whether these benchmarks actually measure the claimed capabilities or whether improvements transfer to real-world usage."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "The collaboration strategy IS the independent variable being tested, not an external confound. Models are compared within the same collaboration framework."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of temporal leakage. Models trained in 2024-2025 are evaluated on benchmarks from 2018-2024, with no analysis of whether training data included benchmark solutions."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of feature leakage or whether the evaluation setup provides information not available in real usage scenarios."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of train/test independence. The distillation process trains on collaboration outputs for the same instruction set used for evaluation (though dev/test are split). No analysis of whether distillation data overlaps with or is structurally similar to test data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Distilling from model collaboration improves individual LMs by 8.0% on average.",
    369       "evidence": "Table 1 shows improvements across 15 datasets, 4 collaboration strategies, 2 distillation methods, and 2 model pools. The 8.0% is an average across all these settings.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "The multi-LM collaboration systems improve over initial unevolved systems by 14.9% on average.",
    374       "evidence": "Table 1 compares multi t=0 vs multi t=n across all settings. Orange-highlighted cells show improvements in 91.9% of settings.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The single-multi evolution loop outperforms existing evolutionary AI methods by 7.7% on average.",
    379       "evidence": "Figure 2 compares against multi-agent debate, multi-agent fine-tuning, and SPARTA alignment, but only on 3 datasets with model pool #1.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "The evolution loop helps solve an average of 66.5% of problems where initial models/systems struggled.",
    384       "evidence": "Figure 3 shows skill dynamics across evolution iterations, tracking the 'single no, multi no' quadrant shrinking over iterations.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Multi-student knowledge distillation outperforms simple supervised distillation by 4.67% on average.",
    389       "evidence": "Table 1 shows multi-KD consistently outperforming supervised-KD across most settings. Table 2 extends this to on-policy distillation.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The single-multi evolution loop is compatible with diverse models, collaboration methods, and distillation algorithms.",
    394       "evidence": "Tables 1, 3, 4, and Figure 4 demonstrate improvements across 3+ model pools, 7 collaboration strategies, 3 distillation methods, and models of varying sizes (1.5B-7B).",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No compute costs for an efficiency-motivated paper",
    401       "detail": "The paper's central motivation is reducing the cost of multi-LM systems, yet no computational costs (GPU hours, training time, API costs, wall-clock time) are reported anywhere. Without knowing the distillation and iteration costs, the efficiency claim cannot be evaluated."
    402     },
    403     {
    404       "flag": "Aggregate averages mask variable results",
    405       "detail": "The headline 8.0% and 14.9% improvements are averages across many settings, but several settings show marginal gains or regressions. Logit fusion multi-KD in pool #1 shows large swings on some tasks (e.g., BBH moves from 13.10 to 17.10, while MATH drops from 68.05 to 46.23)."
    406     },
    407     {
    408       "flag": "Limited comparison scope for existing evolution methods",
    409       "detail": "The comparison against existing evolution methods (Figure 2) uses only 3 datasets with model pool #1. The 7.7% superiority claim rests on a narrow comparison that may not generalize to other settings."
    410     },
    411     {
    412       "flag": "No contamination analysis",
    413       "detail": "Multiple benchmarks (ARC 2018, MMLU 2021, GSM8k 2021, MATH 2021) predate the models used (Qwen2.5, Llama-3.1, DeepSeek-R1 from 2024-2025). No contamination analysis is performed despite high likelihood the models saw these benchmark problems during pre-training."
    414     },
    415     {
    416       "flag": "Variance across models conflated with uncertainty",
    417       "detail": "The ± std values in Table 1 represent variance across the 3 models in the pool, not across random seeds or independent runs. This conflates model diversity with experimental uncertainty, making it impossible to assess result stability."
    418     },
    419     {
    420       "flag": "Self-authored framework used for baselines",
    421       "detail": "All experiments use the authors' MoCo framework (Feng et al., 2026), and baseline evolution methods are re-implemented within this framework. No independent implementations are used, creating potential for author-evaluation bias."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Improving factuality and reasoning in language models through multiagent debate",
    427       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"],
    428       "year": 2023,
    429       "relevance": "Foundational multi-agent debate method used as a collaboration strategy and baseline in the single-multi evolution loop."
    430     },
    431     {
    432       "title": "On-policy distillation of language models: Learning from self-generated mistakes",
    433       "authors": ["Rishabh Agarwal", "Nino Vieillard", "Yongchao Zhou"],
    434       "year": 2024,
    435       "relevance": "On-policy distillation method extended in this work for the distillation step, showing further gains over supervised KD."
    436     },
    437     {
    438       "title": "RouteLLM: Learning to route LLMs from preference data",
    439       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    440       "year": 2025,
    441       "relevance": "LLM routing method used as the API-level collaboration strategy in the single-multi evolution loop."
    442     },
    443     {
    444       "title": "When one LLM drools, multi-LLM collaboration rules",
    445       "authors": ["Shangbin Feng", "Wenxuan Ding", "Alisa Liu"],
    446       "year": 2025,
    447       "arxiv_id": "2502.04506",
    448       "relevance": "Survey of model collaboration methods that provides the taxonomy of collaboration levels (API, text, logit, weight) used in this paper."
    449     },
    450     {
    451       "title": "Multiagent finetuning: Self improvement with diverse reasoning chains",
    452       "authors": ["Vighnesh Subramaniam", "Yilun Du", "Joshua B. Tenenbaum"],
    453       "year": 2025,
    454       "relevance": "Multi-agent fine-tuning baseline for comparison and additional collaboration strategy tested in the evolution loop."
    455     },
    456     {
    457       "title": "SPARTA alignment: Collectively aligning multiple language models through combat",
    458       "authors": ["Yuru Jiang", "Wenxuan Ding", "Shangbin Feng"],
    459       "year": 2025,
    460       "arxiv_id": "2506.04721",
    461       "relevance": "Game-theoretic multi-model alignment method used as an existing evolution baseline for comparison."
    462     },
    463     {
    464       "title": "LLM-Blender: Ensembling large language models with pairwise ranking and generative fusion",
    465       "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
    466       "year": 2023,
    467       "relevance": "LLM ensembling method used as an additional collaboration strategy demonstrating the loop's compatibility with diverse approaches."
    468     },
    469     {
    470       "title": "Model Swarms: Collaborative search to adapt LLM experts via swarm intelligence",
    471       "authors": ["Shangbin Feng", "Zifeng Wang", "Yike Wang"],
    472       "year": 2025,
    473       "relevance": "Model collaboration via swarm intelligence in parameter space; weight-level collaboration method related to the model merging approach."
    474     },
    475     {
    476       "title": "Distilling the knowledge in a neural network",
    477       "authors": ["Geoffrey Hinton", "Oriol Vinyals", "Jeff Dean"],
    478       "year": 2015,
    479       "relevance": "Foundational knowledge distillation paper that established the teacher-student framework extended in this work to multi-LM teachers."
    480     },
    481     {
    482       "title": "MoCo: A one-stop shop for model collaboration research",
    483       "authors": ["Shangbin Feng", "Yijia Bai", "Zirui Yang"],
    484       "year": 2026,
    485       "arxiv_id": "2601.21257",
    486       "relevance": "The model collaboration framework used to implement all collaboration algorithms in the experiments."
    487     },
    488     {
    489       "title": "GraphRouter: A graph-based router for LLM selections",
    490       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    491       "year": 2025,
    492       "relevance": "Graph neural network-based LLM routing strategy tested as an additional collaboration method in the evolution loop."
    493     },
    494     {
    495       "title": "MAGDi: Structured distillation of multi-agent interaction graphs improves reasoning in smaller language models",
    496       "authors": ["Justin Chen", "Swarnadeep Saha", "Elias Stengel-Eskin", "Mohit Bansal"],
    497       "year": 2024,
    498       "relevance": "Distillation specifically in multi-agent settings, closely related to distilling from multi-LM collaboration systems."
    499     }
    500   ]
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs