scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (35597B)
      1 {
      2   "paper": {
      3     "title": "Disentangling Causal Importance from Emergent Structure in Multi-Expert Orchestration",
      4     "authors": [
      5       "Sudipto Ghosh",
      6       "Sujoy Nath",
      7       "Sunny Manchanda",
      8       "Tanmoy Chakraborty"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.04291"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The orchestrator and INFORM analysis code are not released."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses publicly available benchmarks: GSM8K (Cobbe et al., 2021), HumanEval (Chen et al., 2021), and MMLU (Hendrycks et al., 2021). No proprietary data was collected."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions training on 'a single NVIDIA A100 80GB GPU' (Appendix C) and lists hyperparameters in Table 4, but does not provide a requirements.txt, Dockerfile, or library version specifications sufficient to recreate the environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, scripts, or README are provided. The architecture and training objective are described but not in enough detail to reproduce without significant reverse engineering."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 8 reports 'mean KL divergence ± 95% CI computed across training epochs.' Table 11 reports 'mean and the standard deviation' with ± notation. Box plots with whiskers appear in Figures 6 and 7."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Table 10 reports Spearman's ρ with two-sided p-values and Kendall's τ for rank correlation. Table 11 reports Wilcoxon signed-rank test p-values and paired t-test p-values for masking ablation comparisons. Appendix J provides detailed statistical tests."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports KL divergence magnitudes (e.g., routing divergence of 2.366 vs sequence divergence of 0.428 on MMLU in Table 8), relative comparisons ('5.5× higher routing KL divergence'), speedup ratios ('~3.5× speedup'), and absolute performance differences ('~1.4% performance gain')."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for why 10 experts were chosen, why 100 held-out examples were used for hyperparameter tuning, or why 5 training epochs were run. No power analysis is mentioned."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Table 8 reports ± 95% CI across epochs. Table 11 reports mean ± standard deviation. Figure 11 states results are 'mean accuracy... over 3 runs.' Box plots in Figures 6 and 7 show spread across samples."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 3 compares against MetaGPT. Figures 9 and 10 compare against a Uniform Baseline, Relational-Only, and Intrinsic-Only variants. Table 9 includes individual model baselines (LLaMA 3.1 8B, Qwen3 8B, DeepSeek-R1 8B)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "MetaGPT (Hong et al., 2024, ICLR 2024) is a contemporary multi-agent framework. Table 1 also positions against RouteLLM (Ong et al., 2025), IRT-Router (Song et al., 2025), and other recent systems."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 4.5 and Appendix F present extensive ablations: static collaboration graph (Figure 11a), static inference sequence (Figure 11b), masking intrinsically important experts (Table 8, Figure 8), and Relational-Only vs Intrinsic-Only routing variants (Figures 9, 10)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper reports accuracy (GSM8K, MMLU), Pass@1 (HumanEval), KL divergence (routing and sequence), entropy (collaboration and sequence), Gini coefficient (centralization), Spearman's ρ, Kendall's τ, and gradient norms (intrinsic importance)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is performed. All evaluation of the orchestration system and its outputs is automated. The interpretability claims (e.g., whether INFORM reveals 'meaningful' structure) could benefit from human assessment but none is provided."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 states the orchestrator is 'evaluated on a held-out subset of the test set.' Appendix C states 'A held-out set of 100 examples were used to determine the weights of the loss terms,' separating tuning from evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are consistently broken down by task (GSM8K, HumanEval, MMLU), by training epoch (Tables 8, 10; Figures 3-8), and by expert (Figures 4, 5, 13, 14). Different tasks show qualitatively different behaviors."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Appendix H presents an extensive discussion of failure modes: hub over-centralization, routing-attribution misalignment, early commitment to suboptimal initializers, overconfidence under semantically damaged inputs, and redundancy masking structural dependence. Task-specific failure profiles are also analyzed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 10 shows weak and statistically insignificant rank correlation between routing dominance and causal importance — a key negative finding. Relational-Only and Intrinsic-Only ablations show degraded performance (Figure 9). Appendix H documents systematic failure modes of the orchestrator."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: routing-importance divergence by Tables 10-11 and Figures 4-5; asynchronous emergence by Figures 6-7; masking collapse by Table 8 and Figure 8; task-dependent structure by per-task breakdowns throughout."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims about expert importance and addresses them through gradient-based attribution (Section 2.2) and controlled ablation/masking experiments (Section 4.5, Appendix F). FAQ Q1 explicitly acknowledges these are 'causal signals' not 'formal causal graphs,' appropriately hedging. Masking single experts (single-variable manipulation) is adequate for the claims made."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title and framing ('Multi-Expert Orchestration') are broad, but the paper only evaluates one specific differentiable orchestrator architecture (Oθ) with one specific training setup, plus a brief cascade analysis in Appendix I. The claim that 'any differentiable routing policy shall suffice' (Section 2.1) is asserted without testing on other architectures. Results on 3 benchmarks with specific model families are presented as general orchestration findings."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper considers multiple alternative explanations: FAQ Q3 clarifies high importance doesn't mean better accuracy; Q4 explains why frequently routed experts may have low attribution (interaction hubs vs causal necessity); Q8 discusses when masking doesn't reduce performance (redundancy). Section 4.2 discusses interaction-level dependencies as alternative to intrinsic strength."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "FAQ Q1 explicitly distinguishes between gradient sensitivity (the proxy measured) and formal causal structure (the outcome claimed), stating INFORM provides 'causal signals' not 'full causal mechanisms.' FAQ Q2 and Q3 distinguish between influence on routing decisions (measured) and expert correctness/quality (what might be inferred). The paper is clear about what its measurements do and do not show."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model versions are given: 'LLaMA-3.1 8B', 'Qwen-3 8B', 'DeepSeek-R1 8B' (with 'distilled DeepSeek-R1-0528-Qwen3-8B variant' specified in Appendix D) for the homogeneous consortium, and 'LLaMA-3.2 1B', 'Qwen2.5 3B', 'Mistral 7B' for the heterogeneous one. Oracle: 'GPT OSS 20B' (Agarwal et al., 2025)."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The full prompt template is provided in Appendix E, including the expert coordination instructions and template structure. The template shows the actual text sent to models, including role instructions and task formatting."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Table 4 provides comprehensive hyperparameters: learning rate, batch size, epochs, warmup ratio, gradient clipping, hidden dimension, attention heads, dropout, Gumbel-Softmax temperatures, and all loss coefficients. Tables 6 and 7 list per-expert decoding temperatures. Appendix D specifies top_p settings."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 2.1 describes the orchestration architecture in detail: the Interaction Module (routing adapter, conditional transition matrix with query-key attention and cosine similarity prior), the Selection Module (Gumbel-Softmax sampling with position penalty), and the multi-step expert chaining. Figure 1 provides a visual overview. Appendix C details the training objective."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4 states 'initial 512 tokens generated by each expert is used as an input to the orchestrator.' Appendix C describes the held-out set used for tuning. The paper uses standard benchmark splits for GSM8K, HumanEval, and MMLU with standard evaluation protocols."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Appendix A (FAQ) addresses some limitations indirectly (Q1, Q7, Q8), and Appendix H discusses failure modes of orchestration, but neither constitutes a dedicated limitations section about the study methodology itself."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "FAQ Q1 specifically states INFORM does not 'recover causal structure in the formal sense.' FAQ Q7 notes the method requires white-box access and cannot be applied to API-based systems. FAQ Q8 discusses when masking may not reduce performance due to redundancy. These are specific to this study, not generic disclaimers."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. FAQ Q7 mentions black-box inapplicability, but there is no systematic statement of scope boundaries — e.g., whether findings generalize beyond the specific orchestrator architecture, beyond 8B-scale models, or beyond the three benchmarks tested. The title implies broad generality without bounding."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw experimental data (routing matrices, gradient attributions, per-sample results) is released. Only aggregated statistics are reported in figures and tables. The underlying benchmarks are public, but the orchestrator outputs and intermediate analysis data are not available."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The data consists of standard public benchmarks (GSM8K, HumanEval, MMLU), which are well-documented in their respective papers. The paper describes how expert outputs are generated (512 initial tokens per expert) and how the orchestrator is evaluated on held-out test subsets."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are standard public benchmarks."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline from input to evaluation is documented: benchmarks → expert generation (512 tokens each, Tables 6-7 for decoding configs) → BERT encoder → orchestrator routing → expert chaining → evaluation. Appendix C details the training pipeline. Table 4 specifies all parameters."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding or acknowledgments section is present in the paper. One author is affiliated with DRDO (Defence Research and Development Organisation, India), which is a government defense agency, but no funding statement is provided."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: IIT Delhi (Yardi School of AI, Department of Electrical Engineering) and DRDO Young Scientist Laboratory. The paper does not evaluate any product made by these institutions, so no product-affiliation conflict exists."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence of funder cannot be verified. One author is affiliated with DRDO (defense organization), but no explicit funding or sponsorship statement is provided."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the models used (LLaMA 3.1, Qwen3, DeepSeek-R1, Mistral 7B, GPT OSS 20B). These models were likely trained on data that includes the benchmarks used for evaluation."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of potential train/test overlap. The expert LLMs may have been trained on GSM8K (2021), HumanEval (2021), and MMLU (2021) solutions, which could affect both individual expert performance and orchestration dynamics. This is not addressed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "All three benchmarks (GSM8K, HumanEval, MMLU) were published in 2021, well before the training cutoffs of the 2024-2025 era models used. Benchmark contamination risk is significant and not discussed. If experts have memorized answers, the orchestration task is fundamentally different from what the paper analyzes."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Table 3 reports average model calls (1.44 for INFORM vs 5.00 for MetaGPT) and claims '~3.5× speedup,' but no actual wall-clock time, API costs, tokens consumed, or cost per example is reported. The model call count is a proxy, not an actual cost measurement."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper states training was on 'a single NVIDIA A100 80GB GPU' (Appendix C) and lists 5 training epochs with batch size 2, but does not report total GPU hours, training wall-clock time, or total computational budget."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No explicit seed sensitivity analysis. Figure 11 mentions results are 'over 3 runs' for the ablation study, but the main results in Table 9 report 'best accuracy' values without showing sensitivity across random seeds."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Figure 11 states 'over 3 runs' for ablation results. However, the main performance results in Table 9 report 'best accuracy' without stating how many runs produced them. Table 8 reports values 'computed across training epochs,' i.e., variation across epochs rather than across independent runs. The number of runs is therefore stated inconsistently across experiments."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Table 5 shows a hyperparameter sweep 'through magnitudes {10⁻², 10⁻¹, 10⁰}' for λ_oracle, λ_symmetry, and λ_sparsity. Appendix C states 'A held-out set of 100 examples were used to determine the weights of the loss terms.' The search method and budget are documented."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Table 5 shows the sweep results, but Table 9 reports 'best accuracy' across training epochs without explaining the selection criterion (e.g., best validation epoch, last epoch, or cherry-picked). The 'best' label implies cherry-picking across epochs."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Table 10 reports 15 correlation tests across tasks and epochs, and Table 11 reports multiple paired comparisons, all without correction for multiple comparisons (no Bonferroni, Holm, or Benjamini-Hochberg mentioned)."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors compare their INFORM orchestrator against MetaGPT using their own experimental setup (assigning same-size models to MetaGPT roles). No discussion of author-evaluation bias or the risk of inadvertently optimizing their system's presentation relative to the baseline."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Table 3 compares INFORM (1.44 avg calls) vs MetaGPT (5.00 avg calls) at matched expert capabilities. Figure 12 plots MMLU accuracy as a function of total parameters (consortium scaling), showing performance relative to monolith models at different scales."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper uses GSM8K, HumanEval, and MMLU without discussing whether these benchmarks actually measure what INFORM claims to analyze — i.e., whether orchestration dynamics on these tasks generalize to real-world multi-expert collaboration. No construct validity discussion is provided."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The MetaGPT comparison (Table 3) uses a different orchestration mechanism (SOPs vs learned routing) with the same underlying models, but the scaffold confound is not explicitly discussed. The different frameworks introduce confounds beyond routing policy (e.g., MetaGPT's role definitions, communication patterns) that could explain performance differences."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. All three benchmarks (GSM8K, HumanEval, MMLU; all released in 2021) predate the training-data cutoffs of the models used (LLaMA 3.1, Qwen3, DeepSeek-R1; all 2024-2025). Solutions to these benchmarks are widely available online and are likely present in the models' training data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. The initial 512 tokens generated by each expert could contain memorized solutions that leak answer information to the orchestrator. If experts have seen benchmark problems during training, the orchestration task changes fundamentally."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Not discussed. No analysis of whether evaluation examples share structural properties with training data or whether benchmark problems are independent of each other."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination is mentioned."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Routing dominance is a poor proxy for functional necessity — frequently selected experts are not always causally important.",
    370       "evidence": "Table 10 shows weak and mostly statistically insignificant Spearman rank correlation between routing dominance and intrinsic importance across tasks and epochs (the strongest correlation, ρ = 0.648 with p=0.043 on MMLU Epoch 2, is the main exception; most others have p > 0.1). Figures 4 and 5 visually show divergence between gradient attribution and routing mass patterns.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Masking the single most intrinsically important expert induces 5.5× higher routing KL divergence than sequencing divergence on MMLU.",
    375       "evidence": "Table 8 reports KL(Routing) = 2.366 ± 0.497 vs KL(Sequence) = 0.428 ± 0.072 on MMLU (ratio ~5.5×). Figure 8c confirms routing divergence consistently exceeds sequencing divergence across epochs on MMLU.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Orchestration behaviors emerge asynchronously: centralization precedes stable routing confidence.",
    380       "evidence": "Figures 6a and 6c show Gini coefficient (centralization) increasing before collaboration entropy fully stabilizes across tasks. Section 4.3 states 'centralization increases before routing entropy fully stabilizes.'",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Expert ordering remains non-deterministic — entropy decreases but stays above zero.",
    385       "evidence": "Figure 6b shows sequence entropy decreasing across training epochs but remaining significantly above zero on all three tasks. Table 2 shows the starting-expert distribution concentrating but not collapsing to a single expert.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "INFORM orchestrator achieves 87.1% Pass@1 on HumanEval with 1.44 average model calls vs MetaGPT's 85.9% with 5.00 calls (~3.5× speedup with ~1.4% gain).",
    390       "evidence": "Table 3 reports these specific numbers. However, the comparison is between a learned adaptive orchestrator and MetaGPT's fixed SOP framework — they are architecturally quite different, so the comparison reflects differences in overall orchestration style rather than isolating routing efficiency.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "A consortium of two 8B Qwen3 experts (~16B total parameters) matches the accuracy of a 70B monolith model on MMLU.",
    395       "evidence": "Figure 12 shows the consortium curve crossing the 70B monolith accuracy line (~86%) at ~16B total parameters. However, the orchestrator was trained with oracle distillation from GPT OSS 20B, meaning the training signal includes a much larger model's knowledge. The comparison is between a distilled/orchestrated system and a standalone model.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Routing sensitivity is task-dependent: numerical token removal affects GSM8K most, while sentence shuffling and reasoning removal affect HumanEval and MMLU most.",
    400       "evidence": "Figure 3 shows perturbation-specific entropy shifts across tasks, with 'remove_numbers' causing the largest shift on GSM8K (Figure 3a) and 'shuffle_sentences' causing larger shifts on HumanEval and MMLU (Figures 3b, 3c).",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "Masking intrinsically important experts induces substantially greater routing collapse than masking frequently routed experts.",
    405       "evidence": "Table 11 reports intrinsic masking KL = 1.38 ± 0.40 vs frequent masking KL = 0.11 ± 0.15 on GSM8K (paired t p=0.0065). However, on HumanEval the pattern reverses (intrinsic = 0.30 ± 0.19, frequent = 0.81 ± 0.46, p=0.0223). MMLU shows the expected direction (2.37 vs 1.40) but is not statistically significant (p=0.267).",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval"
    411   ],
    412   "key_findings": "INFORM reveals a systematic divergence between routing frequency (relational importance) and gradient-based causal attribution (intrinsic importance) in multi-expert LLM orchestration. Experts that dominate routing often function as interaction hubs with limited causal influence, while sparsely routed experts can be structurally critical. Orchestration dynamics emerge asynchronously during training, with centralization preceding stable routing confidence, and expert ordering remaining non-deterministic. Targeted ablations confirm that masking intrinsically important experts disrupts routing structure disproportionately compared to masking frequently selected peers, though this effect is task-dependent.",
    413   "red_flags": [
    414     {
    415       "flag": "Cherry-picked best-epoch results",
    416       "detail": "Table 9 reports 'best accuracy' across training epochs rather than final-epoch or averaged results. This allows selecting the most favorable epoch for each task independently, inflating reported performance numbers."
    417     },
    418     {
    419       "flag": "Oracle distillation confounds scaling comparison",
    420       "detail": "Figure 12 claims a consortium of 2×8B experts matches a 70B monolith on MMLU, but the orchestrator was trained with oracle distillation from GPT OSS 20B. The consortium benefits from knowledge of a much larger model during training, making the parameter-count comparison misleading."
    421     },
    422     {
    423       "flag": "No contamination discussion despite using well-known benchmarks",
    424       "detail": "GSM8K, HumanEval, and MMLU (all 2021) are likely in the training data of the 2024–2025-era models used. If experts have memorized benchmark answers, the orchestration task is fundamentally different from coordinating genuine reasoning — the orchestrator may be routing to memorized answers rather than reasoning chains."
    425     },
    426     {
    427       "flag": "Unequal comparison with MetaGPT",
    428       "detail": "Table 3 compares INFORM (a learned, trained orchestrator with oracle distillation) against MetaGPT (a fixed SOP framework). These are architecturally very different systems. The claim of '~3.5× speedup' conflates the benefits of adaptive routing with the overhead of rigid role-based workflows."
    429     },
    430     {
    431       "flag": "Small held-out set for hyperparameter tuning",
    432       "detail": "Only 100 held-out examples were used for determining loss weights (Appendix C). This small set may not be representative, risking overfitting the orchestrator's training configuration to a small sample."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    438       "authors": [
    439         "Qingyun Wu",
    440         "Gagan Bansal",
    441         "Jieyu Zhang"
    442       ],
    443       "year": 2024,
    444       "relevance": "Foundational multi-agent LLM framework for conversational orchestration, directly compared as a coordination approach in Table 1."
    445     },
    446     {
    447       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    448       "authors": [
    449         "Sirui Hong",
    450         "Mingchen Zhuge",
    451         "Jonathan Chen"
    452       ],
    453       "year": 2024,
    454       "relevance": "State-of-the-art rigid multi-agent coordination framework used as the primary performance baseline in Table 3."
    455     },
    456     {
    457       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    458       "authors": [
    459         "Isaac Ong",
    460         "Amjad Almahairi",
    461         "Vincent Wu"
    462       ],
    463       "year": 2025,
    464       "relevance": "LLM routing framework for cost-performance trade-off, positioned in Table 1 as having moderate interpretability."
    465     },
    466     {
    467       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    468       "authors": [
    469         "Lingjiao Chen",
    470         "Matei Zaharia",
    471         "James Zou"
    472       ],
    473       "year": 2024,
    474       "relevance": "Cost-efficient cascade routing approach analyzed in Appendix I using INFORM's causal attribution methods."
    475     },
    476     {
    477       "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
    478       "authors": [
    479         "Dongfu Jiang",
    480         "Xiang Ren",
    481         "Bill Yuchen Lin"
    482       ],
    483       "year": 2023,
    484       "relevance": "Output aggregation framework for multi-LLM ensembles that treats expert contributions as exchangeable, contrasted with INFORM's sequential approach."
    485     },
    486     {
    487       "title": "Mixture-of-agents enhances large language model capabilities",
    488       "authors": [
    489         "Junlin Wang",
    490         "Jue Wang",
    491         "Ben Athiwaratkun"
    492       ],
    493       "year": 2025,
    494       "relevance": "Multi-LLM mixture-of-agents approach demonstrating capability gains from model collaboration."
    495     },
    496     {
    497       "title": "Multi-Agent Collaboration via Evolving Orchestration",
    498       "authors": [
    499         "Yufan Dang",
    500         "Chen Qian",
    501         "Xueheng Luo"
    502       ],
    503       "year": 2025,
    504       "relevance": "Evolving orchestration framework for multi-agent collaboration, directly relevant to learned orchestration mechanisms."
    505     },
    506     {
    507       "title": "IRT-router: Effective and interpretable multi-LLM routing via item response theory",
    508       "authors": [
    509         "Wei Song",
    510         "Zhenya Huang",
    511         "Cheng Cheng"
    512       ],
    513       "year": 2025,
    514       "relevance": "Interpretable LLM router using Item Response Theory, positioned as having high interpretability in Table 1."
    515     },
    516     {
    517       "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving",
    518       "authors": [
    519         "Md. Ashraful Islam",
    520         "Mohammed Eunus Ali",
    521         "Md Rizwan Parvez"
    522       ],
    523       "year": 2024,
    524       "relevance": "Multi-agent code generation system demonstrating agentic collaboration for programming tasks."
    525     },
    526     {
    527       "title": "Evaluating large language models trained on code",
    528       "authors": [
    529         "Mark Chen",
    530         "Jerry Tworek",
    531         "Heewoo Jun"
    532       ],
    533       "year": 2021,
    534       "arxiv_id": "2107.03374",
    535       "relevance": "Introduces the HumanEval benchmark used as one of the three primary evaluation benchmarks in this paper."
    536     },
    537     {
    538       "title": "Wisdom and Delusion of LLM Ensembles for Code Generation and Repair",
    539       "authors": [
    540         "Fernando Vallecillos-Ruiz",
    541         "Max Hort",
    542         "Leon Moonen"
    543       ],
    544       "year": 2025,
    545       "relevance": "Studies LLM ensemble degeneration into static ensembles, a failure mode INFORM aims to diagnose."
    546     },
    547     {
    548       "title": "Can Dependencies Induced by LLM-Agent Workflows Be Trusted?",
    549       "authors": [
    550         "Yu Yao",
    551         "Yiliao Song",
    552         "Yian Xie"
    553       ],
    554       "year": 2025,
    555       "relevance": "Studies trustworthiness of dependencies in LLM-agent workflows, directly related to INFORM's analysis of orchestration dependencies."
    556     }
    557   ],
    558   "engagement_factors": {
    559     "practical_relevance": {
    560       "score": 1,
    561       "justification": "The INFORM framework could help practitioners debug multi-agent LLM systems, but requires white-box access and significant adaptation to apply beyond the specific orchestrator studied."
    562     },
    563     "surprise_contrarian": {
    564       "score": 2,
    565       "justification": "The core finding that routing frequency diverges from causal importance — popular experts aren't necessarily important ones — is a genuinely counterintuitive insight for anyone building multi-agent systems."
    566     },
    567     "fear_safety": {
    568       "score": 0,
    569       "justification": "No safety, security, or misuse concerns are raised; the paper is purely about interpretability of routing mechanisms."
    570     },
    571     "drama_conflict": {
    572       "score": 1,
    573       "justification": "Mildly questions the opacity of existing multi-agent frameworks like MetaGPT and AutoGen, but does so academically without any pointed controversy."
    574     },
    575     "demo_ability": {
    576       "score": 0,
    577       "justification": "No code release, no demo, no tool — purely an analytical framework described in a paper with no public implementation mentioned."
    578     },
    579     "brand_recognition": {
    580       "score": 0,
    581       "justification": "From IIT Delhi and DRDO — recognized in India but not household names in the global tech/AI community; no famous product or lab involved."
    582     }
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs