scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36637B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Disentangling Causal Importance from Emergent Structure in Multi-Expert Orchestration",
      6     "authors": [
      7       "Sudipto Ghosh",
      8       "Sujoy Nath",
      9       "Sunny Manchanda",
     10       "Tanmoy Chakraborty"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.04291",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims are supported: routing-importance divergence by Tables 10-11 and Figures 4-5; asynchronous emergence by Figures 6-7; masking collapse by Table 8 and Figure 8; task-dependent structure by per-task breakdowns throughout.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper makes causal claims about expert importance and addresses them through gradient-based attribution (Section 2.2) and controlled ablation/masking experiments (Section 4.5, Appendix F). FAQ Q1 explicitly acknowledges these are 'causal signals' not 'formal causal graphs,' appropriately hedging. Masking single experts (single-variable manipulation) is adequate for the claims made.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title and framing ('Multi-Expert Orchestration') are broad, but the paper only evaluates one specific differentiable orchestrator architecture (Oθ) with one specific training setup, plus a brief cascade analysis in Appendix I. The claim that 'any differentiable routing policy shall suffice' (Section 2.1) is asserted without testing on other architectures. Results on 3 benchmarks with specific model families are presented as general orchestration findings.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper considers multiple alternative explanations: FAQ Q3 clarifies high importance doesn't mean better accuracy; Q4 explains why frequently routed experts may have low attribution (interaction hubs vs causal necessity); Q8 discusses when masking doesn't reduce performance (redundancy). Section 4.2 discusses interaction-level dependencies as alternative to intrinsic strength.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "FAQ Q1 explicitly distinguishes between gradient sensitivity (the proxy measured) and formal causal structure (the outcome claimed), stating INFORM provides 'causal signals' not 'full causal mechanisms.' FAQ Q2 and Q3 distinguish between influence on routing decisions (measured) and expert correctness/quality (what might be inferred). The paper is clear about what its measurements do and do not show.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Appendix A (FAQ) addresses some limitations indirectly (Q1, Q7, Q8), and Appendix H discusses failure modes of orchestration, but neither constitutes a dedicated limitations section about the study methodology itself.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "FAQ Q1 specifically states INFORM does not 'recover causal structure in the formal sense.' FAQ Q7 notes the method requires white-box access and cannot be applied to API-based systems. FAQ Q8 discusses when masking may not reduce performance due to redundancy. These are specific to this study, not generic disclaimers.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what the results do NOT show. FAQ Q7 mentions black-box inapplicability, but there is no systematic statement of scope boundaries — e.g., whether findings generalize beyond the specific orchestrator architecture, beyond 8B-scale models, or beyond the three benchmarks tested. The title implies broad generality without bounding.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding or acknowledgments section is present in the paper. One author is affiliated with DRDO (Defence Research and Development Organisation, India), which is a government defense agency, but no funding statement is provided.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed: IIT Delhi (Yardi School of AI, Department of Electrical Engineering) and DRDO Young Scientist Laboratory. The paper does not evaluate any product made by these institutions, so no product-affiliation conflict exists.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence of funder cannot be verified. One author is affiliated with DRDO (defense organization), but no explicit funding or sponsorship statement is provided.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are precisely defined: 'relational importance' (total incoming routing mass u_j(x)), 'intrinsic importance' (gradient norm of log-probability w.r.t. expert representation I(E_i)), 'orchestration' (sequential expert selection mechanism), and 'expert zoo' (fixed consortium of frozen LLMs) are all formally defined with equations.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states its contribution: INFORM, an interpretability framework that treats orchestration as analyzable computation enabling decoupling of interaction structure, sequencing, and causal attribution, with four explicit research questions (RQ1–RQ4) stated in the introduction.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 1 provides a structured comparison of INFORM against six prior systems (MetaGPT, AutoGen, FrugalGPT, RouteLLM, etc.) along specific dimensions; Appendix B extends the related work across MoE, multi-agent collaboration, and interpretability literature with clear positioning.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The orchestrator and INFORM analysis code are not released.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The paper uses publicly available benchmarks: GSM8K (Cobbe et al., 2021), HumanEval (Chen et al., 2021), and MMLU (Hendrycks et al., 2021). No proprietary data was collected.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper mentions training on 'a single NVIDIA A100 80GB GPU' (Appendix C) and lists hyperparameters in Table 4, but does not provide a requirements.txt, Dockerfile, or library version specifications sufficient to recreate the environment.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions, scripts, or README are provided. The architecture and training objective are described but not in enough detail to reproduce without significant reverse engineering.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Table 8 reports 'mean KL divergence ± 95% CI computed across training epochs.' Table 11 reports 'mean and the standard deviation' with ± notation. Box plots with whiskers appear in Figures 6 and 7.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Table 10 reports Spearman's ρ with two-sided p-values and Kendall's τ for rank correlation. Table 11 reports Wilcoxon signed-rank test p-values and paired t-test p-values for masking ablation comparisons. Appendix J provides detailed statistical tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "The paper reports KL divergence magnitudes (e.g., routing divergence of 2.366 vs sequence divergence of 0.428 on MMLU in Table 8), relative comparisons ('5.5× higher routing KL divergence'), speedup ratios ('~3.5× speedup'), and absolute performance differences ('~1.4% performance gain').",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification is given for why 10 experts were chosen, why 100 held-out examples were used for hyperparameter tuning, or why 5 training epochs were run. No power analysis is mentioned.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Table 8 reports ± 95% CI across epochs. Table 11 reports mean ± standard deviation. Figure 11 states results are 'mean accuracy... over 3 runs.' Box plots in Figures 6 and 7 show spread across samples.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Table 3 compares against MetaGPT. Figures 9 and 10 compare against a Uniform Baseline, Relational-Only, and Intrinsic-Only variants. Table 9 includes individual model baselines (LLaMA 3.1 8B, Qwen3 8B, DeepSeek-R1 8B).",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "MetaGPT (Hong et al., 2024, ICLR 2024) is a contemporary multi-agent framework. Table 1 also positions against RouteLLM (Ong et al., 2025), IRT-Router (Song et al., 2025), and other recent systems.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 4.5 and Appendix F present extensive ablations: static collaboration graph (Figure 11a), static inference sequence (Figure 11b), masking intrinsically important experts (Table 8, Figure 8), and Relational-Only vs Intrinsic-Only routing variants (Figures 9, 10).",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper reports accuracy (GSM8K, MMLU), Pass@1 (HumanEval), KL divergence (routing and sequence), entropy (collaboration and sequence), Gini coefficient (centralization), Spearman's ρ, Kendall's τ, and gradient norms (intrinsic importance).",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation is performed. All evaluation of the orchestration system and its outputs is automated. The interpretability claims (e.g., whether INFORM reveals 'meaningful' structure) could benefit from human assessment but none is provided.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Section 4 states the orchestrator is 'evaluated on a held-out subset of the test set.' Appendix C states 'A held-out set of 100 examples were used to determine the weights of the loss terms,' separating tuning from evaluation.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are consistently broken down by task (GSM8K, HumanEval, MMLU), by training epoch (Tables 8, 10; Figures 3-8), and by expert (Figures 4, 5, 13, 14). Different tasks show qualitatively different behaviors.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Appendix H presents an extensive discussion of failure modes: hub over-centralization, routing-attribution misalignment, early commitment to suboptimal initializers, overconfidence under semantically damaged inputs, and redundancy masking structural dependence. Task-specific failure profiles are also analyzed.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Table 10 shows weak and statistically insignificant rank correlation between routing dominance and causal importance — a key negative finding. Relational-Only and Intrinsic-Only ablations show degraded performance (Figure 9). Appendix H documents systematic failure modes of the orchestrator.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model versions are given: 'LLaMA-3.1 8B', 'Qwen-3 8B', 'DeepSeek-R1 8B' (with 'distilled DeepSeek-R1-0528-Qwen3-8B variant' specified in Appendix D) for the homogeneous consortium, and 'LLaMA-3.2 1B', 'Qwen2.5 3B', 'Mistral 7B' for the heterogeneous one. Oracle: 'GPT OSS 20B' (Agarwal et al., 2025).",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "The full prompt template is provided in Appendix E, including the expert coordination instructions and template structure. The template shows the actual text sent to models, including role instructions and task formatting.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Table 4 provides comprehensive hyperparameters: learning rate, batch size, epochs, warmup ratio, gradient clipping, hidden dimension, attention heads, dropout, Gumbel-Softmax temperatures, and all loss coefficients. Tables 6 and 7 list per-expert decoding temperatures. Appendix D specifies top_p settings.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Section 2.1 describes the orchestration architecture in detail: the Interaction Module (routing adapter, conditional transition matrix with query-key attention and cosine similarity prior), the Selection Module (Gumbel-Softmax sampling with position penalty), and the multi-step expert chaining. Figure 1 provides a visual overview. Appendix C details the training objective.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 4 states 'initial 512 tokens generated by each expert is used as an input to the orchestrator.' Appendix C describes the held-out set used for tuning. The paper uses standard benchmark splits for GSM8K, HumanEval, and MMLU with standard evaluation protocols.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "No raw experimental data (routing matrices, gradient attributions, per-sample results) is released. Only aggregated statistics are reported in figures and tables. The underlying benchmarks are public, but the orchestrator outputs and intermediate analysis data are not available.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The data consists of standard public benchmarks (GSM8K, HumanEval, MMLU), which are well-documented in their respective papers. The paper describes how expert outputs are generated (512 initial tokens per expert) and how the orchestrator is evaluated on held-out test subsets.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. Data sources are standard public benchmarks.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from input to evaluation is documented: benchmarks → expert generation (512 tokens each, Tables 6-7 for decoding configs) → BERT encoder → orchestrator routing → expert chaining → evaluation. Appendix C details the training pipeline. Table 4 specifies all parameters.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff dates are stated for any of the models used (LLaMA 3.1, Qwen3, DeepSeek-R1, Mistral 7B, GPT OSS 20B). These models were likely trained on data that includes the benchmarks used for evaluation.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of potential train/test overlap. The expert LLMs may have been trained on GSM8K (2021), HumanEval (2021), and MMLU (2021) solutions, which could affect both individual expert performance and orchestration dynamics. This is not addressed.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "All three benchmarks (GSM8K, HumanEval, MMLU) were published in 2021, well before the training cutoffs of the 2024-2025 era models used. Benchmark contamination risk is significant and not discussed. If experts have memorized answers, the orchestration task is fundamentally different from what the paper analyzes.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Table 3 reports average model calls (1.44 for INFORM vs 5.00 for MetaGPT) and claims '~3.5× speedup,' but no actual wall-clock time, API costs, tokens consumed, or cost per example is reported. The model call count is a proxy, not an actual cost measurement.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "The paper states training was on 'a single NVIDIA A100 80GB GPU' (Appendix C) and lists 5 training epochs with batch size 2, but does not report total GPU hours, training wall-clock time, or total computational budget.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No explicit seed sensitivity analysis. Figure 11 mentions results are 'over 3 runs' for the ablation study, but the main results in Table 9 report 'best accuracy' values without showing sensitivity across random seeds.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "Figure 11 states 'over 3 runs' for ablation results. However, the main performance results in Table 9 report 'best accuracy' without stating how many runs produced them. Table 8 reports values 'computed across training epochs,' which is across epochs, not independent runs. Inconsistent across experiments.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": true,
    385           "justification": "Table 5 shows a hyperparameter sweep 'through magnitudes {10⁻², 10⁻¹, 10⁰}' for λ_oracle, λ_symmetry, and λ_sparsity. Appendix C states 'A held-out set of 100 examples were used to determine the weights of the loss terms.' The search method and budget are documented.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "Table 5 shows the sweep results, but Table 9 reports 'best accuracy' across training epochs without explaining the selection criterion (e.g., best validation epoch, last epoch, or cherry-picked). The 'best' label implies cherry-picking across epochs.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "Table 10 reports 15 correlation tests across tasks and epochs, and Table 11 reports multiple paired comparisons, all without correction for multiple comparisons (no Bonferroni, Holm, or Benjamini-Hochberg mentioned).",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors compare their INFORM orchestrator against MetaGPT using their own experimental setup (assigning same-size models to MetaGPT roles). No discussion of author-evaluation bias or the risk of inadvertently optimizing their system's presentation relative to the baseline.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": true,
    409           "justification": "Table 3 compares INFORM (1.44 avg calls) vs MetaGPT (5.00 avg calls) at matched expert capabilities. Figure 12 plots MMLU accuracy as a function of total parameters (consortium scaling), showing performance relative to monolith models at different scales.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper uses GSM8K, HumanEval, and MMLU without discussing whether these benchmarks actually measure what INFORM claims to analyze — i.e., whether orchestration dynamics on these tasks generalize to real-world multi-expert collaboration. No construct validity discussion is provided.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": true,
    420           "answer": false,
    421           "justification": "The MetaGPT comparison (Table 3) uses a different orchestration mechanism (SOPs vs learned routing) with the same underlying models, but the scaffold confound is not explicitly discussed. The different frameworks introduce confounds beyond routing policy (e.g., MetaGPT's role definitions, communication patterns) that could explain performance differences.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "Not discussed. All three benchmarks (GSM8K, HumanEval, MMLU, all 2021) predate the training-data cutoffs of the models used (LLaMA 3.1, Qwen3, DeepSeek-R1, all 2024-2025). Solutions to these benchmarks are widely available online and are likely present in the models' training data.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "Not discussed. The initial 512 tokens generated by each expert could contain memorized solutions that leak answer information to the orchestrator. If experts have seen benchmark problems during training, the orchestration task changes fundamentally.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "Not discussed. No analysis of whether evaluation examples share structural properties with training data or whether benchmark problems are independent of each other.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination is mentioned.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Routing dominance is a poor proxy for functional necessity: rank correlation between routing mass and gradient-based importance is weak and mostly statistically insignificant across tasks and epochs.",
    456       "evidence": "Table 10 shows Spearman's ρ ranging from 0.152 to 0.648 across tasks and epochs, with most p-values > 0.1; only one epoch on MMLU achieves p=0.043.",
    457       "supported": "moderate"
    458     },
    459     {
    460       "claim": "Masking the single most intrinsically important expert on MMLU induces ~5.5× higher routing KL divergence than sequencing KL divergence, confirming interaction hub dependence.",
    461       "evidence": "Table 8 shows MMLU routing KL = 2.366 ± 0.497 vs sequence KL = 0.428 ± 0.072 (ratio ≈ 5.5); GSM8K also shows routing > sequencing divergence.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Orchestration behaviors emerge asynchronously: expert centralization increases before routing confidence stabilizes, indicating the system learns 'who to trust' before 'how confidently to route.'",
    466       "evidence": "Figures 6a and 6c show Gini coefficient rising before routing entropy fully decreases across tasks; the decoupling is described quantitatively but without a formal statistical test.",
    467       "supported": "moderate"
    468     },
    469     {
    470       "claim": "INFORM achieves better performance than MetaGPT on HumanEval (87.1 vs 85.9 pass@1) with 3.5× fewer model calls (1.44 vs 5.00 average calls).",
    471       "evidence": "Table 3 directly reports these numbers; however, the comparison uses role-specific MetaGPT with same-sized models, which may not represent MetaGPT's intended use case.",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Intrinsic expert importance is sparse and task-dependent: only a small subset of experts consistently exert causal influence, and which experts matter differs by task.",
    476       "evidence": "Figures 4 and 13 show sparse attribution heatmaps across epochs for all three tasks; qualitative consistency is clear but cross-task comparison relies on visual inspection.",
    477       "supported": "moderate"
    478     },
    479     {
    480       "claim": "The full INFORM model (combining relational and intrinsic signals) outperforms relational-only and intrinsic-only variants on all three benchmarks.",
    481       "evidence": "Figure 9 shows consistent performance ordering (full > intrinsic-only > relational-only > uniform baseline) across GSM8K, MMLU, and HumanEval, though no error bars are shown.",
    482       "supported": "weak"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval",
    487     "observational",
    488     "ablation-study"
    489   ],
    490   "key_findings": "INFORM reveals a systematic divergence between relational importance (routing frequency/mass) and intrinsic importance (gradient-based causal attribution) in learned multi-expert LLM orchestration: frequently selected experts often function as interaction hubs with limited causal influence, while sparsely routed experts can be structurally critical. Orchestration dynamics emerge asynchronously—expert centralization precedes routing confidence stabilization—and expert ordering remains non-deterministic at inference time. Targeted masking of intrinsically important experts induces disproportionate routing collapse (up to 5.5× greater KL divergence than sequencing disruption on MMLU), validating that gradient attribution captures functional necessity invisible to accuracy metrics alone. The heterogeneous expert consortium shows slower, more volatile convergence than the homogeneous setting, indicating that mixed-capacity orchestration is harder to stabilize.",
    491   "red_flags": [
    492     {
    493       "flag": "No code released",
    494       "detail": "The custom INFORM orchestrator architecture is described mathematically but no implementation is released, making reproduction dependent entirely on reimplementing from equations and hyperparameter tables."
    495     },
    496     {
    497       "flag": "Weak statistical support for central claim",
    498       "detail": "The core claim that routing dominance misrepresents causal importance rests on rank correlations (Table 10) that are statistically insignificant in 14 of 15 task-epoch combinations; the one significant result (MMLU Epoch 2, p=0.043) does not replicate across epochs."
    499     },
    500     {
    501       "flag": "No variance on main performance results",
    502       "detail": "Table 9 performance values (MMLU, GSM8K, HumanEval accuracy/pass@1) are single-point estimates; Figure 11 caption mentions 'mean over 3 runs' but no standard deviations are shown, obscuring reliability of the comparisons."
    503     },
    504     {
    505       "flag": "Benchmark contamination not addressed",
    506       "detail": "GSM8K, HumanEval, and MMLU all predate the training cutoffs of LLaMA 3.1, Qwen3, and DeepSeek-R1; the paper does not discuss potential contamination, which could inflate performance baselines."
    507     },
    508     {
    509       "flag": "Single orchestrator architecture tested",
    510       "detail": "All interpretability findings derive from one custom BERT-based orchestrator with Gumbel-Softmax routing; whether the routing-attribution divergence is a general phenomenon or an artifact of this specific design is not tested."
    511     },
    512     {
    513       "flag": "No limitations section",
    514       "detail": "There is no dedicated limitations or threats-to-validity section; methodological caveats are dispersed across a FAQ appendix and not synthesized into a coherent assessment of where findings may not hold."
    515     },
    516     {
    517       "flag": "Inconsistent MetaGPT comparison",
    518       "detail": "The MetaGPT comparison (Table 3) uses INFORM at what appears to be Epoch 4 (87.1 pass@1 matches Table 9 Epoch 4), while Epoch 5 achieves 88.4, suggesting the comparison may not use the best-performing INFORM checkpoint."
    519     }
    520   ],
    521   "cited_papers": [
    522     {
    523       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    524       "relevance": "Multi-agent orchestration framework; used as context for the problem of opaque coordination policies"
    525     },
    526     {
    527       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    528       "relevance": "Direct baseline comparison for multi-expert coordination efficiency; represents rigid, structured orchestration"
    529     },
    530     {
    531       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    532       "relevance": "Cascade-based orchestration baseline; INFORM is applied to interpret FrugalGPT-style systems in Appendix I"
    533     },
    534     {
    535       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    536       "relevance": "Contemporary learned routing approach compared in Table 1; represents performance-driven routing without interpretability"
    537     },
    538     {
    539       "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity",
    540       "relevance": "Foundational MoE routing work; motivates the distinction between selection frequency and causal importance"
    541     },
    542     {
    543       "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
    544       "relevance": "Output aggregation approach contrasted with INFORM's sequential orchestration analysis"
    545     },
    546     {
    547       "title": "On the Resilience of LLM-Based Multi-Agent Collaboration with Faulty Agents",
    548       "relevance": "Related work on failure propagation in multi-agent systems; motivates need for interpretability of expert influence"
    549     },
    550     {
    551       "title": "ChatDev: Communicative Agents for Software Development",
    552       "relevance": "Multi-agent coding system representing role-based orchestration; context for the multi-expert collaboration paradigm"
    553     },
    554     {
    555       "title": "Evaluating Large Language Models Trained on Code",
    556       "relevance": "HumanEval benchmark paper; primary evaluation dataset for code generation in this study"
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 1,
    562       "justification": "The INFORM framework could help practitioners debug multi-agent LLM systems, but requires white-box access and significant adaptation to apply beyond the specific orchestrator studied."
    563     },
    564     "surprise_contrarian": {
    565       "score": 2,
    566       "justification": "The core finding that routing frequency diverges from causal importance — popular experts aren't necessarily important ones — is a genuinely counterintuitive insight for anyone building multi-agent systems."
    567     },
    568     "fear_safety": {
    569       "score": 0,
    570       "justification": "No safety, security, or misuse concerns are raised; the paper is purely about interpretability of routing mechanisms."
    571     },
    572     "drama_conflict": {
    573       "score": 1,
    574       "justification": "Mildly questions the opacity of existing multi-agent frameworks like MetaGPT and AutoGen, but does so academically without any pointed controversy."
    575     },
    576     "demo_ability": {
    577       "score": 0,
    578       "justification": "No code release, no demo, no tool — purely an analytical framework described in a paper with no public implementation mentioned."
    579     },
    580     "brand_recognition": {
    581       "score": 0,
    582       "justification": "From IIT Delhi and DRDO — recognized in India but not household names in the global tech/AI community; no famous product or lab involved."
    583     }
    584   },
    585   "hn_data": {
    586     "threads": [],
    587     "top_points": 0,
    588     "total_points": 0,
    589     "total_comments": 0
    590   }
    591 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs