scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28266B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Disentangling Causal Importance from Emergent Structure in Multi-Expert Orchestration",
      6     "authors": [
      7       "Sudipto Ghosh",
      8       "Sujoy Nath",
      9       "Sunny Manchanda",
     10       "Tanmoy Chakraborty"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.04291",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's core claim — routing dominance diverges from intrinsic importance, with 5.5× routing KL divergence vs. sequencing KL on MMLU — is directly supported by Table 8 (KL Routing 2.366±0.497 vs. KL Sequence 0.428±0.072) and Table 10.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Targeted masking ablations (removing specific experts at inference time) constitute intervention-based causal analysis; the FAQ explicitly qualifies that gradient attribution captures functional dependence rather than formal causal graphs, which is an appropriate caveat.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Conclusions like 'routing dominance is a poor proxy for functional necessity' are stated as general principles for multi-expert systems but are derived from a single orchestrator architecture (BERT encoder + attention routing + oracle distillation); applicability to other orchestration designs is asserted but not demonstrated.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The divergence between routing frequency and gradient attribution could be an artifact of the oracle distillation training objective or the BERT encoder bottleneck; these alternative explanations are not systematically considered.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes gradient-based attribution (influence on routing decisions) from expert correctness; FAQ Q3 states 'intrinsic importance measures how strongly an expert's representation influences the orchestrator's decisions, not the semantic quality or correctness of outputs.'",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations or threats-to-validity section exists in the main paper; Appendix H discusses system failure modes but frames these as orchestration behavior patterns rather than limitations of the INFORM framework itself.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The FAQ addresses methodological interpretation questions (Q1: INFORM is not formal causality) but does not frame these as threats to the validity of specific empirical claims; no specific threats are enumerated.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "FAQ Q7 notes INFORM requires white-box access and cannot be applied to API-based systems, but the paper does not explicitly bound when its empirical findings apply — which orchestrator types, scales, or tasks generalize.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment is present in the paper; affiliations are IIT Delhi and DRDO but no grant support or funding sources are disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly stated: IIT Delhi (Yardi School of AI, Dept. of Electrical Engineering) and DRDO Young Scientist Laboratory – Artificial Intelligence.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are precisely defined: 'relational importance' (total incoming routing mass uj(x)), 'intrinsic importance' (gradient norm of log P(Ei|x) w.r.t. hi), 'routing dominance,' 'collaboration matrix,' and 'orchestration' are all formally defined in Section 2.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "INFORM as an interpretability framework is clearly framed with four explicit research questions (RQ1-RQ4), three primary insights, and a comparison table (Table 1) positioning the contribution against prior work.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper engages substantively with MoE routing, multi-agent systems (MetaGPT, AutoGen, ChatDev), LLM routing (RouteLLM, FrugalGPT, IRT-Router), and interpretability research, using Table 1 to explicitly position INFORM against each.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No code repository or release link is mentioned; the INFORM framework is described architecturally but no implementation is provided.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All three evaluation benchmarks (GSM8K, HumanEval, MMLU) are standard publicly available datasets used without modification.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Only hardware is mentioned ('single NVIDIA A100 80GB GPU'); no software environment, Python version, PyTorch/CUDA version, or requirements file is provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided; Table 4 lists hyperparameters but without code or executable procedures to reconstruct orchestrator training and evaluation.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Table 8 reports '± 95% CI' for KL divergence values across epochs; Table 11 reports mean ± standard deviation for routing collapse comparisons.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Table 10 reports Spearman ρ with two-sided p-values; Table 11 uses both Wilcoxon signed-rank test and paired t-test to compare masking intrinsically important vs. frequently routed experts.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "KL divergence magnitudes, Gini coefficients, performance differences ('+9.0% on MMLU for Intrinsic-Only,' '~1.4% gain with ~3.5× speedup' in Table 3), and rank correlations are reported throughout.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The 'held-out subset of the test set' used for evaluation is never sized; rank correlations in Table 10 are computed over only N=10 experts, which is underpowered for meaningful significance testing, and no justification is given.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Table 8 reports ± 95% CI; Table 11 reports standard deviations; Figure 11 notes results are 'mean... over 3 runs,' though SDs are not shown for all figures.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Baselines include: Uniform Baseline (uniform transitions), Relational-Only, Intrinsic-Only, MetaGPT (Table 3), individual single-model baselines (Table 9), and static collaboration/sequencing ablations (Figure 11).",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "MetaGPT (2024), RouteLLM (2025), and FrugalGPT are recent systems; individual baselines use current models (LLaMA 3.1, Qwen3, DeepSeek-R1).",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Extensive ablations: static collaboration matrix, static execution sequence, masking most intrinsically important expert, relational-only routing, intrinsic-only routing, and oracle alignment (Appendix F, Section 4.5).",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Metrics include: accuracy (MMLU/GSM8K), Pass@1 (HumanEval), KL divergence, collaboration entropy, sequence entropy, Gini coefficient (centralization), Spearman/Kendall rank correlation, and average model calls.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "The paper evaluates automated benchmarks and system-level routing behavior; human evaluation is not applicable to this interpretability analysis of orchestration mechanisms.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The paper explicitly states 'the orchestrator evaluated on a held-out subset of the test set' for all three benchmarks.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by task (MMLU, HumanEval, GSM8K) throughout the paper, with separate figures and tables for each task revealing meaningfully different dynamics (e.g., HumanEval shows the opposite pattern from GSM8K/MMLU: sequencing divergence exceeds routing divergence).",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Appendix H extensively documents five failure modes: hub over-centralization, routing-attribution misalignment, early commitment to suboptimal initializers, overconfidence under damaged inputs, and redundancy masking structural dependence.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that HumanEval shows the opposite pattern from GSM8K/MMLU (sequencing divergence > routing divergence); Table 10 shows mostly non-significant rank correlations; FAQ Q8 notes masking important experts does not always reduce task performance.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model versions: LLaMA-3.1 8B, Qwen-3 8B, DeepSeek-R1-0528-Qwen3-8B (distilled variant), LLaMA-3.2 1B, Qwen2.5 3B, Mistral 7B, GPT-OSS-20B oracle (Appendix D).",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Appendix E shows only the structural template header ('Expert 1's Response: ...') without providing the complete prompt, task-specific instructions, or fill values — insufficient to reproduce prompt behavior.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Table 4 provides comprehensive hyperparameters: learning rate, batch size, training epochs, warmup ratio, hidden dimension, attention heads, dropout, Gumbel-Softmax temperature schedule, and all eight loss coefficient weights.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The orchestrator architecture is described in detail: frozen BERT encoder (768-dim), routing adapter, query-key attention for C(x), Gumbel-Softmax selection module P(Ei|x), adaptive-top-k sparsity, and 8-term composite training objective with mathematical formulations.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "Only 'initial 512 tokens generated by each expert is used as input to the orchestrator' is mentioned; how benchmark inputs are formatted, tokenized, or filtered for the orchestrator is not documented.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The collaboration matrices, attribution scores, and routing statistics generated during experiments are not released; only the standard input benchmarks are publicly available.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The process of extracting collaboration matrices C(x), selection distributions s(x), and gradient-based attribution scores I(Ei) is described mathematically in Section 2.2 with sufficient detail to understand what was measured.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; standard benchmark datasets were used.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "While training objectives and probing methodology are described, the full pipeline — how benchmark examples are batched, how routing statistics are aggregated across samples, how held-out splits are constructed — is not documented reproducibly.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training cutoffs for the expert models (LLaMA 3.1, Qwen3, DeepSeek-R1, Mistral) are not stated; GSM8K, HumanEval, and MMLU are well-established benchmarks that may have been included in these models' pretraining data.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "Potential overlap between training data of expert models and evaluation benchmarks is not discussed, despite all three benchmarks predating the training cutoffs of the models used.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Benchmark contamination is not addressed; MMLU, HumanEval, and GSM8K all predate the LLaMA 3.1, Qwen3, and DeepSeek-R1 models, and inflated performance due to memorization is not discussed.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Table 3 reports average model calls per inference (INFORM: 1.44 vs. MetaGPT: 5.00 on HumanEval), providing a direct measure of inference cost efficiency.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Only hardware is mentioned ('single NVIDIA A100 80GB GPU'); total training time, GPU-hours, or dollar cost is not reported.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Routing dominance is a poor proxy for functional necessity: rank correlation between routing mass and intrinsic attribution is weak and unstable across training epochs.",
    374       "evidence": "Table 10 shows Spearman ρ ranging from 0.152 to 0.648 across tasks/epochs, with most p-values exceeding 0.3 (non-significant at N=10 experts); visual comparison of Figures 4 and 5 also shows clear misalignment.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Masking the most intrinsically important expert on MMLU induces 5.5× higher routing KL divergence than sequencing KL divergence.",
    379       "evidence": "Table 8 directly reports KL(Routing)=2.366±0.497 and KL(Sequence)=0.428±0.072 for MMLU (ratio ~5.5×); this is also cited verbatim in the abstract.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Orchestration behaviors emerge asynchronously: expert centralization precedes stable routing confidence during training.",
    384       "evidence": "Figure 6 shows Gini coefficient (centralization) increasing in early epochs while collaboration entropy continues to decrease across all three tasks; this decoupling is observed consistently.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "INFORM achieves 87.1% Pass@1 on HumanEval with 1.44 average model calls vs. MetaGPT's 85.9% with 5.00 calls (~3.5× efficiency gain).",
    389       "evidence": "Table 3 directly reports these numbers; however comparison is only on HumanEval against one baseline system with role-specific rather than task-optimized experts.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "A homogeneous consortium of 8B experts surpasses Qwen3-70B accuracy on MMLU while activating 8.75× fewer parameters.",
    394       "evidence": "Figure 12 shows the consortium crossing the 86% accuracy threshold at ~16B total parameters; however this is shown on MMLU only and with a single expert family (Qwen3-8B).",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "Intrinsic expert importance is sparse and task-dependent: different experts dominate gradient attribution across GSM8K, HumanEval, and MMLU.",
    399       "evidence": "Figures 4 and 13 show sparse heatmaps with different experts having high gradient norms across tasks; this is described as a primary finding in the abstract.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "observational",
    406     "case-study"
    407   ],
    408   "key_findings": "INFORM reveals a systematic divergence between routing frequency and causal necessity in learned multi-expert LLM orchestration: rank correlations between routing mass and gradient-based attribution are weak and unstable (mostly ρ<0.4, non-significant), while masking the highest-attributed expert on MMLU induces 5.5× higher routing KL divergence than sequencing KL divergence (Table 8: 2.366 vs. 0.428). Orchestration dynamics emerge asynchronously — expert centralization develops before routing confidence stabilizes, suggesting the system learns who to trust before learning how confidently to route. Task-dependent profiles show HumanEval is dominated by initialization sensitivity while GSM8K/MMLU rely on interaction hub stability. These findings suggest that accuracy metrics alone are insufficient to diagnose brittleness or redundancy in multi-expert systems.",
    409   "red_flags": [
    410     {
    411       "flag": "Single orchestrator architecture",
    412       "detail": "All findings derive from one specific design (BERT encoder + attention routing + oracle distillation from GPT-OSS-20B). Claims about 'multi-expert systems' broadly are not validated across diverse orchestration architectures; oracle distillation may create routing patterns uncharacteristic of task-loss-only training."
    413     },
    414     {
    415       "flag": "Causal terminology overuse",
    416       "detail": "The paper uses 'causal attribution' and 'causal importance' throughout, but FAQ Q1 explicitly states INFORM 'does not claim to recover causal structure in the sense of formal causal graphs or interventional guarantees.' Gradient sensitivity is a local correlational measure, not causal identification."
    417     },
    418     {
    419       "flag": "Underpowered rank correlation tests",
    420       "detail": "Table 10 computes rank correlations over N=10 experts per task/epoch. With N=10, Spearman ρ must exceed ~0.63 for p<0.05 (two-tailed). Most reported correlations are non-significant, yet strong claims about divergence are drawn from this analysis."
    421     },
    422     {
    423       "flag": "No code released",
    424       "detail": "Despite being framed as a practical diagnostic tool for practitioners (FAQ Q11), no code or model weights are released, making independent verification and practical adoption impossible."
    425     },
    426     {
    427       "flag": "Held-out evaluation set size unreported",
    428       "detail": "The 'held-out subset of the test set' used for all attribution and routing analyses is never sized, making it impossible to assess statistical validity of the entropy, centralization, and gradient attribution measurements."
    429     },
    430     {
    431       "flag": "Benchmark contamination unaddressed",
    432       "detail": "All three benchmarks (MMLU, HumanEval, GSM8K) predate the training cutoffs of LLaMA 3.1, Qwen3, and DeepSeek-R1; the paper does not discuss whether high expert performance reflects genuine reasoning or benchmark memorization, which could affect which experts appear 'important'."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    438       "relevance": "Primary baseline for rigid role-based multi-agent orchestration; used in efficiency comparison (Table 3) and represents the structured-workflow paradigm that INFORM's learned routing outperforms."
    439     },
    440     {
    441       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    442       "relevance": "Key multi-agent framework representing conversational agent orchestration; positioned as low-interpretability in landscape Table 1."
    443     },
    444     {
    445       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    446       "relevance": "Cascade-based LLM routing; INFORM is applied to this architecture in Appendix I to demonstrate generalizability of interpretability principles."
    447     },
    448     {
    449       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    450       "relevance": "Contemporary LLM routing from preference data; used in landscape comparison Table 1 and extended related work."
    451     },
    452     {
    453       "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
    454       "relevance": "Output aggregation approach contrasted with INFORM's sequential interaction-based analysis; represents the order-invariant paradigm INFORM critiques."
    455     },
    456     {
    457       "title": "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer",
    458       "relevance": "Foundational MoE routing work; INFORM positions itself as going beyond implicit routing optimization to expose causal expert dependencies."
    459     },
    460     {
    461       "title": "Can Dependencies Induced by LLM-Agent Workflows Be Trusted?",
    462       "relevance": "Related work on trust and causal dependencies in LLM-agent workflows; directly relevant to INFORM's focus on whether orchestration dependencies are genuine."
    463     },
    464     {
    465       "title": "On the Resilience of LLM-Based Multi-Agent Collaboration with Faulty Agents",
    466       "relevance": "Failure propagation in multi-agent systems; motivates INFORM's focus on diagnosing structural dependencies before they manifest as accuracy drops."
    467     },
    468     {
    469       "title": "IRT-Router: Effective and Interpretable Multi-LLM Routing via Item Response Theory",
    470       "relevance": "Interpretable LLM routing method using item response theory; used as the high-interpretability comparison point in Table 1."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "INFORM provides actionable diagnostic insights for practitioners designing multi-expert systems (centralization monitoring, attribution-routing alignment), but the absence of released code limits immediate applicability."
    477     },
    478     "surprise_contrarian": {
    479       "score": 3,
    480       "justification": "The core finding — that routing frequency systematically diverges from causal necessity, and that popular 'hub' experts can be functionally dispensable — directly contradicts the common assumption that routing statistics reflect what the system actually depends on."
    481     },
    482     "fear_safety": {
    483       "score": 1,
    484       "justification": "The paper mentions opaque orchestration as a safety concern for high-stakes and tool-augmented deployments, but does not analyze safety-critical scenarios or failure consequences in depth."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Standard academic positioning against existing frameworks (MetaGPT, AutoGen, FrugalGPT) without significant controversy or community conflict."
    489     },
    490     "demo_ability": {
    491       "score": 1,
    492       "justification": "No code is released and INFORM requires white-box orchestrator access; practitioners cannot easily try this without re-implementing the full training pipeline."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "IIT Delhi is a recognized institution and DRDO is notable in the Indian defense research context, but this is not from a prominent Western AI lab (DeepMind, OpenAI, Meta AI, Google)."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [],
    501     "top_points": 0,
    502     "total_points": 0,
    503     "total_comments": 0
    504   }
    505 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs