scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24825B)
      1 {
      2   "paper": {
      3     "title": "Traceable Latent Variable Discovery Based on Multi-Agent Collaboration",
      4     "authors": ["Huaming Du", "Tao Hu", "Yijie Huang", "Yu Zhao", "Guisong Liu", "Tao Gu", "Gang Kou", "Carl Yang"],
      5     "year": 2026,
      6     "venue": "WWW '26",
      7     "arxiv_id": "2602.14456",
      8     "doi": "10.1145/3774904.3792244"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "Code is available at https://github.com/HYJ9999/TLVD.git as stated in Appendix A."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The WCHSU hospital datasets are not publicly available ('newly collected and has not been made publicly available online'). The two benchmark datasets come from prior work, but the paper provides neither download links for them nor its own processed versions."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper mentions 'NVIDIA GeForce A6000 GPU with 48GB of memory' but does not provide requirements.txt, Dockerfile, or detailed library version specifications."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Tables 2, 3, 5, 6 all report results with ± notation (e.g., '0.833±0.000')."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims TLVD outperforms baselines but provides no statistical significance tests (no p-values, t-tests, etc.). Comparisons are based solely on comparing means."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports percentage improvements with baseline context, e.g., '131.68% improvement over MiniMax' and 'average improvements of 32.67% in Acc, 62.21% in CAcc, and 26.72% in ECit' (Section 4.2, Abstract)."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for why 5 runs with different random seeds were chosen, or why the specific dataset sizes were used."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Standard deviations are reported across 5 runs in all main results tables (Tables 2, 3, 5, 6)."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 4.1.2 lists 10 baselines across four categories: single LLMs (GPT-5), deep research agents, multi-agent platforms, and multi-LLM reasoning frameworks."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include GPT-5, Gemini-deepresearch, OpenAI-deepresearch, WideSearch, and other recent systems from 2024-2025."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Section 4.4 presents ablation studies with 6 variants (TLVD-v, TLVD-d, TLVD-I, TLVD-R1, TLVD-R2, TLVD-R3) testing different components and reward signals."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Three evaluation metrics are used: ACC, CAcc, and ECit (defined in Appendix A)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "For the WCHSU datasets lacking ground truth, 'five experts from relevant fields at Asia's largest hospital' performed consistency evaluation of inferred latent variables (Appendix A)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No discussion of train/test split for the datasets. The paper evaluates on entire datasets with no mention of held-out test sets."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per dataset (Tables 2, 3, 5), per model configuration (Table 4), and per data source (Figure 6)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.7.2 provides detailed failure attribution analysis using the MAST taxonomy across 100 execution traces, identifying three main failure categories (Figure 7)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4.5 shows that performance declines when using more than 6 execution LLMs, and heterogeneous models perform worse than homogeneous ones (Table 4). Ablation variants show degraded performance."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of '32.67% in Acc, 62.21% in CAcc, and 26.72% in ECit' improvements are supported by Tables 3 and 5 across five datasets."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims about component contributions are supported by controlled ablation studies (Section 4.4) with single-variable manipulation. The paper also provides theoretical analysis (Section 3.2.3) for convergence."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title and framework are presented as general ('Traceable Latent Variable Discovery') but results are only on 5 datasets (3 medical, 2 social science). No discussion of generalizability boundaries to other domains."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not discuss alternative explanations for why TLVD outperforms baselines. For instance, the game-theoretic framework's advantages are asserted but not compared against simpler coordination mechanisms with equal computational budget."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "ACC is determined by expert consensus for WCHSU datasets, but the paper does not discuss the gap between expert agreement as a proxy and actual causal correctness. CAcc and ECit measure evidence retrieval quality, not causal validity."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper mentions 'GPT-5', 'LLaMA3.1 8B', 'LLaMA3.1 70B', 'Qwen2.5 7B/72B', 'GPT-oss-120B', 'DeepSeek-v3' but does not specify exact version snapshots or API dates for any model."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper describes what queries are sent to the LLMs (e.g., Figure 5 shows a sample query) but does not provide the full system prompts, coordinator prompts, or executor prompt templates used."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 4.1.3 reports: episodes=100, buffer size=32, optimizer=Adam, learning rate=0.001, discount factor=0.99, entity dimension=256, belief state dimension=128, MLP hidden size=256, λ_m=0.1."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The multi-agent architecture is described in detail: coordinator LLM, executor LLMs, belief networks, mixing network, reward design (Section 3.2). Workflow diagrams in Figures 2-4."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix A describes how WCHSU-Cancer (n=22) was curated by scoring variable correlation using LLMs with scores 0-5 and filtering >5, and WCHSU-Cancer (n=12) was screened by lung cancer specialists. De-identification and IRB approval mentioned."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations section in the paper. The conclusion (Section 5) contains no limitations discussion."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed anywhere in the paper."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not state what the results do NOT show or what settings/domains are excluded from the claims."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "WCHSU datasets are not publicly available. The benchmark datasets are from prior work but the paper does not provide its processed versions."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Appendix A describes WCHSU-Cancer as coming from a hospital health management center with 200,000 participants for lung cancer screening, and WCHSU-Pain from the same hospital with 1,568 patients' perioperative data."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "For the expert evaluation, '5 experts from relevant fields' are mentioned but how they were recruited and selected is not described. For the hospital data, patient selection criteria are not detailed."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from raw data to latent variable discovery is documented: variable selection (LLM scoring or expert screening), causal graph construction (RLCD), multi-agent inference, and web-based validation (Section 3)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 6 (Acknowledgements) lists multiple funding sources including NSFC, Xiangjiang Laboratory, China Postdoctoral Science Foundation, and Sichuan Science and Technology Program."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All author affiliations are listed: Southwestern University of Finance and Economics, Hunan University of Technology and Business, Xiangjiang Laboratory, and Emory University."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funders are Chinese government agencies and academic foundations with no apparent financial stake in the specific outcomes of latent variable discovery performance."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses GPT-5, LLaMA3.1, Qwen2.5, and others but does not state their training data cutoff dates."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The paper notes 'the WCHSU dataset is newly collected and has not been made publicly available online, making it unlikely to have been used in training any existing LLMs, and therefore posing no risk of data leakage' (Section 4.1.1). For benchmark datasets, Appendix D acknowledges they 'might have been included in the training data of LLMs.'"
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Appendix D explicitly acknowledges that benchmark datasets may be in LLM training data and argues the comparison is still fair since all baselines are also LLM-based."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in the study. Expert evaluation was for ground-truth construction, not a human subjects study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "IRB approval is mentioned for the patient dataset usage, but there are no human participants in the study itself. The IRB covers data use, not human experimentation."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in the study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Figure 10 shows token consumption across different modules and sample sizes for the method and baselines."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper mentions 'NVIDIA GeForce A6000 GPU with 48GB' but does not state total GPU hours, training time, or API costs."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Section 4.1.1: 'we conduct five experiments with different random seeds and report the average performance.' Standard deviations reported in all tables."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Section 4.1.1 explicitly states 'five experiments with different random seeds.'"
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Hyperparameters are reported but no search budget (number of configurations tried, search method) is stated."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The paper does not explain how the reported hyperparameter configuration was selected or whether it was tuned on a validation set."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No statistical tests are performed at all, let alone multiple comparison corrections."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors compare their system against baselines without acknowledging potential bias from implementing/configuring the baselines themselves."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Token consumption is shown in Figure 10 but performance is not plotted as a function of compute budget. No matched-compute comparisons are made."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper does not discuss whether ACC (expert-consensus-based) actually measures causal correctness, or whether the benchmark datasets are valid proxies for real-world latent variable discovery."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "TLVD uses a complex scaffold (belief networks, mixing network, coordinator) while baselines use their own different scaffolds. The scaffold contribution vs. the game-theoretic coordination is not isolated."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper addresses temporal leakage for WCHSU by noting it is newly collected and not publicly available. For benchmark datasets, the fairness argument (all baselines are LLM-based) partially addresses this."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information to the LLMs beyond what would be available in real usage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the benchmark datasets' variables or structure could overlap with LLM training data in ways that create non-independence."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection method is applied. The WCHSU novelty argument is a prevention method, but no detection is applied to the benchmark datasets."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "TLVD achieves average improvements of 32.67% in Acc, 62.21% in CAcc, and 26.72% in ECit across five datasets compared to baselines.",
    363       "evidence": "Tables 2, 3, and 5 show TLVD outperforming all baselines on the WCHSU-Cancer, WCHSU-Pain, Multitasking Behaviour Study, and Teacher's Burnout Study datasets (Section 4.2 and Appendix D).",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "TLVD achieves performance improvements with fewer communication tokens compared to CAMEL, Multi-Agent Debate, and Multi-Agent Majority.",
    368       "evidence": "Figure 10 shows token consumption comparison. Section 4.2 attributes efficiency to the belief network design.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Heterogeneous execution LLMs perform worse than homogeneous ones due to increased difficulty in reaching BNE.",
    373       "evidence": "Table 4 shows homogeneous LLaMA3.1 70B achieving 0.833 ACC vs heterogeneous LLaMA3.1 70B + Qwen2.5 72B achieving 0.500 ACC on WCHSU-Cancer (n=12).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "The multi-agent collaboration framework converges to Bayesian Nash Equilibrium with sublinear regret O(N√T/(1-γ)).",
    378       "evidence": "Theoretical proof in Section 3.2.3 and Appendices B-C, relying on Glicksberg's Fixed Point Theorem and standard stochastic approximation theory.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "ArXiv data sources contribute more to TLVD performance than Wikipedia or databases for medical domains.",
    383       "evidence": "Figure 6 shows removing arXiv (W/O ARR) causes the largest performance drop (ACC from 0.80 to 0.25) on WCHSU-Pain.",
    384       "supported": "strong"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "TLVD integrates causal discovery algorithms with multi-LLM collaboration modeled as a Bayesian Nash Equilibrium game to discover latent variables and their semantics, then validates them via web evidence retrieval. On 5 datasets (3 medical, 2 social science), TLVD substantially outperforms single LLMs, deep research agents, and multi-agent baselines. The framework shows that game-theoretic coordination of LLM agents can improve causal reasoning compared to simpler multi-agent strategies like debate or majority voting. Failure analysis using MAST taxonomy reveals specification issues and task verification as primary failure modes.",
    389   "red_flags": [
    390     {
    391       "flag": "No limitations section",
    392       "detail": "The paper has no limitations, threats to validity, or scope boundaries discussion despite making broad claims about a novel framework."
    393     },
    394     {
    395       "flag": "Expert-consensus ground truth for WCHSU",
    396       "detail": "ACC on the main WCHSU datasets is determined by 5 experts' consensus, but no inter-rater reliability metrics are reported, and expert selection process is not described. This makes the primary evaluation metric difficult to verify."
    397     },
    398     {
    399       "flag": "No statistical significance tests",
    400       "detail": "Despite comparing against 10 baselines with claims of superiority, no statistical tests are reported. For some methods, the reported mean ± standard deviation ranges overlap."
    401     },
    402     {
    403       "flag": "Scaffold confound",
    404       "detail": "TLVD uses a sophisticated scaffold (belief networks, mixing network, RL training) while baselines use their native implementations. Performance gains could be due to the scaffold rather than the game-theoretic principles claimed."
    405     },
    406     {
    407       "flag": "Suspiciously large improvements",
    408       "detail": "Claims of 131.68% and 124.30% improvement over some baselines, while many baselines score 0.000 on CAcc and ECit, suggest potential issues with baseline configuration or metric design rather than genuine methodological superiority."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "GPT-4 technical report",
    414       "authors": ["Josh Achiam"],
    415       "year": 2023,
    416       "arxiv_id": "2303.08774",
    417       "relevance": "Foundational LLM used as baseline and component in AI agent systems."
    418     },
    419     {
    420       "title": "DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning",
    421       "authors": ["Daya Guo"],
    422       "year": 2025,
    423       "relevance": "RL-based LLM reasoning improvement relevant to AI capability evaluation."
    424     },
    425     {
    426       "title": "A survey on hallucination in large language models",
    427       "authors": ["Lei Huang"],
    428       "year": 2025,
    429       "relevance": "Survey on LLM hallucination — key limitation addressed by multi-agent frameworks."
    430     },
    431     {
    432       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    433       "authors": ["Sirui Hong"],
    434       "year": 2024,
    435       "relevance": "Multi-agent LLM framework for software engineering tasks."
    436     },
    437     {
    438       "title": "CAMEL: Communicative agents for mind exploration of large language model society",
    439       "authors": ["Guohao Li"],
    440       "year": 2023,
    441       "relevance": "Multi-agent LLM collaboration framework used as baseline."
    442     },
    443     {
    444       "title": "More Agents Is All You Need",
    445       "authors": ["Junyou Li"],
    446       "year": 2024,
    447       "relevance": "Study on scaling multi-agent LLM systems, used as baseline (Multi-Agent Majority)."
    448     },
    449     {
    450       "title": "Improving factuality and reasoning in language models through multiagent debate",
    451       "authors": ["Yilun Du"],
    452       "year": 2024,
    453       "relevance": "Multi-agent debate framework for LLM reasoning, used as baseline."
    454     },
    455     {
    456       "title": "Autogen: Enabling next-gen LLM applications via multi-agent conversations",
    457       "authors": ["Qingyun Wu"],
    458       "year": 2024,
    459       "relevance": "Multi-agent LLM platform used as baseline."
    460     },
    461     {
    462       "title": "From Debate to Equilibrium: Belief-Driven Multi-Agent LLM Reasoning via Bayesian Nash Equilibrium",
    463       "authors": ["Xie Yi"],
    464       "year": 2025,
    465       "relevance": "Game-theoretic multi-agent LLM reasoning framework that this paper builds upon."
    466     },
    467     {
    468       "title": "Why do multi-agent LLM systems fail?",
    469       "authors": ["Mert Cemri"],
    470       "year": 2025,
    471       "arxiv_id": "2503.13657",
    472       "relevance": "Failure taxonomy for multi-agent LLM systems used in this paper's error analysis."
    473     },
    474     {
    475       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs",
    476       "authors": ["Yujia Qin"],
    477       "year": 2024,
    478       "relevance": "LLM-based tool use and agent framework."
    479     },
    480     {
    481       "title": "A survey on large language model based autonomous agents",
    482       "authors": ["Lei Wang"],
    483       "year": 2024,
    484       "relevance": "Survey of LLM-based autonomous agents relevant to agentic AI research."
    485     }
    486   ]
    487 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs