scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25304B)
      1 {
      2   "paper": {
      3     "title": "OPTIMA: Optimizing Effectiveness and Efficiency for LLM-Based Multi-Agent System",
      4     "authors": ["Weize Chen", "Jiarui Yuan", "Chen Qian", "Cheng Yang", "Zhiyuan Liu", "Maosong Sun"],
      5     "year": 2024,
      6     "venue": "Annual Meeting of the Association for Computational Linguistics",
      7     "arxiv_id": "2410.08115",
      8     "doi": "10.48550/arXiv.2410.08115"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "OPTIMA, an iterative generate-rank-select-train framework for LLM-based multi-agent systems, achieves up to 2.8x performance gain with less than 10% of tokens on information-exchange tasks using Llama 3 8B/3.2 3B. The framework integrates MCTS-inspired DPO data generation and demonstrates cross-task transfer (e.g., MATH to GSM8k). OPTIMA's token efficiency enables improved inference-time scaling laws, matching CoT self-consistency performance with 88.5% fewer tokens on GSM8k.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks: HotpotQA, 2WikiMultiHopQA, TriviaQA, CBT, GSM8K, MATH, ARC-C, MMLU. All are standard public datasets."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions 8 A100 GPUs and Llama 3 8B / 3.2 3B but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions or scripts are provided. The appendix contains pseudo-code and hyperparameters but not executable reproduction instructions."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1, 2, 6, 7 report only point estimates with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims OPTIMA 'outperforms' baselines but provides no statistical significance tests — comparisons are based solely on comparing numbers."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports effect sizes in context: '38.3% improvement (2.8x)' on 2WMHQA, '90% reduction in token usage', '88.5% fewer tokens'. Baseline numbers are provided alongside for context (Table 1)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the size of the evaluation sets is provided, nor any power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance across seeds, or spread measures are reported in any results table. All appear to be single-run numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against CoT, Self-Consistency (n=8), Multi-Agent Debate (MAD), and AutoForm (Table 1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include AutoForm (Chen et al., 2024c) and MAD (Du et al., 2024), which are contemporary multi-agent approaches."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 presents ablation on reward components (removing #Tokens regularization and LM Loss), and the paper compares three OPTIMA variants (iSFT, iDPO, iSFT-DPO) which serve as ablations of the training strategy."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports both task performance (F1/accuracy) and token count (#Tok) across all experiments. Different tasks use F1 and accuracy as appropriate."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated via F1/accuracy metrics. Given claims about communication readability and quality, human evaluation would be relevant."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses standard benchmark test sets (HotpotQA, GSM8K, MATH, etc.) which are separate from training data. Transfer experiments (Table 2) evaluate on entirely different datasets."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides per-task breakdown across 8 benchmarks. Results are shown separately for information exchange and debate settings."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses cases where OPTIMA underperforms: 'on MATH and GSM8k, OPTIMA variants show comparable or slightly lower performance than SC' (Section 3.1). Transfer limitations are also discussed: 'transferring to more distant domains remains challenging, e.g., we find it hard to transfer from MATH to ARC-C' (Section 3.2)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that iDPO sometimes has performance trade-offs despite token reductions, that transfer from MATH to ARC-C fails, and that removing LM loss leads to unreadable outputs (Table 3, Section 3.4)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'up to 2.8x performance gain with less than 10% tokens' which is supported by Table 1 (2WMHQA: iSFT-DPO F1 74.2 vs MAD 25.9, tokens 54.9 vs 543.7). Other claims about inference-time scaling are supported by Figure 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims via ablation studies (Table 3: removing reward components shows their individual effects). The iterative training progression (Figure 1) and controlled ablations constitute adequate causal evidence for component contributions."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Optimizing Effectiveness and Efficiency for LLM-Based Multi-Agent System' broadly, but results are on Llama 3 8B/3.2 3B only, almost entirely in two-agent scenarios (a single limited three-agent experiment appears in Table 7, Appendix G), on specific QA/reasoning benchmarks. The paper mentions this limitation but the title and abstract frame it as a general MAS solution."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not substantively discuss alternative explanations for the improvements. For example, it does not consider whether the gains come primarily from the format diversity initialization rather than the iterative training, or whether the improvements are specific to the task-model combinations tested."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly distinguishes its measurements (F1, accuracy, token count) from broader claims. It explicitly discusses the relationship between token efficiency and compute utilization, and frames benchmark scores appropriately as task-specific metrics."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper specifies 'Llama 3 8B / 3.2 3B' (Meta, 2024) as the base models. These are specific model sizes. GPT-4 is mentioned for format pool generation (Table 12) without a version, but GPT-4 is not the evaluated model."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Tables 9-13 in the appendix, including prompts for information exchange, GSM8k/MATH, ARC-C/MMLU, format pool generation, and a format pool example."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 8 provides comprehensive hyperparameters for all OPTIMA variants across all 8 tasks, including learning rates, epochs, batch sizes, λ values, β, α, and threshold parameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent communication scaffold is described in detail: two-agent conversation with turn-based communication, format specification prompts, solver/critic roles for debate tasks. Algorithms 1-5 provide pseudo-code for the full pipeline."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3 describes how multi-hop datasets are split between agents ('we split relevant contexts between two agents'), how TriviaQA/CBT contexts are randomly assigned, and how the format pool was created (Table 12). The data selection pipeline (generate, rank, select with thresholds) is detailed."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "There is a dedicated 'Limitations' section after the conclusion with substantive discussion of multiple limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section discusses specific threats: 'experiments primarily focus on two-agent scenarios with a shared model architecture', 'transferring OPTIMA-trained models to substantially different application areas... remains unexplored', and 'real-world deployment scenarios may involve additional constraints'."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The Limitations section explicitly states what was NOT tested: scaling to 5-10 agents, heterogeneous agent configurations, cross-domain transfer (e.g., QA to coding), and real-world deployment constraints."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (model outputs, generated trajectories, reward scores) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data generation process is thoroughly described: trajectory sampling with N samples per instance, reward computation, selection criteria with thresholds, and MCTS-based DPO data generation (Sections 2.2-2.5, Algorithms 1-5)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The paper uses standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented: initialization with format pool → trajectory generation → reward ranking → top-70% selection → training → iteration. Each step is formalized with equations and pseudo-code."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section is visible in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Tsinghua University and Beijing University of Posts and Telecommunications. No commercial product is being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses Llama 3 8B / 3.2 3B but does not state the training data cutoff date for these models."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the Llama 3 models were pre-trained on any of the benchmark datasets (HotpotQA, GSM8K, MATH, etc. are all public and could appear in training data)."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "All benchmarks used (HotpotQA 2018, GSM8K 2021, MATH 2021, ARC, MMLU) were published well before Llama 3's training. No contamination analysis is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Token counts are reported for all methods across all tasks in Table 1, and inference token scaling is analyzed in Figure 3 and Section 3.3."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 3 states: 'Iterative training completes within 12 hours on 8 A100 GPUs for most tasks, except MATH, which takes around 24 hours.'"
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No results across multiple random seeds are reported. All results appear to be single-run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs producing the reported results is not stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Table 8 reports final hyperparameters but no information about how they were selected or how many configurations were tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No explanation of how the hyperparameters in Table 8 were selected. The paper does not discuss whether selection was on validation or test data."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparing multiple methods across 8 benchmarks."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement OPTIMA and all baselines (MAD, AutoForm) but do not acknowledge the bias of evaluating their own system against their own baseline implementations."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 3 (right panel) explicitly plots performance as a function of token usage (compute proxy) for various methods, and Section 3.3 analyzes inference-time scaling laws."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the selected benchmarks actually measure multi-agent collaboration effectiveness or whether simpler explanations (e.g., better single-agent fine-tuning) could account for the results."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "OPTIMA changes both the training procedure and the communication scaffold simultaneously. The paper does not isolate whether the gains come from the training or from the structured communication formats introduced during initialization."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "All benchmarks (HotpotQA 2018, MATH 2021, GSM8K 2021, etc.) predate Llama 3 (2024). No discussion of temporal leakage risk."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "In the information exchange setting, contexts are split between agents. No discussion of whether this splitting could leak information (e.g., partial answers visible in context)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of train/test independence. Training data for OPTIMA is generated from the same benchmark distributions used for evaluation."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "OPTIMA achieves up to 2.8x performance gain with less than 10% of tokens on information-asymmetric QA tasks.",
    365       "evidence": "Table 1: On 2WMHQA, iSFT-DPO achieves F1 74.2 vs MAD's 25.9 (2.8x), using 54.9 tokens vs MAD's 543.7 (≈10.1%, i.e. about 10% rather than strictly less than 10%). Section 3.1.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "OPTIMA-trained models transfer effectively across tasks within the same domain.",
    370       "evidence": "Table 2: Models trained on HotpotQA transfer to 2WMHQA (iSFT F1 56.5 vs MAD 25.9) and TriviaQA. MATH-trained iDPO achieves 77.9% on GSM8k vs direct MAD 72.5%. Section 3.2.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "OPTIMA improves inference-time scaling laws, matching CoT-SC with 88.5% fewer tokens.",
    375       "evidence": "Figure 3 right panel: GSM8k-iDPO matches CoT-SC accuracy with 88.5% fewer tokens. Section 3.3.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The reward function components (token count, LM loss) each contribute meaningfully to the framework.",
    380       "evidence": "Table 3: Removing token count increases tokens 2.7-6.3x; removing LM loss decreases performance by up to 10.7 F1 points. Section 3.4.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "OPTIMA scales to three-agent scenarios.",
    385       "evidence": "Table 7 in Appendix G shows results for three agents, but performance is worse than two-agent settings for IE tasks. Limited to one task per setting.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No variance or statistical tests",
    392       "detail": "All results are reported as point estimates across 8 benchmarks and 7+ methods. No error bars, standard deviations, or significance tests are provided. Without knowing variance, it is impossible to assess whether differences are meaningful."
    393     },
    394     {
    395       "flag": "Contamination risk unaddressed",
    396       "detail": "All benchmarks predate Llama 3's training data cutoff. The base model may have memorized benchmark answers, which could interact differently with the OPTIMA training than with baselines. This is especially concerning since OPTIMA fine-tunes on benchmark-specific data."
    397     },
    398     {
    399       "flag": "No code release",
    400       "detail": "Despite extensive algorithmic detail, no code is released. The framework involves many implementation decisions (MCTS parameters, edit distance thresholds, etc.) that are hard to reproduce from pseudo-code alone."
    401     },
    402     {
    403       "flag": "Self-comparison bias",
    404       "detail": "The authors implement all baselines (MAD, AutoForm) themselves. Lucic et al. (2018) showed that authors' baseline implementations systematically underperform. No independent evaluation or use of official baseline implementations is mentioned."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Beyond natural language: LLMs leveraging alternative formats for enhanced reasoning and communication",
    410       "authors": ["Weize Chen", "Chenfei Yuan", "Jiarui Yuan"],
    411       "year": 2024,
    412       "arxiv_id": "2402.18439",
    413       "relevance": "AutoForm baseline; explores non-natural-language communication formats for LLM agents."
    414     },
    415     {
    416       "title": "Improving factuality and reasoning in language models through multiagent debate",
    417       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"],
    418       "year": 2024,
    419       "relevance": "Multi-Agent Debate (MAD) baseline; foundational work on LLM multi-agent debate."
    420     },
    421     {
    422       "title": "Direct preference optimization: Your language model is secretly a reward model",
    423       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    424       "year": 2023,
    425       "relevance": "DPO algorithm used as a core training method in OPTIMA."
    426     },
    427     {
    428       "title": "Self-consistency improves chain of thought reasoning in language models",
    429       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    430       "year": 2023,
    431       "relevance": "Self-consistency baseline and inference-time scaling technique used in evaluation."
    432     },
    433     {
    434       "title": "ChatDev: Communicative agents for software development",
    435       "authors": ["Chen Qian"],
    436       "year": 2024,
    437       "relevance": "LLM-based multi-agent system for software development with role-based communication."
    438     },
    439     {
    440       "title": "AgentVerse: Facilitating multi-agent collaboration and exploring emergent behaviors",
    441       "authors": ["Weize Chen"],
    442       "year": 2024,
    443       "relevance": "Multi-agent collaboration framework with role-playing for reasoning."
    444     },
    445     {
    446       "title": "An empirical analysis of compute-optimal inference for problem-solving with language models",
    447       "authors": ["Yangzhen Wu", "Zhiqing Sun"],
    448       "year": 2024,
    449       "arxiv_id": "2408.00724",
    450       "relevance": "Inference-time compute scaling analysis directly relevant to OPTIMA's scaling claims."
    451     },
    452     {
    453       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    454       "authors": ["Bradley Brown"],
    455       "year": 2024,
    456       "arxiv_id": "2407.21787",
    457       "relevance": "Inference-time scaling via repeated sampling, directly compared to OPTIMA's efficiency approach."
    458     },
    459     {
    460       "title": "Iterative reasoning preference optimization",
    461       "authors": ["Richard Yuanzhe Pang", "Weizhe Yuan"],
    462       "year": 2024,
    463       "arxiv_id": "2404.19733",
    464       "relevance": "Iterative DPO/RPO method that OPTIMA builds upon for its iDPO variant."
    465     },
    466     {
    467       "title": "Scaling large-language-model-based multi-agent collaboration",
    468       "authors": ["Chen Qian", "Zihao Xie"],
    469       "year": 2024,
    470       "arxiv_id": "2406.07155",
    471       "relevance": "Addresses MAS scaling challenges that OPTIMA aims to solve."
    472     },
    473     {
    474       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    475       "authors": ["Sirui Hong"],
    476       "year": 2024,
    477       "relevance": "Prominent multi-agent framework for software development."
    478     },
    479     {
    480       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation framework",
    481       "authors": ["Qingyun Wu"],
    482       "year": 2023,
    483       "arxiv_id": "2308.08155",
    484       "relevance": "Multi-agent conversation framework widely used for LLM-based MAS."
    485     }
    486   ]
    487 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs