scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21074B)
      1 {
      2   "paper": {
      3     "title": "Automated Design of Agentic Systems",
      4     "authors": ["Shengran Hu", "Cong Lu", "Jeff Clune"],
      5     "year": 2025,
      6     "venue": "ICLR 2025",
      7     "arxiv_id": "2408.08435"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'All code is open-sourced at https://github.com/ShengranHu/ADAS.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks (ARC, DROP, MGSM, MMLU, GPQA, GSM8K, GSM-Hard). The sampled validation/test splits for ARC are described but it is unclear if the exact splits are released; however the benchmarks themselves are public."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned in the paper. The paper mentions using GPT-4 and GPT-3.5 APIs but does not specify library versions or dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper references open-sourced code but does not include step-by-step reproduction instructions in the paper itself. Algorithmic details are in appendices but specific commands or scripts to replicate experiments are not described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Tables 1, 2, and 3 all report 95% bootstrap confidence intervals (e.g., '79.4 ± 0.8'). Figure 3a shows 95% bootstrap confidence intervals."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No formal significance tests (p-values, t-tests, etc.) are reported. Comparisons rely on overlapping/non-overlapping confidence intervals but no explicit statistical tests are conducted."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute improvements with baseline context, e.g., 'improve F1 scores on reading comprehension tasks in DROP by 13.6/100' and 'accuracy rates on math tasks in MGSM by 14.4%', with baseline values visible in tables."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The ARC experiment uses only 20 validation and 60 test questions sampled from the easy subset. No justification for these small sample sizes is provided, nor is a power analysis discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Results are evaluated 5 times and reported with 95% bootstrap confidence intervals, which implicitly convey variance. Figure 3a shows median accuracy with confidence intervals across runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Seven hand-designed baselines are compared: Chain-of-Thought, COT-SC, Self-Refine, LLM Debate, Step-back Abstraction, Quality-Diversity, Role Assignment, plus OPRO for prompt optimization."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent methods like Self-Refine (2024), LLM Debate (2023), OPRO (2024), and Quality-Diversity (2024). These represent current state-of-the-art agentic design patterns."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is presented that systematically removes components of Meta Agent Search (e.g., the archive, self-reflection steps, novelty encouragement) to measure their individual contributions."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses F1 score for DROP and accuracy for MGSM, MMLU, GPQA, GSM8K, GSM-Hard, and ARC — multiple metrics across different domains."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the discovered agents' outputs or designs is reported. All evaluation is automated via benchmark metrics."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper explicitly separates validation and test sets. For ARC: '20 and 60 questions, respectively, for searching and testing.' Results are reported on held-out test sets."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by domain (Reading Comprehension, Math, Multi-task, Science) in Table 1, and by transfer target in Table 2, and by model in Table 3."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses where improvements are smaller: 'While Meta Agent Search also outperforms baselines in the Multi-task and Science domains, the gap is smaller' and hypothesizes why (FM knowledge insufficient for hard science questions)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every experiment shows improvement over baselines. No failed configurations, search runs that did not converge, or approaches that were tried and abandoned are reported."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that discovered agents 'greatly outperform state-of-the-art hand-designed agents' and 'maintain superior performance even when transferred across domains and models.' Tables 1-3 support these claims with quantitative results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper implicitly makes causal claims that Meta Agent Search causes the performance improvements, but the search uses the same validation data for evaluation and selection. No ablation isolates which aspects of the algorithm drive improvements vs. simply more compute/API calls."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'Automated Design of Agentic Systems' is broad, but experiments are limited to single-step QA tasks using GPT-3.5/GPT-4. The paper acknowledges this in future work ('Currently, we only evaluate Meta Agent Search on single-step QA tasks') but the title and abstract do not bound claims to this setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations such as whether the improvements come from simply using more API calls (discovered agents make multiple FM queries vs. single-query baselines), or whether prompt optimization with equal compute budget would close the gap."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'GPT-4 (OpenAI, 2024)' and 'GPT-3.5 (OpenAI, 2022)' without specifying exact model versions (e.g., gpt-4-0613, gpt-3.5-turbo-0125). Claude-Haiku and Claude-Sonnet are also used without version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper states 'The prompt and more details are presented in Appendix B' and provides framework code in Appendix C. The meta agent prompt and baseline implementations are provided in appendices."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, or other sampling parameters are reported for the LLM API calls. The number of iterations (25 for ARC, 30 for other domains) is stated but API-level hyperparameters are missing."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 3 describes the meta agent search framework in detail: the archive mechanism, self-reflection steps, evaluation loop, and code-based agent definition. Appendices provide pseudocode and framework code."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "For ARC: 'we sample our data from questions with grid dimensions ≤5×5 in the Public Training Set (Easy). We sample a validation set and a test set with 20 and 60 questions.' Dataset details for other benchmarks reference appendices."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The Discussion and Conclusion section (Section 6) includes safety considerations and extensive future work discussion that implicitly acknowledges current limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The future work section lists extensions but does not address specific methodological concerns like small sample sizes, compute fairness of comparisons, or potential overfitting to validation sets."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The future work section explicitly states 'Currently, we only evaluate Meta Agent Search on single-step QA tasks in this paper' and lists several dimensions not tested (multi-step tasks, multi-objective optimization, etc.)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental logs, per-question results, or intermediate search archives are made available. Only aggregate results in tables are shown."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data sources are well-described: publicly available benchmarks with specific sampling criteria (ARC grid dimensions ≤5×5 from Public Training Set Easy, standard splits for other benchmarks)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved; this is a benchmark evaluation study."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from sampling ARC questions to evaluation is described. The meta agent search process (iterate, evaluate, add to archive) is documented with pseudocode in Appendix H."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments list funding: 'Vector Institute, the Canada CIFAR AI Chairs program, grants from Schmidt Futures and Open Philanthropy, an NSERC Discovery Grant, and a generous donation from Rafael Cosman.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: University of British Columbia, Vector Institute, Canada CIFAR AI Chair."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders (Vector Institute, CIFAR, Schmidt Futures, Open Philanthropy, NSERC) are research-oriented organizations without a direct financial stake in whether Meta Agent Search outperforms baselines."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses GPT-4 and GPT-3.5 on benchmarks like MMLU, GSM8K, and DROP without stating the models' training data cutoff dates."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether benchmark examples (especially MMLU, GSM8K, DROP) appeared in GPT-4/GPT-3.5 training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Benchmarks like MMLU (2021), GSM8K (2021), and DROP (2019) were published before GPT-4's training cutoff. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, tokens consumed, or wall-clock time per experiment are reported. The discovered agents make multiple LLM calls per example but the cost is not quantified."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total API spend or compute budget is stated. The paper runs 25-30 iterations of meta agent search with GPT-4 plus evaluation of discovered agents, but total cost is not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Meta Agent Search discovers agents that substantially outperform state-of-the-art hand-designed agents across multiple domains.",
    286       "evidence": "Table 1 shows improvements of 13.6 F1 on DROP and 14.4% accuracy on MGSM over the best baselines. Tables 2 and 3 show consistent outperformance in transfer settings.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Discovered agents transfer well across both domains and models.",
    291       "evidence": "Table 2 shows agents discovered on MGSM outperform baselines when transferred to GSM8K (+25.9%), GSM-Hard (+13.2%), and non-math domains. Table 3 shows GPT-3.5-discovered agents outperform baselines on Claude-Haiku, GPT-4, and Claude-Sonnet.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Code-based search space is superior to prompt-only optimization for discovering agentic systems.",
    296       "evidence": "Table 1 compares Meta Agent Search against OPRO (prompt optimization). Meta Agent Search outperforms OPRO across all four domains.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Meta Agent Search progressively discovers better agents through an archive of stepping stones.",
    301       "evidence": "Figure 3a shows progressive improvement over 25 iterations on ARC, with annotations showing how design patterns build on previous discoveries.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "Meta Agent Search, an algorithm that uses a meta agent to iteratively program new agentic systems in code, discovers agents that outperform hand-designed baselines across reading comprehension (DROP +13.6 F1), math (MGSM +14.4%), and other domains. The discovered agents transfer successfully across both domains (math to reading comprehension) and models (GPT-3.5 to Claude-Sonnet), suggesting the search discovers generalizable design patterns rather than task-specific tricks. The code-based search space outperforms prompt-only optimization (OPRO) across all tested domains.",
    307   "red_flags": [
    308     {
    309       "flag": "Compute fairness not addressed",
    310       "detail": "Discovered agents typically make multiple LLM calls per example (e.g., 5 COTs + critics + refinement), while baselines like Chain-of-Thought use a single call. No cost-controlled comparison is provided, so improvements may partly reflect increased compute rather than better design."
    311     },
    312     {
    313       "flag": "No ablation study",
    314       "detail": "The contribution of individual components of Meta Agent Search (archive, self-reflection, novelty encouragement) is not isolated. It is unclear what drives the algorithm's success."
    315     },
    316     {
    317       "flag": "Small and filtered evaluation set for ARC",
    318       "detail": "ARC experiments use only 60 test questions sampled from the easiest subset (grid dimensions ≤5×5), which may not represent the full ARC challenge difficulty."
    319     },
    320     {
    321       "flag": "Benchmark contamination risk unaddressed",
    322       "detail": "Multiple benchmarks (MMLU, GSM8K, DROP) predate GPT-4's training cutoff. No contamination analysis is provided despite this being a known concern for these benchmarks."
    323     },
    324     {
    325       "flag": "No cost reporting",
    326       "detail": "The total cost of running Meta Agent Search (25-30 iterations with GPT-4 as meta agent + evaluation of all discovered agents) is not reported, making practical feasibility difficult to assess."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    332       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"],
    333       "year": 2024,
    334       "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks."
    335     },
    336     {
    337       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    338       "authors": ["Jason Wei"],
    339       "year": 2022,
    340       "relevance": "Foundational prompting technique used as baseline and building block in agentic systems."
    341     },
    342     {
    343       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    344       "authors": ["Noah Shinn"],
    345       "year": 2023,
    346       "relevance": "Self-reflection technique for LLM agents, used as baseline in this paper."
    347     },
    348     {
    349       "title": "DSPy: Compiling Declarative Language Model Calls into State-of-the-Art Pipelines",
    350       "authors": ["Omar Khattab"],
    351       "year": 2024,
    352       "relevance": "Framework for optimizing LLM pipelines, an alternative ADAS approach discussed as related work."
    353     },
    354     {
    355       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    356       "authors": ["Cong Lu"],
    357       "year": 2024,
    358       "relevance": "Automated research pipeline using LLMs, closely related to automated agent design."
    359     },
    360     {
    361       "title": "Mathematical Discoveries from Program Search with Large Language Models",
    362       "authors": ["Bernardino Romera-Paredes"],
    363       "year": 2024,
    364       "relevance": "Uses FMs to discover optimization algorithms in code, same paradigm as ADAS code-space search."
    365     },
    366     {
    367       "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
    368       "authors": ["Yilun Du"],
    369       "year": 2023,
    370       "relevance": "Multi-agent debate technique used as baseline, relevant to agentic system design patterns."
    371     },
    372     {
    373       "title": "Large Language Models as Optimizers",
    374       "authors": ["Chengrun Yang"],
    375       "year": 2024,
    376       "relevance": "OPRO prompt optimization method used as baseline comparison for ADAS."
    377     },
    378     {
    379       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    380       "authors": ["Sirui Hong"],
    381       "year": 2023,
    382       "relevance": "Multi-agent framework incorporating organizational structures, relevant to agentic system design."
    383     },
    384     {
    385       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    386       "authors": ["Guanzhi Wang"],
    387       "year": 2023,
    388       "relevance": "Embodied agent that develops new skills in code, related to code-based agent design."
    389     }
    390   ]
    391 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs