scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32628B)
      1 {
      2   "paper": {
      3     "title": "The AI Scientist-v2: Workshop-Level Automated Scientific Discovery via Agentic Tree Search",
      4     "authors": [
      5       "Yutaro Yamada",
      6       "Robert Tjarko Lange",
      7       "Cong Lu",
      8       "Shengran Hu",
      9       "Chris Lu",
     10       "Jakob Foerster",
     11       "Jeff Clune",
     12       "David Ha"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2504.08066",
     17     "doi": "10.48550/arXiv.2504.08066"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper explicitly states: 'We have open-sourced the code at https://github.com/SakanaAI/AI-Scientist-v2' (abstract/Section 1). Additionally, links to experiment-specific GitHub repositories are provided for each of the three workshop submissions (Appendix C)."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper states they 'open-source the full codebase for The AI Scientist-v2 and the ICLR 2025 workshop experiment data' (Section 1, contribution 4). The workshop experiment repositories are linked in Appendix C. The datasets used in the AI-generated papers are public benchmarks (synthetic arithmetic, CIFAR-10, MNIST, Kaggle pest dataset)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specification is provided in the paper. The supplementary material lists models and hyperparameters (Tables 2-3) but not software dependencies or library versions needed to reproduce the system."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "While code is released on GitHub, the paper itself does not contain step-by-step reproduction instructions. There is no 'Reproducing Results' section or detailed README-style instructions in the paper for running the system end-to-end."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The main evaluation is based on three workshop submissions, with one accepted. No confidence intervals or error bars are reported for the system-level evaluation. The AI-generated papers internally use standard deviations across runs (Section 3.2.2 mentions replication nodes for mean and std), but the paper's own evaluation of the AI Scientist-v2 system provides no uncertainty quantification."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims the system can produce 'workshop-level' papers based on one out of three submissions being accepted (score 6.33 vs. acceptance threshold). No statistical test is used to assess whether this result is significantly different from chance or to compare against baselines."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports specific reviewer scores (6, 7, 6 for the accepted paper, average 6.33), contextualizes this as 'roughly in the top 45% of submissions,' and compares against the workshop acceptance threshold. While not a formal effect size metric, it provides enough context (baseline of average human acceptance threshold) to understand magnitude."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only three manuscripts were submitted to the workshop, and only one was accepted. The paper acknowledges this is a small sample ('Our current study aims to see whether The AI Scientist-v2 can produce at least one paper that survives peer review, and not what fraction of the time it can do so') but does not provide a formal justification for why N=3 is sufficient for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance across multiple system runs is reported. The paper mentions running multiple seeds per idea and selecting the best, but does not report the distribution of quality across seeds. Only the selected best outputs are described."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares The AI Scientist-v2 against its predecessor, The AI Scientist-v1, via a feature comparison table (Table 1). The key baseline is that v1 was 'Not Submitted' to peer review while v2 achieved workshop acceptance. The internal author reviews also compare against workshop and conference quality standards."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "While Section 6 (Related Work) discusses contemporary systems like AIDE, Agent Laboratory, CycleResearcher, and AI-Researcher, no direct experimental comparison is made against any of these systems. The only comparison is with the predecessor v1, and it is qualitative (Table 1 feature comparison), not quantitative."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The system has multiple novel components (agentic tree search, experiment manager, VLM reviewer, template-free codebase). No ablation study is conducted to determine which components contribute most to the quality improvement over v1. There is no experiment removing the VLM reviewer, the tree search, or the experiment manager individually."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The evaluation uses multiple dimensions: (1) official peer review scores from the workshop, (2) detailed internal author reviews with multiple scoring dimensions (soundness, presentation, contribution, overall workshop/conference scores), and (3) code reviews of the generated papers. Each of the three papers is assessed along these multiple criteria."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The core evaluation involves blind peer review at the ICLR 2025 ICBINB workshop (Section 4.1). Three manuscripts were reviewed by workshop reviewers who did not know which were AI-generated. Additionally, the authors conducted their own detailed internal review of all three manuscripts (Section 4.2)."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "This is not a benchmark evaluation paper in the traditional sense. The evaluation is via peer review submission, which is inherently a held-out evaluation (reviewers had not seen the papers before), but the concept of train/test split does not apply to the system-level evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down per paper: each of the three submissions receives individual reviewer scores, detailed internal reviews (Appendix C.1-C.3), and code reviews. The internal reviews provide per-dimension scores (soundness, presentation, contribution, overall). Per-reviewer scores are also reported for the accepted paper."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper extensively discusses failure cases: two of three submissions were rejected, and the authors provide detailed analyses of weaknesses in all three papers, including inaccurate figure captions, hallucinated citations, dataset overlap issues (57% train-test overlap), contradictions between figures and text claims, and missing references. The code reviews (C.1.2, C.2.3, C.3.3) are particularly thorough in documenting failures."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports multiple negative findings: two of three papers were rejected; the system produces inaccurate citations and hallucinations; figure captions sometimes contradict the actual figures; the system 'does not yet consistently reach the rigorous standard required for top-tier conference publications' (Section 5); code review found 57% train-test overlap in the accepted paper's experiments."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims are supported: the system produced 'the first entirely AI-generated peer-review-accepted workshop paper' — this is documented with specific scores (6.33 average, individual 6/7/6). The claims of eliminating template dependency, using tree search, and incorporating VLM feedback are all described in detail in Section 3. The abstract appropriately hedges with 'workshop-level' rather than claiming conference-level quality."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes implicit causal claims that the v2 improvements (tree search, VLM feedback, template removal) led to the ability to produce workshop-quality papers. However, no controlled experiment isolates the contribution of each component. The comparison with v1 is confounded by many simultaneous changes. The paper states v2 'eliminates reliance on human-authored code templates' and 'leverages a novel progressive agentic tree-search methodology' as causes of improvement, but these are not experimentally validated."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper is generally careful about bounding its claims. Section 5 explicitly states: 'the acceptance occurred at a workshop level rather than at the main conference track, and only one of the three AI-generated submissions was accepted.' It notes workshop acceptance rates (60-80%) vs conference rates (20-30%) and states the system 'does not yet consistently reach the rigorous standard required for top-tier conference publications.' The title itself bounds the claim to 'Workshop-Level.'"
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper discusses several alternative explanations and confounds: that workshop acceptance rates are much higher than conference rates (Section 5), that the accepted paper's 100% attention model accuracy was due to task simplicity not model quality (Code Review C.1.2), that the best paper was selected from multiple seeds (selection bias), and that the acceptance may reflect the ICBINB workshop's specific focus on negative results rather than general quality."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 2 in Appendix A specifies the models used: 'Claude 3.5 Sonnet (v2)' for code generation, 'GPT-4o' for LLM/VLM feedback agents and summary report agent. While 'GPT-4o' lacks a specific snapshot date, 'Claude 3.5 Sonnet (v2)' is a specific version identifier. The writeup mentions 'o1' for the reflection stage. This is borderline but provides more specificity than just saying 'Claude' or 'GPT-4.'"
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompts are provided in Appendix B, covering idea generation, experiment prompts, plot aggregation, writeup prompts, writeup reflection, VLM reflection, and VLM image review prompts. Some contain template placeholders (e.g., {tool_descriptions}, {idea_text}), but the actual prompt structure and instructions are provided in full. The fill values are contextual (e.g., experiment summaries) rather than tuned parameters."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 2 reports LLM/VLM hyperparameters including max tokens and temperature for each component. Table 3 reports tree search hyperparameters including debug probability, maximum debug depth, maximum experiment runtime per node, and node allocation per stage. Temperature values are explicitly stated (0.5 for code generation, 0.5 for feedback, 1.0 for summary)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The agentic scaffolding is described in substantial detail: the experiment progress manager's four stages (Section 3.2.1), the parallelized agentic tree search with node types (Section 3.2.2), the VLM reviewer pipeline (Section 3.4), the idea generation workflow (Section 3.1), and the writeup/reflection pipeline. Figure 1 provides a workflow diagram and Figure 2 illustrates the tree search stages."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper documents the process: idea generation produced ~40 ideas, from which three were selected based on alignment with workshop theme. For each selected idea, multiple complete manuscripts were generated with different random seeds, and the best was selected for submission (Section 4.2). The pest detection dataset was manually reduced to 1/10th size (noted in Appendix C.3.1). The data pipeline for the system itself is well-documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 is titled 'Limitations & Ethical Considerations' and provides substantive discussion of the system's limitations, including workshop vs. conference quality gaps, challenges in generating novel hypotheses, and the inability to consistently produce even workshop-level papers."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 discusses specific threats: workshop acceptance rates (60-80%) are much higher than conferences (20-30%), making the achievement less impressive than it might seem; only one of three submissions was accepted; the system 'does not yet consistently reach the rigorous standard required for top-tier conference publications'; specific weaknesses like inability to formulate 'genuinely novel, high-impact hypotheses' or 'truly innovative experimental methodologies.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states what it does not show: 'the current version of The AI Scientist-v2 does not yet consistently reach the rigorous standard required for top-tier conference publications, nor does it even reach workshop-level consistently' (Section 5). It also notes the study only addresses whether the system 'can produce at least one paper that survives peer review, and not what fraction of the time it can do so' (Section 4.2)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The full AI-generated manuscripts are included in Appendix C with annotations. The workshop reviewer comments are reproduced (two of three reviewers granted permission). The experiment code repositories are linked on GitHub. The internal reviews and code reviews are provided in full."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper describes how the three manuscripts were generated: broad topical prompts aligned with the ICBINB workshop scope, idea generation producing ~40 ideas, selection of three ideas, running each through the full pipeline with multiple seeds, and selection of the best manuscript per idea for submission (Section 4.1-4.2)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper describes the arrangement with ICLR leadership and workshop organizers: reviewers were informed some submissions might be AI-generated, offered opt-out, and the AI-generated papers were included among 43 total submissions. Post-review, reviewers were informed of the experiment. IRB approval was obtained (H24-02652). Section 4.1 describes the controlled evaluation steps."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline from idea generation to submission is documented: idea generation phase produced ~40 ideas (Section 4.2), three selected for full runs, multiple seeds per idea, best manuscript selected per idea, submitted to blind peer review among 43 total submissions. Each stage of the AI Scientist-v2 pipeline is described (Sections 3.1-3.4)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No explicit funding disclosure or acknowledgments section is present in the paper. Author affiliations include Sakana AI (a company), University of British Columbia, Vector Institute, and FLAIR/Oxford. Jeff Clune is listed as a Canada CIFAR AI Chair. No specific grants or funding sources are mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Sakana AI (four authors), University of British Columbia, Vector Institute, FLAIR/University of Oxford, and Canada CIFAR AI Chair. The detailed author contributions section (page 19) specifies each person's role."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Sakana AI is the primary affiliation for four of eight authors, and the AI Scientist is Sakana AI's product. The company has a direct commercial interest in demonstrating that their AI Scientist system can produce peer-reviewed research. The funder is not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure is provided. Multiple authors are affiliated with Sakana AI, the company behind the AI Scientist system. No declaration of patents, equity, or other financial interests is present."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It evaluates an agentic system's ability to produce scientific manuscripts. The relevant evaluation is by human peer review, not by benchmarking model knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable for the same reason: the system is not being evaluated on a benchmark where training data contamination would be relevant. The peer review process is the evaluation mechanism."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable: the evaluation is via peer review, not benchmark performance. The AI-generated papers internally use benchmarks, but the paper's own evaluation of the AI Scientist-v2 system is not a benchmark evaluation."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The study involves human participants (workshop reviewers evaluating AI-generated papers). No pre-registration is mentioned. The study protocol was arranged with ICLR and workshop organizers, but no pre-registration link (OSF, AsPredicted, etc.) is provided."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "The paper explicitly states: 'we obtained IRB approval from the University of British Columbia (H24-02652)' (Sections 4.1 and 5). This was obtained before the workshop submission."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No demographics of the workshop reviewers are reported. The paper does not describe reviewer expertise, experience level, or any characterization beyond their role as ICLR workshop reviewers."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "The paper states that reviewers 'were informed in advance that some submissions might be AI-generated, but were not told which submissions were produced by The AI Scientist-v2. Reviewers could also opt out of reviewing potentially AI-generated manuscripts' (Section 4.1). This constitutes explicit inclusion/exclusion criteria."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "This is not an experimental study with treatment/control assignment. The AI-generated papers were included among the regular 43 submissions and reviewed through the standard workshop review process. Randomization in the traditional sense does not apply."
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "Blinding is clearly described: 'Reviewers were informed in advance that some submissions might be AI-generated, but were not told which submissions were produced by The AI Scientist-v2' (Section 4.1). 'Reviewers were informed of the experiment only after peer review' (Section 4.1). This constitutes single-blinding."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "The paper does not report how many reviewers opted out of reviewing potentially AI-generated manuscripts, nor whether any reviewers dropped out during the process. It only reports the final reviewer scores and comments."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "The paper reports: 'The total time required for The AI Scientist-v2 to generate a single paper depends on the complexity of the problems. Based on our experience, this process usually takes anywhere from several hours to a maximum of 15 hours, which is the runtime limit we have set' (Appendix A). However, no API costs in dollars or tokens consumed are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated in terms of API costs, total tokens consumed, or total compute hours across all experiments (multiple seeds per idea, three ideas, idea generation runs). Only the per-paper wall-clock time range (several hours to 15 hours) is mentioned. The total cost of the study is not quantified."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "The AI Scientist-v2 produced the first entirely AI-generated paper to successfully navigate peer review at a recognized workshop.",
    296       "evidence": "One of three AI-generated manuscripts received reviewer scores of 6, 7, and 6 (average 6.33) at the ICLR 2025 ICBINB workshop, surpassing the workshop's acceptance threshold and placing in roughly the top 45% of submissions (Section 4.1).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The AI Scientist-v2 eliminates reliance on human-authored code templates, generalizing effectively across diverse ML domains.",
    301       "evidence": "The paper describes the template-free approach (Section 3.2) and demonstrates the system generating papers across three different domains (compositional generalization, label noise, pest detection). However, the only quantitative evaluation is the workshop submission outcomes (Section 4).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The agentic tree search methodology enables deeper and more systematic exploration of scientific hypotheses compared to v1's linear approach.",
    306       "evidence": "The tree search algorithm is described in detail (Section 3.2.2) with four stages and specialized node types. However, no direct quantitative comparison with v1's linear approach is provided. The claim rests on the architectural description and the workshop acceptance outcome.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "The system does not yet consistently reach the rigorous standard required for top-tier conference publications.",
    311       "evidence": "Two of three submissions were rejected. The accepted paper had identified issues: 57% train-test overlap, incorrect figure captions, hallucinated claims, and missing references (Sections 4.2, 5, Appendix C). Internal review rated it borderline accept for workshop only.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "VLM integration improves the visual quality and clarity of manuscripts.",
    316       "evidence": "The VLM feedback mechanism is described in Section 3.4, but no ablation study compares manuscripts generated with and without VLM feedback. The accepted paper still had figure caption inaccuracies noted by both internal and external reviewers.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "methodology_tags": [
    321     "case-study",
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "The AI Scientist-v2 is an end-to-end agentic system for automated scientific discovery that uses tree-search-based experimentation, an experiment manager, and VLM feedback. Three fully AI-generated papers were submitted to the ICLR 2025 ICBINB workshop, and one was accepted with an average reviewer score of 6.33/10. The accepted paper contained notable flaws including 57% train-test overlap, incorrect figure captions, and hallucinated citations, placing the achievement in context of workshop-level (60-80% acceptance rate) rather than conference-level quality. The authors identify that the system does not yet consistently produce even workshop-quality papers.",
    325   "red_flags": [
    326     {
    327       "flag": "Company evaluating its own product",
    328       "detail": "Four of eight authors are affiliated with Sakana AI, the company that builds the AI Scientist system. No competing interests statement is provided. The paper evaluates and promotes Sakana AI's product, creating a clear conflict of interest."
    329     },
    330     {
    331       "flag": "Selection bias in evaluation",
    332       "detail": "Multiple manuscripts were generated per idea across different seeds, and the best was selected for submission. This selection process inflates the apparent success rate. The paper acknowledges this but does not quantify how many total manuscripts were generated or what the distribution of quality looked like."
    333     },
    334     {
    335       "flag": "Very small sample size for headline claim",
    336       "detail": "The claim of producing 'the first entirely AI-generated peer-review-accepted workshop paper' is based on submitting only 3 papers to a single workshop with a 60-80% acceptance rate. The 1/3 success rate is not statistically distinguishable from the baseline acceptance rate, yet the paper treats this as a significant milestone."
    337     },
    338     {
    339       "flag": "Known data integrity issues in accepted paper",
    340       "detail": "The authors' own code review found approximately 57% overlap between training and test sets in the accepted paper's experiments (Appendix C.1.2), yet this paper was still submitted and accepted. This undermines the validity of the accepted paper's experimental results and raises questions about the system's scientific rigor."
    341     },
    342     {
    343       "flag": "No ablation study for system components",
    344       "detail": "The paper introduces multiple novel components (tree search, experiment manager, VLM reviewer, template-free generation) simultaneously but provides no ablation to determine which components matter. The improvement over v1 cannot be attributed to any specific innovation."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    350       "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    351       "year": 2024,
    352       "arxiv_id": "2408.06292",
    353       "relevance": "Predecessor system (v1) that this paper builds upon; directly relevant to understanding the evolution of AI-driven scientific discovery systems."
    354     },
    355     {
    356       "title": "AIDE: AI-Driven Exploration in the Space of Code",
    357       "authors": ["Zhengyao Jiang", "Dominik Schmidt", "Dhruv Srikanth"],
    358       "year": 2025,
    359       "arxiv_id": "2502.13138",
    360       "relevance": "Tree search approach for LLM-driven code generation that inspired the AI Scientist-v2's experimentation methodology; relevant as an agentic coding system."
    361     },
    362     {
    363       "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering",
    364       "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"],
    365       "year": 2025,
    366       "relevance": "Benchmark for evaluating ML engineering agents, directly relevant to evaluating agentic AI systems for research tasks."
    367     },
    368     {
    369       "title": "Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100+ NLP Researchers",
    370       "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"],
    371       "year": 2025,
    372       "relevance": "Large-scale study evaluating LLM idea generation capabilities through human evaluation, directly relevant to AI-driven research quality assessment."
    373     },
    374     {
    375       "title": "RE-Bench: Evaluating Frontier AI R&D Capabilities of Language Model Agents Against Human Experts",
    376       "authors": ["Hjalmar Wijk", "Tao R. Lin", "Joel Becker"],
    377       "year": 2024,
    378       "arxiv_id": "2411.15114",
    379       "relevance": "Benchmark comparing AI agent R&D capabilities to human experts, relevant to understanding the frontier of agentic AI research capabilities."
    380     },
    381     {
    382       "title": "An Evaluation of Sakana's AI Scientist for Autonomous Research",
    383       "authors": ["Joeran Beel", "Min-Yen Kan", "Moritz Baumgart"],
    384       "year": 2025,
    385       "arxiv_id": "2502.14297",
    386       "relevance": "Independent evaluation of The AI Scientist-v1, directly relevant as external assessment of automated research systems."
    387     },
    388     {
    389       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    390       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    391       "year": 2024,
    392       "relevance": "Foundational agentic scaffolding framework for iterative self-improvement of LLM agents, relevant to agentic AI methodology."
    393     },
    394     {
    395       "title": "Agent Laboratory: Using LLM Agents as Research Assistants",
    396       "authors": ["Samuel Schmidgall"],
    397       "year": 2025,
    398       "arxiv_id": "2501.04227",
    399       "relevance": "Concurrent work on using LLM agents as research assistants, relevant to the survey scope of agentic AI for research."
    400     },
    401     {
    402       "title": "CycleResearcher: Improving Automated Research via Automated Review",
    403       "authors": ["Yixuan Weng", "Minjun Zhu", "Guangsheng Bao"],
    404       "year": 2025,
    405       "relevance": "System for automated research from idea generation to manuscript drafting, directly relevant as a concurrent automated scientific discovery system."
    406     },
    407     {
    408       "title": "Superintelligent Agents Pose Catastrophic Risks: Can Scientist AI Offer a Safer Path?",
    409       "authors": ["Yoshua Bengio"],
    410       "year": 2025,
    411       "arxiv_id": "2502.15657",
    412       "relevance": "Discusses safety implications of agentic vs. scientist AI systems, relevant to AI safety considerations in the agentic AI space."
    413     },
    414     {
    415       "title": "Towards an AI Co-Scientist",
    416       "authors": ["Juraj Gottweis", "Wei-Hung Weng", "Alexander Daryin"],
    417       "year": 2025,
    418       "arxiv_id": "2502.18864",
    419       "relevance": "Google's AI co-scientist effort, a major industry contribution to AI-assisted scientific discovery."
    420     },
    421     {
    422       "title": "SciCode: A Research Coding Benchmark Curated by Scientists",
    423       "authors": ["Minyang Tian", "Luyu Gao"],
    424       "year": 2024,
    425       "relevance": "Research-oriented coding benchmark spanning physics, chemistry, and biology, relevant to evaluating AI research capabilities."
    426     }
    427   ]
    428 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs