scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29315B)
      1 {
      2   "paper": {
      3     "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
      4     "authors": [
      5       "Chris Lu",
      6       "Cong Lu",
      7       "Robert Tjarko Lange",
      8       "Jakob Foerster",
      9       "Jeff Clune",
     10       "David Ha"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2408.06292"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper states 'Our code is open-sourced at https://github.com/SakanaAI/AI-Scientist' in the abstract and references the repository throughout."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The experiments use publicly available datasets (Shakespeare, enwik8, text8, 2D geometric datasets) and the ICLR 2022 OpenReview dataset. The code repository includes generated papers, run files, and logs: 'We provide a link to all papers, run files, and logs in our GitHub repository' (Section 6)."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is described in the paper. Hardware is mentioned (8x NVIDIA H100s) but no software dependency versions are provided beyond mentioning Python, PyMuPDF, and Aider without specific versions."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While the code is open-sourced and prompts are provided in Appendix A, the paper itself does not include step-by-step reproduction instructions. The experiment templates and how to run the full pipeline are not spelled out with specific commands or a README-style guide in the paper text."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Table 1 reports 95% bootstrap confidence intervals for the reviewer evaluation metrics (e.g., '0.65 ± 0.04' for balanced accuracy). However, the main generated-paper evaluation (Tables 3-5) only reports mean and max scores without uncertainty."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper makes comparative claims between models (e.g., 'Claude Sonnet 3.5 consistently produces the highest quality papers') and between the LLM reviewer and human baselines, but no statistical significance tests are performed. Comparisons are based on raw numbers and bootstrap CIs only."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper provides comparative metrics with baseline context, e.g., 'the automated reviewer achieves superhuman F1 Scores (0.57 vs. 0.49) and human-level AUC (0.65 for both)' and percentage improvements like '12.8% reduction in KL on the dinosaur dataset' (Section 5). The reviewer evaluation table provides absolute performance numbers alongside baselines."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The reviewer evaluation uses 500 ICLR 2022 papers, and the generation experiments produce ~50 ideas per template per model, but no justification is given for why these sample sizes are adequate. No power analysis is discussed."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "For the generated paper quality scores, only mean and max are reported in Tables 3-5. No standard deviation or variance across papers is given. The violin plots in Figure 4 show distributions visually but no numeric spread measures are in the tables. The reviewer evaluation (Table 1) includes bootstrap CIs but the main experimental results do not."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The reviewer evaluation includes baselines: random decision, always reject, and the NeurIPS 2021 human consistency experiment baseline (Table 1). The generated paper evaluation compares four LLMs (Sonnet 3.5, GPT-4o, DeepSeek Coder, Llama-3.1 405b)."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The baselines include contemporary models as of 2024: Claude Sonnet 3.5, GPT-4o, DeepSeek Coder, and Llama-3.1 405b. The human baseline from NeurIPS 2021 is the most recent available consistency experiment data."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Section 4 and Figure 2 present ablations of the reviewer system: varying Reflexion rounds, few-shot examples, and review ensembling. The top-right of Figure 2 shows the impact of each component. However, ablations of the generation pipeline components (ideation, experiment iteration, writing) are not performed."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The reviewer evaluation uses balanced accuracy, accuracy, F1 score, AUC, FPR, and FNR (Table 1). The generated paper evaluation uses mean score, max score, number of completed papers, total cost, and novel ideas count."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 5 provides an in-depth human case study of the 'Adaptive Dual-Scale Denoising' paper, with expert domain analysis of strengths and weaknesses. The authors provide qualitative expert assessment of multiple generated papers across Sections 5 and 6. However, there is no systematic human evaluation with multiple raters or structured rubrics."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The reviewer evaluation uses 500 ICLR 2022 papers with ground truth accept/reject decisions from OpenReview. This is a separate evaluation dataset not used for developing the reviewer system."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down by template (Diffusion, Language Modeling, Grokking) in Tables 3-5 and by model (Sonnet 3.5, GPT-4o, DeepSeek Coder, Llama-3.1) within each. The violin plots in Figure 4 show per-domain and per-model distributions."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 5 discusses specific pathologies (hallucinated hardware details, positive spin on negative results, artifacts from experimental logs). Section 8 has an extensive list of failure modes including self-relaunching code, exceeding storage, cheating by leaking future tokens, and hallucinated results."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports several negative findings: GPT-4o struggles with LaTeX compilation (Section 6), DeepSeek often fails tool calls, Llama-3.1 performed worst overall, review ensembling did not improve performance (Section 4), and many generated papers were rejected by the reviewer. Section 8 extensively discusses limitations."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims: (1) first comprehensive framework for automated scientific discovery - supported by the system description; (2) cost less than $15 per paper - supported by Tables 3-5 showing roughly $250 per run of ~50 ideas; (3) automated reviewer achieves near-human performance (65% vs 66%) - supported by Table 1; (4) papers that exceed acceptance threshold - supported by max scores of 5-6 in Tables 3-5, though this is based on the authors' own automated reviewer, not actual conference review."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims through ablation studies of the reviewer (Reflexion and one-shot prompting each contribute +2% accuracy, Figure 2). The claims about the generation framework are primarily descriptive rather than causal. The ablation design for the reviewer is adequate (single-variable controlled manipulation)."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The title claims 'Fully Automated Open-Ended Scientific Discovery' but results are limited to three small-scale ML templates (2D diffusion, NanoGPT, grokking). The abstract claims 'this approach can more generally be applied to almost any other discipline, e.g. biology or physics' without evidence. While Section 1 acknowledges focusing on ML, the framing significantly overstates generalization."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 5 discusses alternative explanations for the generated paper's results (MoE interpretation vs. the AI-claimed global/local explanation). Section 8 acknowledges that improved results may come from simply adding parameters rather than architectural innovation. The authors consider that the novelty check may be biased because it is self-assessed by each model."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper uses 'Claude Sonnet 3.5', 'GPT-4o', 'DeepSeek Coder', and 'Llama-3.1 405b' without specifying exact API versions or snapshot dates. No version like 'gpt-4o-2024-05-13' or similar is provided. These are marketing names that could correspond to different model behaviors over time."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Appendix A provides the full prompts for idea generation (A.1), experiment design (A.2), paper writing (A.3), and paper reviewing (A.4). These include both system prompts and user prompts with the actual template text. The prompts contain placeholders but the fill values are described or available in the code."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Table 6 in Appendix B provides a comprehensive hyperparameter table including number of idea reflections (3), novelty search rounds (10), max experiments (5), experiment timeout (7200s), citation rounds (20), reviewer reflections (5), ensembled reviews (5), and LLM temperature (0.1)."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Section 3 provides detailed description of the three-phase pipeline (Idea Generation, Experimental Iteration, Paper Write-up) with sub-steps. The use of Aider for coding, Semantic Scholar API for literature search, chain-of-thought, self-reflection, and the review/meta-review process are all described. Figure 1 provides a workflow diagram."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "For the reviewer evaluation, the paper describes using 500 ICLR 2022 papers from OpenReview, processing via PyMuPDF, and notes the class imbalance. The three experiment templates are described with their code sources and datasets. Section 6 describes each template's origin and setup."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 8 is titled 'Limitations & Ethical Considerations' and contains multiple subsections covering limitations of the automated reviewer, common failure modes (8 detailed bullet points), safe code execution issues, and broader ethical considerations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 8 discusses specific threats: the ICLR 2022 dataset may be in model pre-training data, accepted papers use camera-ready copies while rejected papers use original submissions (asymmetry), the reviewer cannot view figures, generated papers may have subtle implementation errors, ideas tend to be similar across runs, and results may be deceptive (e.g., token leaking in NanoGPT template)."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 8 states: 'we do not recommend taking the scientific content of this version of The AI Scientist at face value. Instead, we advise treating generated papers as hints of promising ideas.' Section 1 notes 'we focus on Machine Learning (ML) applications.' The paper acknowledges the system cannot download new datasets, cannot view figures, and operates only on small-scale experiments."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper states 'We provide a link to all papers, run files, and logs in our GitHub repository' (Section 6) and the code is open-sourced. The ICLR 2022 evaluation data is from a publicly available dataset (Berto, 2024). Generated papers with full experimental artifacts are available for verification."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The reviewer evaluation uses '500 ICLR 2022 papers extracted from the publicly available OpenReview dataset' (Section 4). The generation experiments are described: 50 ideas per template, run on 8x H100s over ~12 hours per run, using specified templates from public repositories."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants were recruited. The study uses automated systems and public datasets."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The pipeline from idea generation to final paper is well-documented in Section 3 with clear stages. Tables 3-5 show the funnel: total ideas → novel ideas → experiments passed → completed papers, making the filtering transparent. For the reviewer, the pipeline from PDF parsing to review generation is described."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The Acknowledgments section lists funding: 'This work was supported by the Vector Institute, Canada CIFAR AI Chairs program, grants from Schmidt Futures, Open Philanthropy, NSERC, and a generous donation from Rafael Cosman.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: Sakana AI, FLAIR/University of Oxford, University of British Columbia, Vector Institute, Canada CIFAR AI Chair. Three of the six authors, including the first and last authors, are affiliated with Sakana AI, which is a commercial AI lab."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The funders (Vector Institute, CIFAR, Schmidt Futures, Open Philanthropy, NSERC) are research-oriented organizations without direct financial interest in the specific results. While Sakana AI authors have commercial interests, the funding sources themselves appear independent of the outcome."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is provided. Three authors are affiliated with Sakana AI, a commercial AI company that could benefit from demonstrating automated research capabilities, but this conflict is not explicitly acknowledged."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper uses Claude Sonnet 3.5, GPT-4o, DeepSeek Coder, and Llama-3.1 405b to generate papers and review ICLR 2022 submissions, but no training data cutoff dates are stated for any of these models."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Section 8 explicitly discusses this: 'The dataset used, from ICLR 2022, is old enough to potentially appear in the base model pre-training data... However, preliminary analysis showed that LLMs were far from being able to reproduce old reviews exactly from initial segments, which suggests they have not memorized this data.'"
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The paper acknowledges contamination risk for the ICLR 2022 evaluation dataset in Section 8 and provides preliminary evidence against memorization. It also suggests 'Future iterations could use more recent submissions (e.g., from TMLR) for evaluation.'"
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants were involved in this study. The evaluation uses automated systems and public datasets."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants were involved in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants were involved in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants were involved in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants were involved in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants were involved in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants were involved in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "The paper reports costs per paper (~$15), total costs per template/model run (~$10-$300 in Tables 3-5), and reviewer costs ($0.25-$0.50 per review). Section 9 discusses that 'the bulk of the cost... is associated with the LLM API costs for coding and paper writing.'"
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "The paper states 'Each run of around fifty ideas in total takes approximately 12 hours on 8× NVIDIA H100s' (Section 6) and 'our experiments generating hundreds of papers were largely run only using a single 8×NVIDIA H100 node over the course of a week' (Section 9). Total API costs are reported in Tables 3-5."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "The AI Scientist is the first comprehensive framework for fully automatic scientific discovery, enabling frontier LLMs to perform research independently and communicate findings.",
    293       "evidence": "Section 3 describes the three-phase pipeline (ideation, experimentation, write-up). Sections 5-6 demonstrate generation of papers across three ML subfields. Code is open-sourced.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "The automated reviewer achieves near-human performance, with 65% balanced accuracy vs. 66% for the NeurIPS 2021 human consistency experiment.",
    298       "evidence": "Table 1 shows the calibrated GPT-4o (1-shot) @6 achieves 0.65 balanced accuracy with 95% bootstrap CIs of ±0.04, compared to 0.66 for humans. Also achieves superhuman F1 (0.57 vs. 0.49).",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "The AI Scientist can produce papers at a cost of less than $15 per paper.",
    303       "evidence": "Tables 3-5 show total costs divided by completed papers: Sonnet 3.5 on diffusion = ~$250/38 papers ≈ $6.58/paper; GPT-4o = ~$300/16 papers ≈ $18.75/paper. The $15 figure is approximate and varies by model.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "The AI Scientist can produce papers that exceed the acceptance threshold at a top machine learning conference as judged by the automated reviewer.",
    308       "evidence": "Table 3 shows Sonnet 3.5 achieving a max score of 6.0 on diffusion, which corresponds to 'Weak Accept' in NeurIPS guidelines (Section 6). However, this is evaluated by the authors' own automated reviewer, not actual conference review.",
    309       "supported": "weak"
    310     },
    311     {
    312       "claim": "Claude Sonnet 3.5 consistently produces the highest quality papers among the tested models.",
    313       "evidence": "Tables 3-5 and Figure 4 show Sonnet 3.5 achieves the highest mean scores across all three templates (3.82, 4.05, 3.44) and the highest max score (6.0). GPT-4o comes second.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "The correlation between the LLM reviewer score and the average human reviewer score (0.18) exceeds the correlation between two individual human reviewers (0.14).",
    318       "evidence": "Section 4 and Figure 2 (bottom) report these correlation values on 500 ICLR 2022 papers. The difference (0.18 vs 0.14) is small and no statistical test for the significance of this difference is reported.",
    319       "supported": "weak"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "benchmark-eval",
    324     "case-study"
    325   ],
    326   "key_findings": "The AI Scientist introduces a fully automated pipeline for end-to-end scientific paper generation in ML, demonstrating feasibility across diffusion modeling, language modeling, and grokking analysis at ~$15/paper. An automated GPT-4o-based reviewer achieves near-human balanced accuracy (65% vs 66%) on ICLR 2022 papers, with higher F1 than human reviewers. Among four LLMs tested, Claude Sonnet 3.5 produced the best papers, while all models exhibited significant failure modes including hallucination, implementation errors, and inability to view generated figures. The authors candidly acknowledge the system produces medium-quality work and recommend treating outputs as idea suggestions rather than reliable science.",
    327   "red_flags": [
    328     {
    329       "flag": "Evaluating own system with own reviewer",
    330       "detail": "The claim that generated papers 'exceed the acceptance threshold at a top machine learning conference' is based entirely on the authors' own automated reviewer, not actual conference submission. The reviewer itself achieves only 65% balanced accuracy, i.e., averaged over the accept and reject classes it misclassifies roughly 35% of papers. Using a self-built, imperfect evaluation tool to validate a self-built generation tool is circular."
    331     },
    332     {
    333       "flag": "Overstated generalization in title and abstract",
    334       "detail": "The title claims 'Fully Automated Open-Ended Scientific Discovery' but experiments are limited to three small-scale ML templates using toy datasets (2D distributions, character-level Shakespeare, modular arithmetic). The abstract claims applicability to 'almost any other discipline, e.g. biology or physics' without evidence."
    335     },
    336     {
    337       "flag": "Sakana AI conflict of interest not acknowledged",
    338       "detail": "Three of six authors are affiliated with Sakana AI, a commercial AI company. Demonstrating automated research capabilities directly benefits Sakana AI's business interests. No competing interests statement is provided despite this clear commercial interest."
    339     },
    340     {
    341       "flag": "No exact model versions specified",
    342       "detail": "Marketing names (Claude Sonnet 3.5, GPT-4o) are used without API version identifiers or snapshot dates. Model behavior changes across versions, making exact reproduction uncertain."
    343     },
    344     {
    345       "flag": "Reviewer evaluation uses potentially contaminated data",
    346       "detail": "The ICLR 2022 papers used to evaluate the automated reviewer likely appeared in the training data of GPT-4o and other models tested. The paper acknowledges this but provides only weak evidence against memorization (inability to reproduce full reviews does not rule out familiarity with content)."
    347     },
    348     {
    349       "flag": "Asymmetric test data for reviewer evaluation",
    350       "detail": "The paper acknowledges that 'rejected papers in our dataset used the original submission file, whereas for the accepted papers only the final camera-ready copies were available.' This systematic difference could inflate reviewer accuracy since camera-ready papers are more polished."
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    356       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    357       "year": 2024,
    358       "arxiv_id": "2310.06770",
    359       "relevance": "Major benchmark for evaluating LLM coding capabilities on real-world software engineering tasks."
    360     },
    361     {
    362       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    363       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    364       "year": 2024,
    365       "relevance": "Core technique used in The AI Scientist for iterative self-improvement of generated outputs."
    366     },
    367     {
    368       "title": "Evaluating Large Language Models Trained on Code",
    369       "authors": ["Mark Chen", "Jerry Tworek"],
    370       "year": 2021,
    371       "arxiv_id": "2107.03374",
    372       "relevance": "Foundational work on LLM code generation evaluation (Codex/HumanEval), relevant to AI programming capability assessment."
    373     },
    374     {
    375       "title": "ResearchAgent: Iterative Research Idea Generation over Scientific Literature with Large Language Models",
    376       "authors": ["Jinheon Baek", "Sujay Kumar Jauhar", "Silviu Cucerzan", "Sung Ju Hwang"],
    377       "year": 2024,
    378       "arxiv_id": "2404.07738",
    379       "relevance": "Closely related work on using LLMs for automated research idea generation from scientific literature."
    380     },
    381     {
    382       "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation",
    383       "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"],
    384       "year": 2024,
    385       "relevance": "Benchmark for measuring LLM ability to write code for ML tasks, directly relevant to evaluating AI research agents."
    386     },
    387     {
    388       "title": "Can Large Language Models Provide Useful Feedback on Research Papers? A Large-Scale Empirical Analysis",
    389       "authors": ["Weixin Liang", "Yuhui Zhang"],
    390       "year": 2024,
    391       "relevance": "Empirical study of LLM reviewing capability, directly comparable to the automated reviewer component."
    392     },
    393     {
    394       "title": "Evolution through Large Models",
    395       "authors": ["Joel Lehman", "Jonathan Gordon", "Shawn Jain", "Kamal Ndousse", "Cathy Yeh", "Kenneth O. Stanley"],
    396       "year": 2022,
    397       "arxiv_id": "2206.08896",
    398       "relevance": "Key prior work on using LLMs as mutation operators in evolutionary search, foundational technique for idea generation."
    399     },
    400     {
    401       "title": "Ideas are Dimes a Dozen: Large Language Models for Idea Generation in Innovation",
    402       "authors": ["Karan Girotra", "Lennart Meincke", "Christian Terwiesch", "Karl T Ulrich"],
    403       "year": 2023,
    404       "relevance": "Study finding LLMs produce higher quality innovation ideas than humans, relevant to AI creativity and research automation claims."
    405     },
    406     {
    407       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    408       "authors": ["Lianmin Zheng", "Wei-Lin Chiang"],
    409       "year": 2024,
    410       "relevance": "Foundational work on LLM-as-judge paradigm, relevant to the automated reviewer evaluation methodology."
    411     },
    412     {
    413       "title": "MARG: Multi-Agent Review Generation for Scientific Papers",
    414       "authors": ["Mike D'Arcy", "Tom Hope", "Larry Birnbaum", "Doug Downey"],
    415       "year": 2024,
    416       "arxiv_id": "2401.04259",
    417       "relevance": "Multi-agent approach to automated paper review generation, directly comparable to the review component."
    418     },
    419     {
    420       "title": "Weak-to-Strong Generalization: Eliciting Strong Capabilities with Weak Supervision",
    421       "authors": ["Collin Burns", "Pavel Izmailov"],
    422       "year": 2023,
    423       "arxiv_id": "2312.09390",
    424       "relevance": "Referenced in context of superalignment challenges when AI generates research too complex for human evaluation."
    425     },
    426     {
    427       "title": "DiscoveryBench: Towards Data-Driven Discovery with Large Language Models",
    428       "authors": ["Bodhisattwa Prasad Majumder"],
    429       "year": 2024,
    430       "arxiv_id": "2407.01725",
    431       "relevance": "Benchmark for evaluating LLM scientific discovery capabilities from data analysis."
    432     }
    433   ]
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs