scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30038B)
      1 {
      2   "paper": {
      3     "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering",
      4     "authors": [
      5       "Jun Shern Chan",
      6       "Neil Chowdhury",
      7       "Oliver Jaffe",
      8       "James Aung",
      9       "Dane Sherburn",
     10       "Evan Mays",
     11       "Giulio Starace",
     12       "Kevin Liu",
     13       "Leon Maksin",
     14       "Tejal Patwardhan",
     15       "Lilian Weng",
     16       "Aleksander Mądry"
     17     ],
     18     "year": 2024,
     19     "venue": "ICLR 2025",
     20     "arxiv_id": "2410.07095",
     21     "doi": "10.48550/arXiv.2410.07095"
     22   },
     23   "scan_version": 2,
     24   "active_modules": ["experimental_rigor", "data_leakage"],
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper states 'We open-source our benchmark code (github.com/openai/mle-bench/)' in the abstract and conclusion, providing a working URL."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The benchmark uses publicly available Kaggle competitions, and the Reproducibility Statement says 'code is provided to allow users to reproduce datasets in a way that complies with relevant licenses.' The leaderboard snapshots and curation details are also provided."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Appendix A.5 specifies the exact setup: Ubuntu 20.04 Docker container, Microsoft Azure Standard_NV36ads_A10_v5 VM (36 vCPUs, 440GB RAM, Nvidia A10 GPU), sysbox runtime, Python virtual environment with necessary packages."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The Reproducibility Statement says 'We provide all necessary details for reproducing our results, including dataset curation, evaluation metrics, and experimental setup. Our codebase is publicly available, including code for reproducing the full benchmark and experiments.' They also provide examples for running agents."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Table 2 and other results tables report '± one standard error of the mean' for all metrics across seeds."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims o1-preview 'significantly outperforms all other models' (Section 3.1) but uses no formal significance tests — it compares means with standard errors without p-values, t-tests, or other statistical tests."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Absolute performance numbers are reported with context: o1-preview 16.9% vs GPT-4o 8.7% medals (Table 2), pass@1 16.9% doubling to pass@8 34.1% (Figure 3). Baseline and treatment numbers are both provided."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No justification for why 75 competitions were selected (vs. more or fewer), no power analysis, and no justification for why 3 seeds per experiment (or 16/36 for some) is sufficient."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Standard errors across seeds are reported in all main tables (Table 2, 3, 4, 9, 10). Table 2 notes 'Each experiment is repeated with 3 seeds, except o1-preview (AIDE) and GPT-4o (AIDE) which use 16 and 36 seeds respectively.'"
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple baselines are included: three scaffolds (AIDE, MLAB, OpenHands) and four models (o1-preview, GPT-4o, Claude 3.5 Sonnet, Llama 3.1 405B). Human baselines are provided via Kaggle leaderboards."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All models are frontier LLMs from 2024: o1-preview, GPT-4o-2024-08-06, Claude 3.5 Sonnet, Llama 3.1 405B. All scaffolds are contemporary open-source frameworks."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple ablations: scaffolding choice (Table 2), hardware configuration (Table 3: CPU-only, standard, extra GPU), time available (Figure 4: 24h vs 100h), number of attempts (Figure 3: pass@k), and obfuscated descriptions (Table 4)."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Table 2 reports Made Submission %, Valid Submission %, Above Median %, Bronze %, Silver %, Gold %, and Any Medal %. Per-competition raw scores are also discussed."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Evaluation is entirely automated through competition grading code. While the authors manually inspect agent logs for rule-breaking (Appendix A.3) and analyze failure modes (Section 3.1), there is no systematic human evaluation of the agents' ML engineering solutions or processes."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 2.1 states 'We include an additional 7 competitions as a development split, for developing agents without over-fitting to the test set.' The 75 main competitions form the test set."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Table 9 breaks down results by complexity level (Low/Medium/High). Table 10 breaks down by 19 task categories. Figure 9 shows performance by competition date."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 3.1 Discussion details failure modes: agents not using the validation server, ending runs early, filling context windows with large files, overloading disk/RAM, and not considering time limitations. Figure 2 shows real agent trajectories."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Table 3 shows extra GPU has no meaningful benefit. MLAB achieves only 0.8% medal rate. Agents often fail to create valid submissions (Section 3.1). Figure 4 shows medals sometimes decreasing over time due to AIDE's imperfect selection method."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims match results: 'best-performing setup — OpenAI's o1-preview with AIDE scaffolding — achieves at least the level of a Kaggle bronze medal in 16.9% of competitions' is directly supported by Table 2. Pass@8 doubling claim matches Figure 3."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Causal claims are made through controlled ablations: varying only the scaffold (Table 2, top vs bottom), varying only the model (Table 2, within AIDE), varying compute (Table 3), varying time (Figure 4). Each varies a single factor while holding others constant."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 6 explicitly bounds scope: 'the tasks included in MLE-bench don't cover the full spectrum of capabilities required for AI R&D,' noting Kaggle tasks have 'clear problem statements, datasets that are clean and well-documented' unlike real-world R&D."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 4 investigates contamination as an alternative explanation for performance, testing via familiarity analysis (Section 4.1) and obfuscated descriptions (Section 4.2). Section 6 discusses algorithmic progress giving agents advantages over original competitors."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 6 explicitly distinguishes between what MLE-bench measures (performance on well-specified Kaggle competitions) and the broader claim (ML engineering capabilities for AI R&D), noting 'real-world AI R&D often may not even have a clear problem statement, and figuring out the dataset and metrics is part of the problem.'"
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Footnotes and tables specify: gpt-4o-2024-08-06 (footnote 6), claude-3-5-sonnet-20240620 (footnote 8), meta-llama/llama-3.1-405b-instruct (footnote 9), gpt-4o-mini-2024-07-18 (footnote 14). o1-preview lacks a snapshot date but was a distinct single release at the time."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Figure 8 provides the full verbatim prompt used for all agents, including overall instructions, competition-specific details, submission validation, and rules. Appendix A.6 describes scaffold-specific prompt modifications."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "Table 7 reports scaffold-level hyperparameters (max_steps, time_limit, etc.) for all three frameworks. However, LLM API inference parameters (temperature, top-p, sampling settings) are not stated for any model, and the schema requires these."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 3 and Appendix A.6 describe all three scaffolds in detail: AIDE's tree search over solutions, MLAB's tool-based approach, OpenHands' CodeActAgent. All modifications are documented (A.6.1-A.6.3). Figure 2 shows real trajectories."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 2.1 documents curation from 5673 Kaggle competitions through screening criteria to 75 selected. Appendix A.1 lists all filtering criteria. Table 8 documents how each competition's train-test split was created."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 'Limitations' provides substantial discussion across four areas: contamination/plagiarism risks, coverage of AI R&D capabilities, differences to real competitions, and accessibility/cost."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 discusses threats specific to this study: models may have trained on Kaggle materials (contamination), new train-test splits may not perfectly match original distributions, algorithmic progress gives agents advantages over historical competitors, and high compute cost limits reproducibility (1800 GPU hours per run)."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 6 states specific boundaries: 'the tasks included in MLE-bench don't cover the full spectrum of capabilities required for AI R&D,' listing that real R&D involves unclear problem statements, messy data, and choosing metrics — none of which MLE-bench tests."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The underlying Kaggle competition data is publicly available, leaderboard snapshots are included, and benchmark code for reproducing datasets is open-sourced. Raw agent logs were used for analysis."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 2.1 describes the full curation process: starting from 5673 Meta Kaggle competitions, excluding Community Competitions (leaving 586), manual screening for relevance, filtering by reproducibility criteria, arriving at 75 competitions."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants in the study. The data source is publicly available Kaggle competitions, a standard well-known platform."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The pipeline from 5673 competitions through screening criteria (Appendix A.1) to 75 is documented with the counts at each step. Table 8 documents each competition's split procedure. Grading code reimplementation is described in Section 2.1."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding acknowledgments section. All authors are from OpenAI, implying OpenAI funded the work, but this is not explicitly stated as a funding disclosure."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All authors are listed under 'OpenAI' affiliation on the paper's first page."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "OpenAI funds this research and has direct commercial interest in their models (o1-preview, GPT-4o) performing well on ML engineering tasks. The funder is not independent of the outcome."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests statement in the paper. OpenAI employees are evaluating OpenAI models without explicit disclosure of the financial conflict."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The paper does not state the training data cutoff dates for any of the models evaluated (o1-preview, GPT-4o, Claude 3.5 Sonnet, Llama 3.1 405B), despite investigating contamination effects through other means."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Section 4 extensively discusses potential overlap: Section 4.1 measures GPT-4o's familiarity with competition pages and winning solutions using token-level probability analysis. Section 6 notes 'GPT-4's base model could reproduce several rows from the dataset of the Titanic competition.'"
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "Section 4 comprehensively addresses contamination: familiarity analysis (Section 4.1), obfuscated descriptions experiment (Section 4.2), plagiarism detection with Dolos (Section 2.3.1), and log analysis tools (Appendix A.3). They find 'no systematic effect of contamination for GPT-4o.'"
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study. The benchmark evaluates AI agents, not humans."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in the study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 6 reports 'o1-preview with AIDE used 127.5M input tokens and 15.0M output tokens on average for one seed of 75 competitions.' Total GPU hours (1800 per run) are also stated."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 6 states 'A single run of our main experiment setup of 24 hours per competition attempt requires 24 hours × 75 competitions = 1800 GPU hours of compute.' Hardware is specified in Appendix A.5 (Azure VMs with A10 GPUs)."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Results are reported across multiple seeds with standard error: 3 seeds for most experiments, 16 for o1-preview (AIDE), and 36 for GPT-4o (AIDE). Standard errors in Table 2 show variation across seeds."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Table 2 caption states 'Each experiment is repeated with 3 seeds, except o1-preview (AIDE) and GPT-4o (AIDE) which use 16 and 36 seeds respectively.'"
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget is reported. Scaffold parameters in Table 7 (e.g., AIDE max 2000 steps, MLAB max 2000 steps) appear fixed without justification or search."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": true,
    322         "justification": "Table 2 reports all model-scaffold combinations tried, not just the best. The selection of AIDE as the best scaffold for subsequent experiments is transparent and based on the full comparison results."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No formal statistical significance tests are performed in the paper, so multiple comparison correction does not apply."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "OpenAI employees created the benchmark and evaluated OpenAI models (o1-preview, GPT-4o) which achieved the highest scores. The paper does not acknowledge or discuss the bias of evaluating their own models."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Table 3 compares performance across three hardware configurations (CPU-only, standard GPU, extra GPU). Figure 4 shows performance as a function of time (up to 100 hours). These directly relate compute to performance."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Section 6 discusses construct validity: MLE-bench tests well-specified Kaggle tasks, which differ from real AI R&D. Section 5 compares to alternative benchmarks (MLAgentBench, DSBench, ML-Bench) and discusses how their design differs."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Table 2 separates scaffold and model effects: GPT-4o is tested across three scaffolds (AIDE, MLAB, OpenHands), and four models are compared within AIDE. This design isolates the scaffold confound."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "Section 4 investigates whether familiarity with competitions correlates with performance (Figure 5). Section 6 notes 'algorithmic progress may result in older competitions being easier.' Figure 9 plots performance vs competition date."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": true,
    359         "justification": "The evaluation design prevents information leakage: the validation server checks submission validity but does not provide scores (Section 2.3). Rules forbid agents from viewing solutions online (Section 2.3.1). New train-test splits prevent direct answer leakage."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Section 2.1 describes creating new train-test splits to ensure independence, checking that 'the distributions of the original and reconstructed test sets are similar.' Section 4 addresses whether model training data is independent of test data through contamination analysis."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": true,
    369         "justification": "Multiple detection methods: token-level familiarity analysis (Section 4.1), obfuscated description experiment (Section 4.2), Dolos plagiarism detection against top 50 Kaggle notebooks (Section 2.3.1), and GPT-4o-based log analysis for rule-breaking (Appendix A.3)."
    370       }
    371     }
    372   },
    373   "methodology_tags": ["benchmark-eval"],
    374   "key_findings": "MLE-bench evaluates AI agents on 75 Kaggle ML engineering competitions, finding that o1-preview with AIDE scaffolding achieves medals in 16.9% of competitions (pass@1), which doubles to 34.1% with 8 attempts. Scaffolding choice matters substantially (AIDE 8.7% vs MLAB 0.8% for GPT-4o), while hardware scaling shows minimal benefit — medal rates for the CPU-only (9.1%), standard (8.7%), and extra-GPU (10.2%) setups all fall within standard error of one another. Contamination analysis via familiarity testing and obfuscated descriptions finds no evidence of systematically inflated scores from model memorization.",
    375   "claims": [
    376     {
    377       "claim": "o1-preview with AIDE achieves at least the level of a Kaggle bronze medal in 16.9% of competitions",
    378       "evidence": "Table 2 shows 16.9% ± 1.1% Any Medal rate for o1-preview (AIDE) across 16 seeds, with breakdown: 3.4% Bronze, 4.1% Silver, 9.4% Gold.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Performance roughly doubles from pass@1 to pass@8 for both o1-preview and GPT-4o",
    383       "evidence": "Figure 3 shows o1-preview goes from 16.9% (pass@1) to ~34.1% (pass@8), and GPT-4o from 8.7% to ~17%; GPT-4o's pass@6 score is comparable to o1-preview's pass@1 score.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "AIDE scaffolding significantly outperforms MLAB and OpenHands for GPT-4o",
    388       "evidence": "Table 2 shows GPT-4o (AIDE) 8.7% vs MLAB 0.8% vs OpenHands 4.4% medal rates. However, no formal significance test is used to support the word 'significantly.'",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Hardware scaling (CPU-only vs GPU vs 2 GPUs) has minimal effect on agent performance",
    393       "evidence": "Table 3 shows CPU-only 9.1%, Standard 8.7%, Extra GPU 10.2% — all within standard error. However, only 3 seeds for non-standard configs.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "GPT-4o's familiarity with competitions does not correlate with performance, suggesting contamination does not systematically inflate scores",
    398       "evidence": "Figure 5 shows Pearson's correlation -0.24 (p=0.04) between familiarity and performance — actually slightly negative. Table 4 shows obfuscated descriptions yield equivalent performance (8.4% vs 8.5%).",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Agents improve with more time, achieving more medals at 100 hours than 24 hours",
    403       "evidence": "Figure 4 shows GPT-4o (AIDE) reaches ~8.7% at 24h and ~11.8% at 100h. However, results are from a single run configuration.",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "Agents can score well on competitions solvable with well-known approaches but struggle to debug issues and recover from missteps",
    408       "evidence": "Section 3.1 Discussion describes failure modes: not using validation server, ending runs early, overloading resources, not considering time constraints. Table 10 shows 0% medals on harder categories like Image Segmentation, Object Detection.",
    409       "supported": "moderate"
    410     }
    411   ],
    412   "red_flags": [
    413     {
    414       "flag": "Company evaluating own product",
    415       "detail": "All authors are from OpenAI and evaluate OpenAI models (o1-preview, GPT-4o) which achieve the top results. No explicit acknowledgment of this conflict of interest or self-comparison bias. While third-party models are included, the benchmark was designed by the team whose model leads."
    416     },
    417     {
    418       "flag": "No formal significance tests despite comparative claims",
    419       "detail": "The paper claims o1-preview 'significantly outperforms all other models' (Section 3.1) based solely on comparing means with standard errors, without performing any formal statistical test. The word 'significantly' is used in a statistical context without statistical backing."
    420     },
    421     {
    422       "flag": "Missing LLM inference hyperparameters",
    423       "detail": "Temperature, top-p, and other LLM API sampling parameters are not reported for any model, despite these settings meaningfully affecting output quality and consistency."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Evaluating Large Language Models Trained on Code",
    429       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    430       "year": 2021,
    431       "arxiv_id": "2107.03374",
    432       "relevance": "Foundational LLM code evaluation benchmark (HumanEval/Codex) and pass@k metric used in MLE-bench."
    433     },
    434     {
    435       "title": "Measuring Coding Challenge Competence With APPS",
    436       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    437       "year": 2021,
    438       "arxiv_id": "2105.09938",
    439       "relevance": "Early coding challenge benchmark showing LLM capabilities on competitive programming problems."
    440     },
    441     {
    442       "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation",
    443       "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"],
    444       "year": 2024,
    445       "relevance": "Closest prior benchmark for ML agent evaluation using 13 Kaggle and bespoke tasks; MLE-bench extends this with 75 competitions."
    446     },
    447     {
    448       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    449       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    450       "year": 2024,
    451       "arxiv_id": "2310.06770",
    452       "relevance": "Leading software engineering benchmark for AI agents; MLE-bench positions as complementary for ML engineering specifically."
    453     },
    454     {
    455       "title": "AI Agents That Matter",
    456       "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S. Siegel"],
    457       "year": 2024,
    458       "arxiv_id": "2407.01502",
    459       "relevance": "Analysis of agent evaluation methodology including cost-performance tradeoffs and evaluation pitfalls."
    460     },
    461     {
    462       "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents",
    463       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    464       "year": 2024,
    465       "arxiv_id": "2407.16741",
    466       "relevance": "Open-source agent platform (OpenHands) used as one of three scaffolds evaluated in MLE-bench."
    467     },
    468     {
    469       "title": "AgentBench: Evaluating LLMs as Agents",
    470       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    471       "year": 2023,
    472       "arxiv_id": "2308.03688",
    473       "relevance": "Multi-turn agent evaluation benchmark with diverse environments; foundational work for AI agent evaluation."
    474     },
    475     {
    476       "title": "GAIA: a benchmark for General AI Assistants",
    477       "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Craig Swift"],
    478       "year": 2023,
    479       "arxiv_id": "2311.12983",
    480       "relevance": "Benchmark for general-purpose AI agents on real-world tasks requiring tool use and reasoning."
    481     },
    482     {
    483       "title": "ML-Bench: Evaluating Large Language Models and Agents for Machine Learning Tasks on Repository-Level Code",
    484       "authors": ["Xiangru Tang", "Yuliang Liu", "Zefan Cai"],
    485       "year": 2024,
    486       "arxiv_id": "2311.09835",
    487       "relevance": "Evaluates agents on understanding and using existing ML codebases; complementary to MLE-bench's open-ended task formulation."
    488     },
    489     {
    490       "title": "DSBench: How Far Are Data Science Agents to Becoming Data Science Experts?",
    491       "authors": ["Liqiang Jing", "Zhehui Huang", "Xiaoyang Wang"],
    492       "year": 2024,
    493       "arxiv_id": "2409.07703",
    494       "relevance": "Concurrent Kaggle-based benchmark focused on data science tasks; MLE-bench argues their competition selection is more diverse and challenging."
    495     },
    496     {
    497       "title": "ConStat: Performance-Based Contamination Detection in Large Language Models",
    498       "authors": ["Jasper Dekoninck", "Mark Niklas Müller", "Martin Vechev"],
    499       "year": 2024,
    500       "arxiv_id": "2405.16281",
    501       "relevance": "Defines contamination framework used to motivate MLE-bench's contamination analysis methodology."
    502     },
    503     {
    504       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    505       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    506       "year": 2024,
    507       "arxiv_id": "2403.07974",
    508       "relevance": "Contamination-free code evaluation benchmark addressing temporal leakage concerns relevant to MLE-bench."
    509     },
    510     {
    511       "title": "Competition-level code generation with AlphaCode",
    512       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    513       "year": 2022,
    514       "doi": "10.1126/science.abq1158",
    515       "relevance": "Demonstrated LLM capabilities at competition-level code generation; early milestone for autonomous coding agents."
    516     }
    517   ]
    518 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs