scan.json (31066B)
1 { 2 "paper": { 3 "title": "ResearchCodeBench: Benchmarking LLMs on Implementing Novel Machine Learning Research Code", 4 "authors": [ 5 "Tianyu Hua", 6 "Harper Hua", 7 "Violet Xiang", 8 "Benjamin Klieger", 9 "Sang T. Truong", 10 "Weixin Liang", 11 "Fan-Yun Sun", 12 "Nick Haber" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2506.02314", 17 "doi": "10.48550/arXiv.2506.02314" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "ResearchCodeBench evaluates 32 LLMs on 212 coding challenges from 20 recent ML papers, finding that even the best model (Gemini-2.5-Pro-Preview at 37.3% Scaled Pass@1) implements less than 40% of tasks correctly. Reasoning-oriented models consistently outperform standard models, and higher-performing models benefit substantially from paper context while LLaMA-based models actually perform worse with paper access. The dominant error type is functional/semantic errors (59%), indicating models struggle with algorithmic reasoning rather than syntax.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper states 'Code and data are available: https://researchcodebench.github.io/' in the abstract (footnote 2), providing a URL for the benchmark framework and data." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "Same URL provides access to the benchmark data (212 coding challenges, test cases, paper metadata). The abstract footnote says 'Code and data are available.'" 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 2.4 states 'the full benchmark executes locally in a single Docker or Conda environment' and specifies it runs on 'a standard personal laptop such as an M1 MacBook Pro.' The released code framework includes environment specifications." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": true, 43 "justification": "The paper describes the full benchmark execution pipeline (Section 2.2), provides exact prompt templates (Appendix E), and releases an 'open-source benchmarking framework that is easy to use' with code and data at the project URL." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results (Figures 2, 3, 6) report only point estimates (e.g., '37.3% success rate'). No confidence intervals, error bars, or uncertainty quantification for any model's performance." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper makes comparative claims (e.g., 'Gemini-2.5-Pro-Preview to perform best') based solely on comparing raw pass rate numbers. No statistical significance tests are used to validate any model comparison." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports absolute scores with context for comparison (e.g., '37.3% success rate' vs '32.3%' and '30.8%'), and relative improvements (e.g., 'relative improvements of up to 30% over their original scores without the paper' in Section 3.3). This provides sufficient magnitude context." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification is provided for why 20 papers and 212 tasks were chosen. No power analysis or discussion of whether this sample size is sufficient for the claims being made." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "All models are evaluated using greedy decoding with a single run. No variance, standard deviation, or spread measures are reported across any experimental conditions." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper evaluates 32 models from 10 different companies (Figure 2), including both open-source and proprietary models, providing extensive cross-model comparison." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Models evaluated are highly contemporary (Gemini-2.5-Pro-Preview-05-06, GPT-4.1, O3, O4-Mini, Claude-3.7-Sonnet, DeepSeek-R1), representing the state of the art at time of publication." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 3.3 presents a paper reliance ablation comparing model performance with and without paper context. Section 3.2 provides a contamination-safe subset analysis isolating the effect of potential data exposure." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Two metrics are reported: Scaled Pass@1 (primary metric, weighted by executable lines of code) in Figure 2, and standard (vanilla) Pass@1 in Figure 6 (Appendix F)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "All evaluation is automated using equivalence tests and unit tests. No human evaluation of model outputs is performed. The paper notes human baseline is absent due to cost constraints (Section 5)." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "The entire benchmark serves as a held-out test set — models are not tuned on any portion of it. Models are evaluated with greedy decoding on the full set of 212 tasks with no development or tuning split." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Figure 3 (right panel) provides per-paper performance breakdowns. 
Results are broken down by contamination-safe vs full benchmark, and by with/without paper context (Figure 4). Per-model error breakdowns are provided (Appendix G)." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 3.4 provides detailed error analysis with a taxonomy of 7 error categories (functional, name, type, syntax, import, attribute, index/key errors), with per-model breakdowns in Appendix G." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that LLaMA-based models perform worse when given the paper (Section 3.3, 'suggesting that long academic documents may introduce context dilution or confusion'), and that smaller models show no benefit from paper context." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims (Gemini-2.5-Pro at 37.3%, O3 at 32.3%, O4-mini at 30.8%, best models below 40%) are directly supported by Figure 2's results." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper's causal claims about paper context improving performance are supported by a controlled ablation (Section 3.3) comparing identical models with and without paper access, which is an adequate design for this claim." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper restricts claims to ML papers specifically, acknowledges 'the current benchmark includes only 20 papers and focuses exclusively on machine learning' (Section 5), and explicitly discusses plans for broader coverage as a future direction." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "Sections 3.1-3.2 discuss contamination as an alternative explanation for high scores. 
Section 3.3 explores paper reliance vs pure code completion. The paper also considers 'context dilution or confusion' for LLaMA's degraded performance with paper access." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper measures Scaled Pass@1 on fill-in-the-blank code completion with hints and surrounding context, but claims to evaluate 'ability to translate cutting-edge ML contributions from recent research papers into executable code.' The gap between fill-in-the-blank snippet completion (with visible surrounding code and hints) and the actual task of implementing novel research from scratch is not acknowledged or discussed." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Many model names are marketing names without specific API versions or snapshot dates: 'GPT-4.1', 'Claude-3.7-Sonnet', 'O3 (High)', 'DeepSeek-R1', 'Grok-3-Beta'. While some include dates (e.g., 'Gemini-2.5-Pro-Preview-05-06', 'DeepSeek-Chat-V3-0324'), most lack the snapshot-level specificity required for reproducibility." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Appendix E.1 provides the exact prompt templates used for both with-paper and without-paper settings, including the complete system instructions and formatting guidelines given to models." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3 states 'All models are evaluated using greedy decoding' which specifies the key inference parameter (temperature=0, deterministic generation). This is consistently applied across all 32 models." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. Models receive a single prompt with paper context and code, and generate a direct completion. 
There is no tool use, retry logic, or multi-step workflow." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 2.1 documents the full pipeline: paper selection criteria (top venues, well-documented contributions, author collaboration), LaTeX processing via latexpand, PDF-to-Markdown conversion via OCR, snippet annotation with XML tags, and contextual dependency identification via import analysis." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 5 ('Discussion') includes a dedicated 'Limitations and future directions' subsection discussing specific constraints of the benchmark." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 5 discusses specific threats: limited scope to 20 ML papers only, manually written test cases limiting scalability, no human baseline making it impossible to contextualize LLM performance. These are specific to this study." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 5 explicitly states 'the current benchmark includes only 20 papers and focuses exclusively on machine learning' and notes 'we do not include a human baseline,' clearly delineating what the results do and do not show." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The benchmark code, data, coding challenges, and test cases are released at https://researchcodebench.github.io/, allowing independent verification and re-evaluation of results." 
196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 2.1 describes the data collection procedure in detail: paper selection from top venues, identification of core contributions, contextual dependency analysis, snippet annotation with XML tags, and test case co-development with paper authors or domain experts." 201 }, 202 "recruitment_methods_described": { 203 "applies": true, 204 "answer": true, 205 "justification": "Section 2.1 describes paper selection: 'We begin by selecting 20 recent ML papers from top-tier conferences... We prioritize papers whose core contributions are both well-documented in the paper and cleanly implemented in their open-source repositories. Additionally, we consider whether the authors may be interested in collaborating.'" 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "Section 2.1 documents each pipeline stage: paper selection → core contribution identification → contextual code extraction → snippet annotation → task construction → test creation. The pipeline from paper to benchmark task is fully described." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "The Acknowledgments section states 'This work was supported in part by the National Science Foundation under Grant No. 2302701, and by the Stanford HAI Hoffman-Yee Research Grant.'" 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All authors are listed as affiliated with Stanford University. The paper does not evaluate any Stanford-affiliated AI product, so there is no undisclosed conflict." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "Funding comes from NSF and Stanford HAI, neither of which has a financial interest in which LLM performs best on the benchmark. 
The funders are independent of the outcome." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement is present in the paper. There is no declaration of financial interests, patents, or equity related to the findings." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": true, 239 "justification": "Section 3.1 discusses model knowledge cutoff dates extensively, with Figure 3 (left panel) showing 'the most recent known cutoff date for each company's models' as horizontal colored lines ranging from August 2023 to January 2025." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": true, 244 "justification": "Sections 3.1-3.2 extensively analyze potential train/test overlap by comparing model cutoff dates with repository commit dates. The authors identify that '13 out of 20 have their first commits in 2025, after the most recent Gemini-2.5-Pro-Preview's knowledge cutoff date.'" 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": true, 249 "justification": "Section 3.2 creates a 'contamination-safe subset of 13 papers whose initial commits occurred after the latest known model cutoff (January 2025)' and reports separate results for this subset, demonstrating concrete contamination mitigation." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved. The study evaluates LLMs on automated coding tasks." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved in the study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved." 
267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants or experimental conditions requiring randomization." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants or evaluators requiring blinding." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants are involved." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "The paper reports evaluation time ('each task takes just 1.25 seconds to evaluate') but does not report API inference costs, total tokens consumed, or per-model inference costs for running 32 models across 212 tasks." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No total computational budget is stated. The paper does not report total API spend, total inference time across all models, or hardware requirements for running the full evaluation suite of 32 models." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "All models are evaluated with a single greedy decoding pass. No analysis of sensitivity to random seeds, sampling temperature, or other sources of variance." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "The paper states 'All models are evaluated using greedy decoding' (Section 3) and reports 'scaled pass@1... computed over the first (i.e., top-1) model completion' (Section 2.2), clearly indicating a single deterministic run per model." 
311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper uses greedy decoding without discussing whether other decoding strategies (temperature sampling, nucleus sampling, multiple completions) were explored. No hyperparameter search budget is reported." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "The paper uses a single fixed configuration (greedy decoding, pass@1) for all models, explicitly described in Section 2.2 and Section 3. There is no selection from multiple configurations, eliminating cherry-picking concerns." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "The paper compares 32 models and makes ranking claims without any statistical tests or corrections for multiple comparisons. All comparisons are based on raw pass rate differences." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors created the benchmark and evaluate all models on it without acknowledging potential bias in benchmark design (e.g., whether task construction or test design might favor certain model families). The concern from Lucic et al. about author-evaluation bias is not discussed." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Reasoning models like O3 (High) and O4-Mini (High) likely consume significantly more inference-time compute than standard models like GPT-4o, but performance is not compared at matched compute budgets. The compute disparity is not discussed." 
336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper does not discuss whether fill-in-the-blank code completion with hints and surrounding context actually measures the claimed capability of 'translating novel ML contributions into executable code.' The gap between the task design and the broader research implementation skill is not analyzed." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is used. All models receive the same direct prompt and generate a single completion. There is no scaffolding confound to address." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "Sections 3.1-3.2 extensively analyze temporal leakage by comparing model training cutoff dates with repository first-commit dates, and creating a contamination-safe subset of 13 papers with commits after all model cutoffs." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The paper provides hints and surrounding code context to models but does not analyze whether these features leak information about the expected solution. The interplay between visible context code and the masked snippet is not discussed as a potential leakage source." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "The 212 tasks come from only 20 papers, with multiple tasks per paper. Tasks from the same paper likely share structural similarities and dependencies, but this non-independence is not discussed or addressed in the evaluation." 
363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": true, 367 "justification": "The paper uses a concrete temporal-based detection method: comparing model knowledge cutoff dates with repository first commit dates (Figure 3 left panel), and creating a contamination-safe subset for comparison (Section 3.2)." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Even the best LLMs correctly implement less than 40% of the benchmark tasks (Gemini-2.5-Pro-Preview at 37.3% Scaled Pass@1).", 374 "evidence": "Figure 2 shows Scaled Pass@1 results for 32 models, with Gemini-2.5-Pro-Preview-05-06 achieving the highest score of 37.3%.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Reasoning-oriented models (Gemini-2.5-Pro, O3, O4-mini) consistently outperform standard models, even on the contamination-safe subset.", 379 "evidence": "Figure 2 shows reasoning models ranked highest. Figure 3 (right panel) shows the advantage persists on the contamination-safe 13-paper subset.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "All models show a noticeable performance drop on the contamination-safe subset, suggesting most tasks are genuinely novel.", 384 "evidence": "Figure 3 (right panel) shows consistent performance drops across all 32 models when evaluated on the 13-paper contamination-safe subset vs. the full 20-paper benchmark.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Higher-performing models benefit substantially from paper context, with relative improvements up to 30%.", 389 "evidence": "Figure 4 (left panel) shows per-model differences in Scaled Pass@1 with vs. without paper context. 
Top models like O4-Mini and Gemini-2.5-Pro show the largest positive deltas.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "LLaMA-based models perform worse when given the research paper, suggesting long academic documents introduce context dilution.", 394 "evidence": "Figure 4 (left panel) shows all LLaMA models (Llama-4-Maverick, Llama-4-Scout, Llama-3.3-70B) have negative deltas, performing worse with paper context.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Functional/semantic errors dominate at 58.6% of all failures, indicating the primary challenge is algorithmic reasoning rather than syntax.", 399 "evidence": "Figure 5 shows the error distribution. Functional errors account for 59%, followed by name errors (9%), type errors (9%), and syntax errors (8%).", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No statistical uncertainty quantification", 406 "detail": "All 32 models are compared using single-run point estimates with greedy decoding. No confidence intervals, error bars, or significance tests support the model rankings. Small differences between closely-ranked models cannot be distinguished from noise." 407 }, 408 { 409 "flag": "Error classification using LLM", 410 "detail": "The paper uses GPT-4o-Mini to classify errors (Section 3.4) while arguing against LLM-based evaluation in Section 2.1 ('LLM judges for code are often evaluated using tests themselves'). The reliability of this LLM-based error classification is not validated." 411 }, 412 { 413 "flag": "Selection bias in paper choice", 414 "detail": "Papers were selected partly based on 'whether the authors may be interested in collaborating on the benchmark' (Section 2.1). This introduces selection bias: papers whose authors collaborate may have cleaner, better-documented code, potentially making tasks easier or harder in non-representative ways." 
415 }, 416 { 417 "flag": "Small and narrow benchmark", 418 "detail": "Only 20 ML papers and 212 tasks. The paper draws broad conclusions about LLM research code capability from a very limited sample that covers only machine learning, with no diversity in scientific domain." 419 }, 420 { 421 "flag": "No human baseline", 422 "detail": "Without a human baseline, it is impossible to assess whether 37.3% represents a meaningful gap from human capability or whether the tasks are inherently difficult even for expert humans. The paper acknowledges this limitation but draws conclusions about LLM capability regardless." 423 }, 424 { 425 "flag": "Non-independence of tasks", 426 "detail": "212 tasks drawn from only 20 papers means roughly 10 tasks per paper on average. Tasks from the same paper share code context, difficulty characteristics, and domain. This clustering is not accounted for in the evaluation, potentially inflating or deflating performance estimates." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "SWE-bench: Can language models resolve real-world github issues?", 432 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 433 "year": 2024, 434 "relevance": "Major code generation benchmark evaluating LLMs on resolving real-world GitHub issues, directly comparable benchmark in the LLM coding evaluation space." 435 }, 436 { 437 "title": "MLE-bench: Evaluating machine learning agents on machine learning engineering", 438 "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"], 439 "year": 2024, 440 "arxiv_id": "2410.07095", 441 "relevance": "Benchmark evaluating LLM agents on ML engineering tasks, closely related benchmark for assessing AI capability in research-adjacent coding." 
442 }, 443 { 444 "title": "PaperBench: Evaluating AI's ability to replicate AI research", 445 "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn"], 446 "year": 2025, 447 "relevance": "Benchmark for AI replication of ML research papers end-to-end, the most directly comparable benchmark to ResearchCodeBench but using LLM-based judging rather than execution-based tests." 448 }, 449 { 450 "title": "SciCode: A research coding benchmark curated by scientists", 451 "authors": ["Minyang Tian", "Luyu Gao", "Shizhuo Dylan Zhang"], 452 "year": 2024, 453 "relevance": "Scientific coding benchmark emphasizing domain-specific research code generation, directly relevant to evaluating LLM capability in research contexts." 454 }, 455 { 456 "title": "SUPER: Evaluating agents on setting up and executing tasks from research repositories", 457 "authors": ["Ben Bogin", "Kejuan Yang", "Shashank Gupta"], 458 "year": 2024, 459 "arxiv_id": "2409.07440", 460 "relevance": "Benchmark evaluating LLM agents on setting up and executing research code from repositories, addressing computational reproducibility of research." 461 }, 462 { 463 "title": "MLAgentBench: Evaluating language agents on machine learning experimentation", 464 "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"], 465 "year": 2024, 466 "relevance": "Benchmark for LLM agents performing ML experimentation tasks, relevant to understanding agentic AI capability in research settings." 467 }, 468 { 469 "title": "CORE-Bench: Fostering the credibility of published research through a computational reproducibility agent benchmark", 470 "authors": ["Zachary S Siegel", "Sayash Kapoor", "Nitya Nadgir", "Benedikt Stroebl", "Arvind Narayanan"], 471 "year": 2024, 472 "arxiv_id": "2409.11363", 473 "relevance": "Benchmark focused on computational reproducibility of research, directly relevant to research code quality and methodology assessment." 
474 }, 475 { 476 "title": "MLGym: A new framework and benchmark for advancing AI research agents", 477 "authors": ["Deepak Nathani", "Lovish Madaan", "Nicholas Roberts"], 478 "year": 2025, 479 "relevance": "Framework and benchmark for AI research agents performing ML tasks, relevant to evaluating agentic AI capability in research workflows." 480 }, 481 { 482 "title": "The AI Scientist: Towards fully automated open-ended scientific discovery", 483 "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"], 484 "year": 2024, 485 "arxiv_id": "2408.06292", 486 "relevance": "System for automated scientific discovery including code generation, directly relevant to AI capability in research code implementation." 487 }, 488 { 489 "title": "CodeJudge: Evaluating code generation with large language models", 490 "authors": ["Weixi Tong", "Tianyi Zhang"], 491 "year": 2024, 492 "doi": "10.18653/v1/2024.emnlp-main.1118", 493 "relevance": "LLM-as-judge approach for code evaluation, relevant to methodology of evaluating AI-generated code quality." 494 }, 495 { 496 "title": "Can LLMs generate novel research ideas? A large-scale human study with 100+ NLP researchers", 497 "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"], 498 "year": 2025, 499 "relevance": "Large-scale evaluation of LLM capability in research idea generation, directly relevant to understanding AI's role in scientific research." 500 }, 501 { 502 "title": "Towards an AI co-scientist", 503 "authors": ["Juraj Gottweis", "Wei-Hung Weng", "Alexander Daryin"], 504 "year": 2025, 505 "arxiv_id": "2502.18864", 506 "relevance": "System exploring AI as a research collaborator, relevant to the broader context of AI capability in scientific research workflows." 507 } 508 ] 509 }