scan.json (30870B)
1 { 2 "paper": { 3 "title": "RE-Bench: Evaluating frontier AI R&D capabilities of language model agents against human experts", 4 "authors": ["Hjalmar Wijk", "Tao Lin", "Joel Becker", "Sami Jawhar", "Neev Parikh", "Thomas Broadley", "Lawrence Chan", "Michael Chen", "Josh Clymer", "Jai Dhyani", "Elena Ericheva", "Katharyn Garcia", "Brian Goodrich", "Nikola Jurkovic", "Holden Karnofsky", "Megan Kinniment", "Aron Lajko", "Seraphina Nix", "Lucas Sato", "William Saunders", "Maksym Taran", "Ben West", "Elizabeth Barnes"], 5 "year": 2024, 6 "venue": "arXiv.org", 7 "arxiv_id": "2411.15114", 8 "doi": "10.48550/arXiv.2411.15114" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "qualitative"], 13 "key_findings": "RE-Bench introduces 7 hand-crafted ML research engineering environments with data from 71 attempts by 61 human experts. AI agents (o1-preview, Claude 3.5 Sonnet) achieve 4x higher scores than humans at 2-hour budgets, but humans surpass agents at 8 hours and achieve 2x agent scores at 32 hours. Agents generate and test solutions 10x faster than humans but struggle with long-horizon agency, variety in approaches, and recovering from failures.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Environments are open-sourced at github.com/METR/ai-rd-tasks, agent trajectories at transcripts.metr.org. The abstract states 'We open-source the evaluation environments, human expert data, analysis code and agent trajectories.' The footnote notes analysis code and anonymized human data are 'coming soon.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Agent trajectories are released at transcripts.metr.org. Anonymized human expert data is stated as 'coming soon' but agent data is available. The environments themselves are open-sourced." 
25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Hardware specs are given per environment (e.g., '4×H100, 1×52 core CPU, 400 GB RAM') but no requirements.txt, Dockerfile, or detailed software dependency list is provided in the paper itself. The authors reference the Vivaria platform but do not provide a full environment specification." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While environments are open-sourced and the paper describes the evaluation procedure, there are no step-by-step reproduction instructions in the paper. The open-source release may contain these, but the paper itself lacks a 'Reproducing Results' section with specific commands." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Figure 2 shows '95% confidence intervals generated via percentile bootstrapping.' Shaded regions on the main comparison plots represent confidence intervals." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No formal statistical significance tests are reported. Comparisons between agents and humans are made by comparing score@k curves and averages without p-values or statistical tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports concrete effect sizes: 'best AI agents achieve a score 4× higher than human experts when both are given a total time budget of 2 hours' and 'achieving 2× the score of the top AI agent when both are given 32 total hours.' These provide magnitude context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper uses 7 environments and 71 human attempts by 61 experts but provides no justification for these sample sizes. No power analysis is discussed. 
The paper acknowledges noise from the small number of environments (Section 6.3) but does not justify the chosen N." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Variance is reported in multiple forms: Figure 4 shows percentile ranges of human performance, histograms in Appendix G show score distributions per environment, and confidence intervals via bootstrapping are shown in main results. Table 6 reports standard deviation of starting scores." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Human expert performance serves as the primary baseline. Multiple agent configurations (two scaffolds, multiple models) are compared. Starting solutions provide a zero-score baseline." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper evaluates o1-preview and Claude 3.5 Sonnet (claude-3-5-sonnet-20241022), which were frontier models at time of writing. AIDE scaffold was chosen as it 'performed the best in recent studies of MLE-bench.'" 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper compares multiple scaffolds (Modular vs AIDE), multiple models, multiple time allocations (30min vs 2h vs 8h), and best-of-k with varying k. This serves as a form of ablation over key design dimensions." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: normalized score per environment, average normalized score across environments, score@k at various time budgets, per-environment breakdowns, and cost comparisons (Figure 11)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Human expert performance is the core comparison. 
Additionally, agent runs were manually inspected: 'we only carefully inspected the 2 best performing runs of each task' and cheating was manually checked (Appendix A.2)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Most environments use held-out test sets separate from what the agent can observe. For example, Scaffolding for Rust CodeContests uses 'a held-out test set' of 175 problems. The paper acknowledges the overfitting concern for environments that show test scores to agents (Section 6.3)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Figure 9 provides per-environment breakdowns. Appendices G-I provide detailed per-environment histograms, score@k curves, and time series for all 7 environments." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.2 extensively analyzes agent failures: lack of variety in solutions, misunderstandings of instructions, stubborn incorrect assumptions, difficulty recovering from failures. Annotated agent failure transcripts are provided in Appendix D (Figures 13-15)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that most agent runs score close to 0, that agents accumulate 'more issues and false assumptions than useful insights,' and that specific environments like Scaling Law Experiment involve agent guesswork rather than skill (Section 6.3)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims (4x score at 2h, humans exceeding at 8h, 2x at 32h, agents generating solutions 10x faster, Triton kernel result) are all supported by figures and data in the paper (Figure 2, Section 5.1)." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper is careful about causal language. Claims are comparative ('agents achieve a score 4× higher') based on controlled comparisons under equivalent conditions. The paper explicitly notes limitations (Section 6) and avoids strong causal claims about why agents perform differently." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 6.1 extensively bounds generalization: 'We expect the human–AI gap in real-world AI R&D to be much larger than the gap observed on these evaluations.' Table 7 explicitly compares RE-Bench scale to real AI R&D across multiple dimensions. The paper states these results should not imply agents can automate real AI R&D." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Sections 6.1-6.2 discuss multiple alternative explanations: agents may be cheaper per unit, different workflows could change results, environments may overestimate agent capability due to limited scope, or underestimate due to cost advantages. Section 5.1 discusses noise/overfitting as an alternative explanation for high scores." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper is explicit that RE-Bench measures performance on short self-contained tasks, not real AI R&D automation. Section 6.1 and Table 7 lay out exactly why benchmark performance is a proxy and what it does not capture (long time horizons, engineering complexity, coordination)." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Exact model versions are specified: 'claude-3-5-sonnet-20241022' and 'o1-preview' and 'claude-3-5-sonnet-20240620' (Section 4). GPT-3.5-turbo-0125 is specified for the scaffolding task." 
143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full task prompts/instructions are provided for all 7 environments in Appendix C. System prompts for the agent scaffolding are described in detail in Appendix A.2.1." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Temperature, top-p, and other sampling parameters for the model API calls are not reported. The paper describes the scaffold architecture but does not state LLM sampling hyperparameters used for agent generation." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Both scaffolds are described in detail in Appendix A.2.1: Modular agent (system prompt, tool access, context management, timeout handling) and AIDE (modifications for compatibility). The scaffolding code modifications are enumerated." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Score normalization is documented (Section 3.2.2). Human expert selection criteria are described (Section 3.4). Agent run procedures, clock pausing for API issues, and cheating detection procedures are described (Appendix A)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6.3 'Other Limitations' is a dedicated subsection. Sections 6.1 and 6.2 also discuss over/underestimation of capabilities extensively." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper discusses specific threats: noisy agent results from rare successes, Finetune GPT-2 for QA having high variance, agent solutions being overfitted to test scores, Scaling Law Experiment scores involving luck, and environments having unrepresentative design constraints." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Table 7 explicitly compares RE-Bench to real AI R&D across dimensions. Section 6.1 states: 'we find it fairly plausible that the first agents that match top human performance in these environments may still be far from capable of AI R&D automation.' Section 3.3 identifies specific missing skill areas." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Agent trajectories are available at transcripts.metr.org. The paper states 'We open-source the evaluation environments, human expert data, analysis code and agent trajectories.' Anonymized human expert data is noted as 'coming soon.'" 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Human data collection is described in detail: SSH into VMs, 8-hour limit, instructions to be 'greedy,' allowed tools, frequent commits, score logging, research logs, post-run manual inspection (Appendix A.1). Agent data collection is described in Appendix A.2." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Three recruitment sources are described: professional networks of METR staff, ML RS/RE hiring process applicants, and graduate student outreach (Section 3.4, Table 5). Selection criteria for each source are stated. The paper notes meaningful score differences between sources." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from raw scores to normalized results is documented: scoring function records timestamped entries, log-linear interpolation is used for time curves, normalization formula is given (Section 3.2.2), bootstrapping for confidence intervals is described. Cheating detection and invalid run filtering procedures are described." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding section or acknowledgment of financial support is present. The work is by METR (Model Evaluation and Threat Research) but no funding sources are disclosed." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: METR is the primary affiliation, with footnotes for Joel Becker (Qally's), Josh Clymer and Aron Lajko (Redwood Research), Jai Dhyani (Independent), Nikola Jurkovic (Harvard), and Holden Karnofsky (Anthropic/Carnegie Endowment)." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. METR is an AI safety organization that has an organizational interest in demonstrating the value of AI R&D evaluations, which is the core contribution of this paper. Holden Karnofsky is affiliated with Anthropic, whose model is evaluated." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates o1-preview and Claude 3.5 Sonnet on novel benchmark tasks but does not state the training data cutoff dates for these models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "The paper argues contamination is mitigated by novelty: environments are 'novel, hand-crafted' (Section 1), Table 1 lists 'Novel, non-contaminated' as a criterion and marks RE-Bench as satisfying it. Appendix B.1 discusses novelty of reference solutions." 
236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "The paper explicitly addresses contamination by designing novel environments. Table 1 marks RE-Bench as novel/non-contaminated. The creation process (Section 3.2.3) involved hand-crafting environments that are not directly available online, and Section 2.2 discusses contamination risk as a design concern." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": true, 246 "answer": false, 247 "justification": "No pre-registration is mentioned. The study involves 61 human experts as participants but there is no link to any pre-registration." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": true, 251 "answer": false, 252 "justification": "No IRB or ethics board approval is mentioned despite involving 71 human expert attempts." 253 }, 254 "demographics_reported": { 255 "applies": true, 256 "answer": true, 257 "justification": "Table 5 breaks down experts by source. Section 3.4 describes experience levels: professional network experts have '5+ years experience' or worked at 'Google DeepMind, Google, Anthropic, OpenAI, FAR Labs, Redwood Research.' Hiring applicants passed CV/CodeSignal screens. Graduate students are from UCB, CMU, Stanford, MIT. Appendix F shows experience measures by score quintile." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": true, 261 "answer": true, 262 "justification": "Selection criteria are described for each source: professional network experts needed 5+ years relevant experience or recent work at top ML orgs; hiring applicants passed CV screen, CodeSignal, interview, and short task; graduate students were selected for relevant advisors or being in ML PhD programs at specific universities (Section 3.4)." 
263 }, 264 "randomization_described": { 265 "applies": true, 266 "answer": false, 267 "justification": "The paper states 'We attempted to match experts to environments where they had relevant experience, whilst trying to maintain a balanced distribution' but does not describe a formal randomization procedure. Assignment was purposive, not randomized." 268 }, 269 "blinding_described": { 270 "applies": true, 271 "answer": false, 272 "justification": "No blinding is described. Participants were told context about what METR was trying to accomplish (Appendix A.1). No discussion of whether this knowledge could affect performance." 273 }, 274 "attrition_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "The paper reports 71 attempts by 61 experts but does not discuss whether any experts dropped out, were excluded from analysis, or failed to complete their sessions." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section 6.2 reports: 'agents use ~29M input tokens and ~499K output tokens in each 8-hour run, at a cost of approximately $123' compared to '$1,855' paid to human experts on average. Figure 11 shows results by token/labor cost." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Hardware is specified per environment (Appendix C): e.g., '4×H100, 1×52 core CPU, 400 GB RAM.' Token usage and costs are reported. Time budgets (8h, 30min, 2h) are explicitly stated. Number of runs per configuration is reported." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "The paper reports results across multiple independent runs (3-5 per environment per agent for 8h runs, up to 128 for 30min runs). Score distributions and variance across runs are shown in Appendix G histograms and Figures 7-8." 
297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Number of runs is stated: '3 to 5 8-hour attempts per environment' (Section 4.1), and specific k values for best-of-k experiments are shown in Figures 6-8. Table 5 reports 71 total human runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported for the agent scaffolding. The paper notes 'Preliminary studies found that o1-preview performed poorly in the Modular scaffold' but does not quantify how many configurations were tried." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper transparently reports results for all configurations tested (both scaffolds, multiple models, multiple time allocations) rather than cherry-picking. Figure 2 aggregates using the best allocation found for each agent, which is clearly stated." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "METR designed the benchmark and evaluated agents on it. The paper does not explicitly discuss the bias of being both benchmark creators and evaluators. However, they do note that better scaffolds could improve results (Section 6.3)." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Figure 11 explicitly shows 'Best score@k results by token cost for AI agents and by money paid to human experts.' Figure 2 shows performance vs total time budget. The paper extensively discusses compute/cost tradeoffs between agents and humans." 
327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 3.3 maps environments to AI R&D skills identified by Epoch AI's survey (Table 4), and explicitly identifies gaps (missing areas like distributed training, research direction setting). Section 6.1 and Table 7 discuss why RE-Bench may over/underestimate real AI R&D capability." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "The paper explicitly evaluates two different scaffolds (Modular and AIDE) and reports how results differ between them. Section 6.3 acknowledges 'Different agent scaffolds or prompts might be able to achieve a much better score.' Results are reported separately per scaffold." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "The paper addresses temporal leakage by designing novel environments that did not exist online before the evaluation. Table 1 marks RE-Bench as 'Novel, non-contaminated.' Environments were hand-crafted specifically for this evaluation." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "Section 6.3 discusses that agent solutions may be overfitted because 'all environments except Scaling Law Experiment provide the agents with the test score output.' The paper acknowledges this as a form of information leakage and plans to use validation-only scoring in future iterations." 349 }, 350 "non_independence_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "Non-independence between train and test data is not a primary concern here since the benchmark consists of novel environments, not a dataset split from a larger corpus." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No formal leakage detection method (canary strings, membership inference, etc.) is used. The mitigation is entirely through novelty of environments rather than detection." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Best AI agents achieve a score 4× higher than human experts when both are given a total time budget of 2 hours per environment.", 365 "evidence": "Figure 2 shows score@k results by total time budget. At 2 hours, agents significantly outperform humans across the aggregate metric.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Humans narrowly exceed top AI agent scores given an 8-hour budget, and achieve 2× the score of the top AI agent at 32 total hours.", 370 "evidence": "Figure 2 shows human performance crossing agent performance around 8 hours and reaching approximately 2× agent scores at 32 hours.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "AI agents generate and test solutions over ten times faster than humans.", 375 "evidence": "Section 5.1: 'AIDE and modular agents run score 36.8 and 25.3 times per hour respectively, while human experts only do so 3.4 times.'", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Both o1-preview and Claude 3.5 Sonnet found solutions to the kernel optimization problem that beat all 9 human experts.", 380 "evidence": "Section 5.1 and Figure 18 show agent solutions achieving faster kernel execution times than all human experts.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Agents in the Modular scaffold do significantly better over many short 30-minute attempts, while AIDE agents do best with fewer 2-hour attempts.", 385 "evidence": "Figure 6 directly compares three time allocation strategies across scaffolds.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "82% of expert attempts achieve a non-zero score and 24% match or exceed the reference solutions.", 390 
"evidence": "Table 6 shows per-environment data on experts scoring > 0.05, and the text states these aggregate statistics.", 391 "supported": "strong" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Small number of environments", 397 "detail": "Only 7 environments are used, acknowledged by the authors as a significant source of noise. Results may not generalize to the broader space of AI R&D tasks." 398 }, 399 { 400 "flag": "Non-random expert-task assignment", 401 "detail": "Experts were matched to environments based on relevant experience rather than random assignment. This introduces potential selection bias and makes aggregate human performance estimates harder to interpret." 402 }, 403 { 404 "flag": "Test score leakage to agents", 405 "detail": "6 of 7 environments show test scores to agents during evaluation, potentially enabling overfitting. The Finetune GPT-2 for QA result dropped from 0.88 to 0.69 upon re-running, indicating overfitting to evaluation noise." 406 }, 407 { 408 "flag": "Heterogeneous expert quality", 409 "detail": "Average scores differ substantially by recruitment source (0.48 for hiring applicants vs 0.98 for professional network), making the 'average human expert' a potentially misleading comparison point." 410 }, 411 { 412 "flag": "METR evaluates its own benchmark", 413 "detail": "METR designed the benchmark, selected the environments, recruited the experts, and ran the evaluations. No independent replication or evaluation is provided." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Evaluating large language models trained on code", 419 "authors": ["M. Chen"], 420 "year": 2021, 421 "relevance": "Foundational work on code generation evaluation (Codex/HumanEval), introduces pass@k metric referenced in this paper." 422 }, 423 { 424 "title": "MLE-bench: Evaluating machine learning agents on machine learning engineering", 425 "authors": ["J.S. 
Chan"], 426 "year": 2024, 427 "arxiv_id": "2410.07095", 428 "relevance": "Directly comparable ML engineering benchmark for AI agents; AIDE scaffold was selected based on its MLE-bench performance." 429 }, 430 { 431 "title": "SWE-bench: Can language models resolve real-world github issues?", 432 "authors": ["C.E. Jimenez"], 433 "year": 2023, 434 "relevance": "Major code generation benchmark; paper discusses its feasibility issues as motivation for RE-Bench design." 435 }, 436 { 437 "title": "GPQA: A graduate-level google-proof q&a benchmark", 438 "authors": ["D. Rein"], 439 "year": 2023, 440 "relevance": "Benchmark with human comparisons used in frontier model evaluation." 441 }, 442 { 443 "title": "The AI scientist: Towards fully automated open-ended scientific discovery", 444 "authors": ["C. Lu"], 445 "year": 2024, 446 "arxiv_id": "2408.06292", 447 "relevance": "Directly relevant work on AI agents automating scientific research." 448 }, 449 { 450 "title": "MLAgentBench: Evaluating language agents on machine learning experimentation", 451 "authors": ["Q. Huang"], 452 "year": 2024, 453 "arxiv_id": "2310.03302", 454 "relevance": "Prior benchmark for ML agent capabilities, compared in Table 1." 455 }, 456 { 457 "title": "Sabotage evaluations for frontier models", 458 "authors": ["J. Benton"], 459 "year": 2024, 460 "arxiv_id": "2410.21514", 461 "relevance": "AI safety evaluation work on autonomous agent risks, directly motivates RE-Bench." 462 }, 463 { 464 "title": "OpenDevin: An open platform for AI software developers as generalist agents", 465 "authors": ["X. Wang"], 466 "year": 2024, 467 "arxiv_id": "2407.16741", 468 "relevance": "AI agent platform for software development tasks." 469 }, 470 { 471 "title": "GAIA: a benchmark for general AI assistants", 472 "authors": ["G. Mialon"], 473 "year": 2023, 474 "arxiv_id": "2311.12983", 475 "relevance": "General AI assistant benchmark with human comparisons, compared in Table 1." 
476 }, 477 { 478 "title": "WebArena: A realistic web environment for building autonomous agents", 479 "authors": ["S. Zhou"], 480 "year": 2024, 481 "arxiv_id": "2307.13854", 482 "relevance": "Realistic agent evaluation environment with human comparisons." 483 }, 484 { 485 "title": "Interviewing AI researchers on automation of AI R&D", 486 "authors": ["D. Owen"], 487 "year": 2024, 488 "relevance": "Epoch AI survey on AI R&D automation used to assess RE-Bench coverage of key AI R&D skills (Table 4)." 489 }, 490 { 491 "title": "DISCOVERYWORLD: A virtual environment for developing and evaluating automated scientific discovery agents", 492 "authors": ["P. Jansen"], 493 "year": 2024, 494 "arxiv_id": "2406.06769", 495 "relevance": "Scientific discovery agent evaluation environment with human comparisons." 496 } 497 ] 498 }