scan.json (31887B)
1 { 2 "paper": { 3 "title": "REXBENCH: Can coding agents autonomously implement AI research extensions?", 4 "authors": [ 5 "Nicholas Edwards", 6 "Yukyung Lee", 7 "Yujun (Audrey) Mao", 8 "Yulu Qin", 9 "Sebastian Schuster", 10 "Najoung Kim" 11 ], 12 "year": 2025, 13 "venue": "arXiv preprint (under review)", 14 "arxiv_id": "2506.22598", 15 "doi": "10.48550/arXiv.2506.22598" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Agent implementations released at https://github.com/tinlaboratory/RexBench and benchmark at https://rexbench.com/. Gold solutions deliberately held private in Bitbucket for evaluation integrity." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Benchmark tasks, extension instructions, and original codebases are publicly available at rexbench.com. Gold solutions are held out by design for fair evaluation, which is standard for benchmarks with leaderboards." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": true, 34 "justification": "Each task has an environment.yml file with package versions (Section 3.2). Evaluation uses Apptainer containers with specified hardware (Table 2). 'We added version information for all of the packages (via an environment.yml file in the repository).'" 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": true, 39 "justification": "Section 3.5 details the full evaluation pipeline: submission format (git patches), infrastructure setup (OpenStack VMs, Apptainer containers), and execution procedure. Appendix C describes agent configurations. The website hosts submission instructions." 
40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "Figure 2: 'Error bars show standard error of the mean of all runs per model computed using the closed form formula (2σ, no normality assumption).' Figure 5 shows regression coefficients with 95% confidence intervals." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 5.2 uses a mixed-effects linear regression with significance tests (p-values reported: β = −0.036, p < 0.01 for line changes; p < .05 for repository popularity). However, agent-level performance comparisons (e.g., 'best performing agents achieving 25%') lack formal significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Regression coefficients with CIs reported in Figure 5 (e.g., β = −0.036 for line changes). Success rates are absolute values with context (e.g., '25% final success rate' out of 12 tasks × 3 runs)." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification for why 12 tasks or 3 runs per task were chosen. No power analysis. The paper acknowledges the benchmark is small but does not justify the specific sample size." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "'We run each task three times with the same agent model to account for agent random variation.' Standard error bars are shown in Figure 2. Per-task results across 3 runs shown in Tables 8-16." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Nine agent configurations compared across 3 frameworks (aider, Claude Code, OpenHands) and 4 LLM backbones. Different configurations serve as baselines for each other." 
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "All models are recent: Claude 3.7 Sonnet, o1, o4-mini, DeepSeek R1. All frameworks are current: aider, Claude Code, OpenHands. The paper evaluates the state of the art in coding agents." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The hint experiment (Section 4.2) systematically ablates information provided: no hints → localization hints → step-by-step hints. This tests which capabilities (information finding vs. planning vs. implementation) are bottlenecks." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Three metrics defined in Section 3.4: final success rate, execution success rate, and file recall. Additionally, cost and time efficiency are analyzed (Figure 4)." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "Evaluation is fully automated via numerical match with gold solutions. Section 5.1 includes manual error analysis of top-2 agents' outputs, but this is post-hoc qualitative analysis, not a systematic human evaluation metric. The paper notes this limitation: 'the cause of failure is difficult to identify' without process-level metrics." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Gold solutions are held in private Bitbucket repositories. Evaluation infrastructure is privately hosted. 'We store all the gold extensions in private Bitbucket repositories.' Agents have no access to evaluation scripts or reference solutions." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Per-task breakdowns provided in Tables 8-16 for every agent-LLM-hint combination. Results also broken down by framework, LLM backbone, and hint level. Error distribution in Table 7." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.1 extensively discusses explicit errors (empty diffs, Python errors, timeouts) and implicit errors (logic errors, value errors). Error distribution in Figure 7/Table 7. Debugging difficulty estimated per error." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Most results are negative — most agents achieve near-zero success. DeepSeek R1 achieves 0% across most configurations. The 'overthinking' failure mode is reported. Detailed hints sometimes hurt performance (e.g., Othello task drops from 100% to 0% with detailed hints)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims match results: 'all agents evaluated fail to autonomously implement the majority of the extensions' (best is 25%), 'best performance under this setting remains below 40%' (39% with hints for OpenHands + Claude). All claims in the abstract are supported." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The hint experiment (Section 4.2) is a controlled manipulation supporting causal claims about hint effects. The regression in Section 5.2 uses appropriate language ('significant negative effect') with a mixed-effects model including random effects for model identity. Ablation-style claims (hint levels) have adequate design." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper explicitly bounds scope: 'with an initial focus on Natural Language Processing (NLP) and Machine Learning (ML)' (Section 1). Limitations section notes the setting is 'idealized' and more constrained than real research. 
Acknowledges 12 tasks is a starting point for a 'community-driven effort.'" 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 5.1.1 discusses multiple explanations for failures: overthinking behavior, models varying in ability to implement different solution strategies, interaction between hints and model capabilities. Section 5.2 considers four possible sources of difficulty. Limitations discuss how controlled setting differs from real scenarios." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper explicitly discusses the proxy gap: 'Realistic research extensions tend to be quite open-ended, which makes automatic assessment challenging' (Section 3.1). They acknowledge that numerical match with gold solutions is a proxy for genuine research extension capability and discuss how the idealized setting differs from real research." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models identified by marketing names only: 'Claude 3.7 Sonnet', 'o1', 'o4-mini', 'DeepSeek R1'. No API versions, snapshot dates, or model IDs provided. The Claude system card is cited but no specific API version is used. These marketing names can map to different model checkpoints over time." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Agent prompts provided: Claude Code prompt is 'Read the instructions in instructions.md and carry out the specified task' plus 'Please do not execute any code' (Appendix C). OpenHands uses same prompt. Aider's two-stage approach described. Full task instruction example in Appendix A (WinoDict)." 
153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 4.1.1: 'we set the temperature to 0.7 for Claude 3.7 Sonnet and DeepSeek-R1, and specified reasoning effort as medium for all OpenAI models.' Max retries (5) for aider, max steps (250) for OpenHands also stated." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "Table 3 shows capabilities per framework (repo navigation, tool use, bash execution, Python execution). Appendix C provides detailed configuration for each framework. Appendix D analyzes OpenHands tool usage distribution (Figure 6). Agent interaction patterns described." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3.3 documents the benchmark construction pipeline: code replication verification → gold implementation → instruction writing → multi-round revision for clarity and self-containment. Papers converted from PDF to markdown using PyMuPDF4LLM. Environment.yml files added." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Explicit 'Limitations and broader impacts' subsection at the end of Section 6 with substantive discussion of automatic evaluation constraints, idealized setting, and missing process-level metrics." 
175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats discussed: tasks are 'much more informative and clearer than an actual task a human researcher may face'; automatic evaluation constrains task open-endedness; process-level metrics like landmark evaluation are missing; risk of 'reward hacking or gamification of the benchmark.'" 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Multiple explicit boundaries: 'primarily in the AI domain with a focus on topics aligning with the expertise of our team'; 12 tasks as 'a motivating start'; setting is more constrained than real research; agents were not given code execution capability; extensions are 'specifically-scoped questions' not open-ended exploration." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Agent log files and patch files from individual runs are not publicly released. The paper mentions requesting agent logs for submissions but does not state these are publicly available. Only aggregate results (tables, figures) are provided in the paper." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.3 describes benchmark construction in detail: domain experts verified codebase replication, implemented gold solutions, validated by co-authors, wrote instructions through multiple revision rounds. Section 4.1 describes experimental procedure." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. The benchmark consists of research extension tasks created by the author team. Task selection criteria are described in Section 3.2." 
202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "Full pipeline documented: Figure 1 shows end-to-end workflow. Section 3.5 details evaluation infrastructure (patch application → container execution → result collection → evaluation). Section 4.1 describes experimental procedure for running agents." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Acknowledgments section: 'This work was supported by funding from Good Ventures Foundation and Open Philanthropy awarded to NK and SS, from Google awarded to NK, and from WWTF through the project \"Understanding Language in Context\" (WWTF Vienna Research Group VRG23-007) awarded to SS.'" 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All author affiliations clearly listed: University College London, Boston University, University of Vienna. No authors are affiliated with the companies whose models are evaluated (Anthropic, OpenAI, DeepSeek)." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "Funders are Good Ventures/Open Philanthropy (foundation), WWTF (Austrian research fund), and Google. None have direct stake in whether specific agents succeed. Google's models are not evaluated. The research is conducted by academic researchers." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement in the paper. While there may be no conflicts, the absence of an explicit declaration is noted." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "Training data cutoff dates are not stated for any of the four models evaluated (Claude 3.7 Sonnet, o1, o4-mini, DeepSeek R1). 
The paper addresses contamination through benchmark design (novel extensions) rather than by checking model training cutoffs." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Extensively discussed in Section 3.1: 'If solutions to any of the tasks are openly available on the web, LLMs that serve as the backbone for the agents may have been trained on the solutions.' They use novel extensions stored in private Bitbucket repos to prevent overlap." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Core design principle: 'We circumvent this problem by including only novel research extensions.' Gold solutions in private Bitbucket repos. 'We use Bitbucket instead of GitHub since GitHub data has been used in the past to train LLMs.' Source papers are public, but the specific extensions and solutions are novel." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates AI coding agents on benchmark tasks." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 
283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 6 reports average cost per task and total cost for each agent-model combination (e.g., aider + o4-mini: $0.03 avg, $1.02 total). Figure 4 plots cost vs. final success on the Pareto frontier." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": true, 294 "justification": "Table 2 specifies hardware per task (A100 40GB, K80 12GB, or CPU). Table 6 reports total duration per agent. Gold solution runtimes in Table 2 (20min to 6h). 'Including preliminary and failed runs...approximately 4–5x the reported amount.'" 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": true, 301 "justification": "'We run each task three times with the same agent model to account for agent random variation.' Results in Tables 8-16 show per-run variation. Gold solutions control random seeds: 'we (1) fix all random seeds in the codebase wherever possible.'" 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "Section 4.1: 'We run each task three times with the same agent model to account for agent random variation.' Three runs explicitly stated." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search reported. Temperature set to 0.7 and reasoning effort to 'medium' without justification for these choices. No search budget or alternatives explored." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": false, 316 "justification": "The choice of temperature=0.7, reasoning effort='medium', and other agent configurations are not justified. No validation set used for selection. These appear to be default or arbitrary choices." 
317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "Section 5.2 tests four predictors in a regression model with individual p-values but no multiple comparison correction applied. Nine agent configurations are compared without correction for the number of comparisons." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors designed the benchmark and evaluate agents on it. While they evaluate third-party tools (not their own system), they do not discuss potential bias in benchmark construction (e.g., task selection, instruction design) that could systematically favor certain agent architectures." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Figure 4 explicitly plots cost vs. final success and time vs. final success, identifying Pareto-optimal agents. 'aider + o4-mini and OpenHands + Claude 3.7 Sonnet lie on the Pareto frontier for both cost and time.'" 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "Section 3.1 discusses desiderata and validity: tension between realism and automatic evaluation, calibration of instruction granularity, self-containment requirements. Limitations section discusses how the idealized setting differs from real research extension. Multiple revision rounds ensured task quality." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": true, 341 "justification": "The experimental design crosses frameworks with models: 3 frameworks and 4 LLMs, nine configurations in total (a partial rather than full 3 × 4 cross). Table 3 documents capability differences across frameworks. Results clearly show scaffold effects (e.g., same Claude 3.7 Sonnet backbone yields different results across aider/Claude Code/OpenHands). This design separates scaffold from model effects." 
342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "Core design feature: extensions are novel implementations that do not exist publicly. 'To the best of our knowledge, none of our extensions exist on top of the existing codebases publicly.' Bitbucket chosen over GitHub to avoid training data inclusion. Source papers are public but solutions are new." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether task instructions or the self-contained input design might leak implementation hints. The paper ensures self-containment (all needed info is provided) but does not discuss whether this constitutes feature leakage relative to what agents would have in realistic settings." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the 12 tasks share structural similarities that could inflate apparent generalizability. Tasks span different extension types (model, algorithm, data, evaluation) but potential dependencies between tasks are not analyzed." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": true, 363 "justification": "Concrete prevention methods: gold solutions stored in private Bitbucket repositories (not GitHub); evaluation infrastructure privately hosted; novel extensions with no public implementations. 'We use Bitbucket instead of GitHub since GitHub data has been used in the past to train LLMs.'" 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Current coding agents fail to autonomously implement the majority of research extensions, with the best agents achieving only 25% success rate.", 370 "evidence": "Figure 2 and Section 4.3: OpenHands + Claude 3.7 Sonnet and Claude Code both achieve 25% final success rate across 12 tasks × 3 runs. 
Most other configurations achieve near-zero success.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Even with additional hints, the best agent achieves only 39% success rate, and more detailed step-by-step hints bring no further gain.", 375 "evidence": "Figure 3 and Table 4: OpenHands + Claude 3.7 Sonnet with first-level hints reaches 39% final success. Second-level hints did not yield additional benefits.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Claude 3.7 Sonnet is the strongest backbone LLM for agent-based research extension tasks.", 380 "evidence": "Figure 2: Claude 3.7 Sonnet yields the best performance across frameworks — 14% (aider), 25% (Claude Code), 25% (OpenHands). o4-mini is second with 3-8% depending on framework.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The amount of code changes required (lines of gold solution) is the strongest predictor of task difficulty for agents.", 385 "evidence": "Section 5.2, Figure 5: Mixed-effects regression shows β = −0.036, p < 0.01 for line changes. Other factors (file count, citations, repository popularity) had negligible or non-significant effects.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Agents generally locate the correct files to edit but fail at implementation.", 390 "evidence": "Figure 2: File recall ranges from 0.18-0.87 across agents, generally high (0.64-0.87 for top agents), while final success rates are much lower (0-25%). The gap between file recall and final success indicates implementation, not localization, is the bottleneck.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Overthinking is a prominent failure mode for reasoning models (DeepSeek R1, o1, o4-mini).", 395 "evidence": "Section 5.1.1: 'aider + DeepSeek-R1 was especially prone to this behavior, overthinking being one of the most prominent failure modes (close to one third of total failures).' 
Apart from this single proportion, the behavior is described qualitatively, without systematic quantification across agents.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "REXBENCH is robust to data contamination by using novel extensions stored in private repositories.", 400 "evidence": "Section 3.1: Extensions are novel implementations not publicly available. Gold solutions in private Bitbucket repos. 'To the best of our knowledge, none of our extensions exist on top of the existing codebases publicly.'", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": ["benchmark-eval"], 405 "key_findings": "REXBENCH evaluates 9 LLM agent configurations on 12 research extension tasks and finds all agents struggle, with the best (OpenHands + Claude 3.7 Sonnet and Claude Code) achieving only 25% success rate. Hints improve performance to 39% but inconsistently, and detailed step-by-step hints do not help beyond information localization hints. Task difficulty is primarily driven by the amount of code changes required (β = −0.036, p < 0.01). Agents show high file recall but low execution and final success rates, indicating implementation logic rather than code navigation is the primary bottleneck.", 406 "red_flags": [ 407 { 408 "flag": "Very small benchmark", 409 "detail": "Only 12 tasks with 3 runs each — total of 36 data points per agent. This severely limits statistical power and generalizability. A single task (Othello, which swings from 0% to 100% depending on agent) can substantially shift overall success rates." 410 }, 411 { 412 "flag": "No formal agent comparison tests", 413 "detail": "Claims about which agents/models are 'best' (e.g., Claude 3.7 Sonnet as strongest backbone) are based on comparing raw percentages without significance tests. With only 12 tasks × 3 runs and high variance, many of these differences may not be statistically significant." 
414 }, 415 { 416 "flag": "Missing model version specificity", 417 "detail": "All models identified by marketing names (Claude 3.7 Sonnet, o1, o4-mini, DeepSeek R1) without API versions or snapshot dates. Model behavior changes across versions, making exact reproduction impossible." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 423 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"], 424 "year": 2024, 425 "relevance": "Foundational benchmark for LLM coding agent evaluation using real GitHub issues, directly relevant as a comparison point for agent benchmarking methodology." 426 }, 427 { 428 "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery", 429 "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"], 430 "year": 2024, 431 "arxiv_id": "2408.06292", 432 "relevance": "Attempts to automate the full research pipeline, directly relevant to the survey's coverage of AI research automation capabilities." 433 }, 434 { 435 "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", 436 "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn", "James Aung", "Jun Shern Chan"], 437 "year": 2025, 438 "arxiv_id": "2504.01848", 439 "relevance": "Most directly comparable benchmark to REXBENCH; evaluates paper replication rather than extension, relevant to research automation evaluation methodology." 440 }, 441 { 442 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 443 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 444 "year": 2025, 445 "relevance": "Open-source agent framework evaluated in this paper, relevant as a major agentic coding platform." 
446 }, 447 { 448 "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering", 449 "authors": ["Chan Jun Shern", "Neil Chowdhury", "Oliver Jaffe"], 450 "year": 2024, 451 "arxiv_id": "2410.07095", 452 "relevance": "Benchmark for ML engineering agents using Kaggle-style tasks, relevant to understanding agent capabilities in ML contexts." 453 }, 454 { 455 "title": "SUPER: Evaluating Agents on Setting Up and Executing Tasks from Research Repositories", 456 "authors": ["Ben Bogin", "Kejuan Yang", "Shashank Gupta", "Kyle Richardson"], 457 "year": 2024, 458 "relevance": "Evaluates agent ability to set up research environments, directly relevant as a benchmark for coding agents in research settings." 459 }, 460 { 461 "title": "MLAgentBench: evaluating language agents on machine learning experimentation", 462 "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"], 463 "year": 2024, 464 "relevance": "Benchmark for ML experimentation agents covering classical ML tasks and Kaggle challenges, relevant to agent capability evaluation." 465 }, 466 { 467 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 468 "authors": ["Daya Guo", "Dejian Yang"], 469 "year": 2025, 470 "arxiv_id": "2501.12948", 471 "relevance": "Open-weight reasoning model evaluated as agent backbone; relevant to understanding reasoning model capabilities for agentic tasks." 472 }, 473 { 474 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 475 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 476 "year": 2023, 477 "relevance": "Prompting methodology for deliberate problem solving with LLMs, one of the benchmark tasks tests extension of this work." 
478 }, 479 { 480 "title": "Curie: Toward Rigorous and Automated Scientific Experimentation with AI Agents", 481 "authors": ["Patrick Tser Jern Kon", "Jiachen Liu"], 482 "year": 2025, 483 "arxiv_id": "2502.16069", 484 "relevance": "Evaluates AI agents on scientific experiment planning and execution, closely related benchmark for research automation." 485 }, 486 { 487 "title": "Can LLMs generate novel research ideas? a large-scale human study with 100+ NLP researchers", 488 "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"], 489 "year": 2024, 490 "arxiv_id": "2409.04109", 491 "relevance": "Large-scale study evaluating LLM research ideation capability, relevant to understanding AI research automation potential." 492 }, 493 { 494 "title": "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks", 495 "authors": ["Frank F. Xu", "Yufan Song", "Boxuan Li"], 496 "year": 2024, 497 "arxiv_id": "2412.14161", 498 "relevance": "Benchmark for LLM agents on real-world consequential tasks, relevant to evaluating agent capabilities beyond coding." 499 } 500 ] 501 }