scan.json (31932B)
1 { 2 "paper": { 3 "title": "Measuring AI Ability to Complete Long Software Tasks", 4 "authors": ["Thomas Kwa", "Ben West", "Joel Becker", "Amy Deng", "Katharyn Garcia", "Max Hasin", "Sami Jawhar", "Megan Kinniment", "Nate Rush", "Sydney Von Arx", "Ryan Bloom", "Thomas Broadley", "Haoxing Du", "Brian Goodrich", "Nikola Jurkovic", "Luke Harold Miles", "Seraphina Nix", "Tao Lin", "Chris Painter", "Neev Parikh", "David Rein", "Lucas Jun Koba Sato", "Hjalmar Wijk", "Daniel M. Ziegler", "Elizabeth Barnes", "Lawrence Chan"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2503.14499" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": false, 16 "justification": "Section I states 'Code and data to reproduce some of the core figures in this paper will be provided in the supplementary material' — future tense, not yet released. No repository URL is provided." 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "The paper states data will be provided in supplementary material (Appendix I) but no actual download link or dataset is provided. HCAST tasks are noted as available 'on request' (Appendix B.1.1 footnote 12). SWAA tasks are not publicly released." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using Vivaria platform and various APIs but does not specify environment details for reproduction." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": false, 31 "justification": "No step-by-step reproduction instructions are provided. Appendix I notes code will be provided in supplementary material but offers no concrete instructions." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": true, 38 "justification": "95% CI is reported for the main time horizon trend via hierarchical bootstrap (Figure 1, Section 3.2). The shaded region in figures represents 95% CI calculated by bootstrapping over task families, tasks, and runs." 39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": true, 43 "justification": "The paper reports p = 0.006 for o3 lying above the long-run trend (Section 3.2). Bootstrap tests are used for confidence intervals." 44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "Effect sizes are reported in context: doubling time of 207 days (95% CI 166-240 days), o3 has 110-minute time horizon vs GPT-2's 2-second horizon, 80% time horizons are 4-6x shorter than 50% horizons (Section 3.2.1). R² values reported (0.83, 0.80)." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "No power analysis or explicit justification for why 170 tasks, 12 frontier models, or ~800 baselines were chosen. The paper acknowledges limited models ('only seven frontier models in this time span' for 2024-2025) but does not formally justify sample sizes." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Variance is reported via hierarchical bootstrap with 10,000 samples over task families, tasks, and runs (Section 3.2). Error bars shown on figures. Run-to-run variance analyzed in Figure 6 sensitivity analysis." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "Human baselines are included throughout: 800+ baselines totaling 2,529 hours (Section 2.2). 12 frontier models compared against each other and against human performance." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": true, 70 "justification": "Models span 2019-2025, including contemporary frontier models: o3, Claude 3.7 Sonnet, o1, Claude 3.5 Sonnet (New). The most recent models are from early 2025." 71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": true, 75 "justification": "Extensive ablation/sensitivity analysis in Section 5 and Appendix H: alternative curve fits, task family weighting, regularization, baseline noise, task suite composition (removing RE-Bench), continuous vs binary scoring." 76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "Multiple metrics used: 50% time horizon, 80% time horizon (Section 3.2.1), average success rate (Figure 3), per-task-family success rates, messiness-adjusted performance, continuous scoring (Appendix H), cost per successful run (Figure 14)." 81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": true, 85 "justification": "Human evaluation is included for internal PR tasks (Section C.2): repository maintainers manually scored model and baseliner solutions on a 0-1 scale as if reviewing PRs. Qualitative analysis of 63 failed runs was done manually (Section 3.3, Appendix D)." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "SWAA tasks were developed blind to AI performance (Appendix B.1.3): 'all tasks were written before seeing AI attempts, and elicitation was carried out on a separate development task suite.' Agent scaffolds were developed on held-out dev tasks not in HCAST (Appendix C.3.1)." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "Results broken down by task source (Table 7: HCAST, RE-Bench, SWAA), by task length buckets (Figure 3 right, Figure 4), by messiness score (Figure 12), and per-model (Figure 3 left)." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Extensive failure analysis in Section 3.3 and Appendix D: 31 GPT-4 and 32 o1 failed runs categorized into failure types (Table 3.3). Specific examples of failure modes provided (poor planning, incorrect reasoning, premature abandonment, repeating failed actions)." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "Multiple negative results: models struggle on 'messier' tasks (Section F.2), performance much lower on less structured tasks, AI agents fail to proactively seek information (Section D.2.2), internal PR results show models scoring 0 on several issues (Table 4)." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Abstract claims match results: 50% time horizon of ~110 minutes for o3 (Section 3.2), doubling every ~7 months (207 days, Section 3.2), improvements driven by reliability/reasoning/tool use (Section 3.3), 5-year extrapolation to 1-month tasks (Section 5). Abstract appropriately hedges with 'If these results generalize.'" 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": false, 117 "justification": "Section 3.3 claims improvements are 'primarily driven by greater reliability, ability to adapt to mistakes, logical reasoning, and capacity for tool use' but this is based on qualitative analysis of 63 transcripts, not controlled experiments isolating these factors. The causal attribution is not rigorously supported." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper extensively bounds generalizations: Section 4 on external validity, Section B.2 lists 5 specific limitations of the task suite, Section 5 explicitly states 'If these results generalize to real-world software tasks', and Section E.3 discusses interpreting time horizon caveats (context effects, task distribution effects)." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": true, 127 "justification": "Multiple alternative explanations discussed: SWE-bench doubling time differences may be due to annotator time bias (Section 4.1), messiness factors could explain benchmark-vs-real-world gaps (Section F.2), baseline selection effects (Section C.1.4), elicitation quality differences (Section E.4), compute scaling vs algorithmic improvement (Section E.2)." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper is explicit that time horizon on benchmark tasks is a proxy for real-world capability. Section 4 is dedicated to external validity. Section B.2 lists systematic differences between tasks and real work. Section E.3 notes 'time horizon is always measured relative to a domain, task distribution, and baseliners' level of skill and context.'" 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Specific model versions provided in Table 6 and Appendix C.3: 'GPT-4 0314', 'GPT-4 1106', 'GPT-4 Turbo', 'Claude 3.5 Sonnet (Old)', 'Claude 3.5 Sonnet (New)', 'davinci-002', 'gpt-3.5-turbo-instruct', 'o1', 'o1-preview'. Footnote 6 explains release date mapping for API model changes." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "No actual prompt text is provided. The paper describes scaffolding approaches (Appendix C.3.1) and mentions 'a simple prompting scaffold' for SWAA tasks but does not include the actual prompts used." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "No temperature, top-p, or sampling parameters are reported for any model. Token limits are mentioned indirectly ('the system reaches its predefined usage threshold') but no specific values given." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "Scaffolding described in Appendix C.3.1: modular-public scaffold provides Python/Bash commands with context management; triframe scaffold generates plans and multiple candidate actions with scoring. Both use ReAct-style reasoning. Table 6 maps models to scaffolds." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Data processing documented: task scoring thresholds explained (Section 3.1), baseline filtering for successful runs (Section C.1.4), geometric mean of baseline times (Section C.1.4), task family weighting by 1/√n (Section C.4), binarization of continuous scores described." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 6 'Discussion' includes a 'Limitations and future work' subsection. Section E.4 provides detailed limitations. Section B.2 'Limitations of the task suite' is a dedicated subsection." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Highly specific threats discussed: task suite lacks automatic scoring realism, no multi-agent interaction, lax resource constraints, static environments (Section B.2). Baseliner skill effects quantified (Section C.2 shows 5-18x speed difference). Elicitation effort varies by model (Section E.4)." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": true, 176 "justification": "Clear scope boundaries: 'we evaluate models only on software and research tasks' (Section 5 footnote 8). Section E.3 states time horizon is relative to domain/distribution/baseliner skill. Section B.2 lists 5 specific ways tasks differ from real work." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "Raw data not available. Appendix I states code and data 'will be provided' in supplementary material (future tense). 'The full codebase is infeasible to publicize and anonymize.'" 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Data collection thoroughly described: HCAST tasks from Rein et al. [8], RE-Bench from Wijk et al. [2], SWAA created by the team with blinding to model performance (Appendix B.1.3). Human baseline collection procedure detailed in Appendix C.1. Agent runs described in Appendix C.3." 189 }, 190 "recruitment_methods_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "Baseliner recruitment described: 'skilled professionals in software engineering, ML, and cybersecurity, with the majority having attended world top-100 universities' with 'an average of about 5 years of relevant experience' (Section 2.2). Internal employees used for SWAA (Section C.1.3). Payment incentives described (Section C.1.4)." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "Pipeline documented: task creation → human baselining → agent evaluation → score binarization → logistic regression → time horizon computation (Figure 2). Baseline filtering criteria stated (Section C.1.4). Task family weighting explained (Section C.4)." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding source is disclosed. The Acknowledgments section thanks reviewers but does not mention any grants or funding agencies. METR is the author organization but its funding is not disclosed." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "All authors listed as affiliated with METR (Model Evaluation & Threat Research). Daniel Ziegler noted as Anthropic (work done at METR). Luke Harold Miles noted as Ohm Chip (work done at METR)." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "METR's funding sources are not disclosed, so independence cannot be assessed. METR is an AI safety evaluation organization; its institutional interest in demonstrating rapid AI capability growth could create a non-independent dynamic, though this is not disclosed or discussed." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement is present in the paper. No declaration of financial interests, patents, or equity." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "Training cutoff dates are not stated for any of the 12+ models evaluated. The paper discusses contamination conceptually (footnotes about release dates vs training dates) but does not state specific training data cutoffs." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": true, 232 "justification": "Partially addressed: SWAA tasks were 'written before seeing AI attempts' (Appendix B.1.3). HCAST tasks are not publicly shared to 'reduce the likelihood of AI systems accidentally or intentionally being trained on them' (footnote 12). Internal PR tasks described as 'uncontaminated' (Section C.2)." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": true, 237 "justification": "Contamination addressed through design: tasks are kept private (HCAST footnote 12), SWAA developed blind to model performance, internal PRs are uncontaminated by design. For SWE-bench Verified, the paper notes potential contamination but uses it only for external validity, not as the main benchmark." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": true, 243 "answer": false, 244 "justification": "No pre-registration mentioned. The human baselining study involves human participants performing tasks but no pre-registration link (OSF, etc.) is provided." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": true, 248 "answer": false, 249 "justification": "No IRB or ethics board approval is mentioned despite the study involving human participants (baseliners completing tasks with screen and audio recording)." 250 }, 251 "demographics_reported": { 252 "applies": true, 253 "answer": true, 254 "justification": "Demographics partially reported: 'skilled professionals in software engineering, ML, and cybersecurity, with the majority having attended world top-100 universities' and 'an average of about 5 years of relevant experience' (Section 2.2)." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": true, 258 "answer": false, 259 "justification": "No explicit inclusion/exclusion criteria for baseliner selection. The paper states they are 'skilled professionals' but does not specify screening criteria, minimum qualifications, or exclusion rules." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "This is not an experimental study comparing treatment vs control conditions for human participants. Baseliners are performing tasks to establish timing, not randomized into conditions." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "Blinding is not applicable — baseliners are performing tasks to establish timing benchmarks, not participating in a blinded experimental comparison." 270 }, 271 "attrition_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Attrition partially reported: 558 baselines with 286 successful from HCAST/RE-Bench (Section C.1), 249 with 236 successful from SWAA. Section C.1.4 discusses why failed baselines were filtered and the bias this introduces." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Figure 14 shows cost of a successful run as a fraction of human expert salary ($143.61/hour). Section E.4 states 'more than 80% of successful runs cost less than 10% of what it would cost for a human.' Cost information not included for o3 and o4-mini (Figure 18 caption)." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Section C.3.2: 'about 2,000 H100-hours for RE-Bench environments and 50,000 CPU hours for other environments... plus roughly 50,000 H100-hour equivalents of compute used internally or from API providers for inference.'" 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Multiple runs per task: 'We perform 8 runs per agent/task pair' (Section 2.3). Variance across runs is captured in the hierarchical bootstrap. Run-to-run variance is one component of the sensitivity analysis (Figure 6)." 294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Section 2.3: 'We perform 8 runs per agent/task pair' with a footnote that 'This number is approximate, because a small number of runs failed due to internal infrastructure issues.'" 299 }, 300 "hyperparameter_search_budget": { 301 "applies": true, 302 "answer": false, 303 "justification": "No hyperparameter search budget is reported for the scaffolding or elicitation. Section E.4 notes varying elicitation effort ('around 2-3 engineer weeks of iterative development' for o1 and Claude 3.5 Sonnet) but no systematic search budget." 304 }, 305 "best_config_selection_justified": { 306 "applies": true, 307 "answer": true, 308 "justification": "Scaffold selection is documented: Table 6 maps each model to its scaffold. Section E.4 acknowledges differential elicitation effort. The sensitivity analysis (Appendix H) tests robustness across multiple configurations." 309 }, 310 "multiple_comparison_correction": { 311 "applies": false, 312 "answer": false, 313 "justification": "Only one formal statistical test is reported (p = 0.006 for o3). No multiple comparison issue arises." 314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": true, 318 "justification": "Section E.4 explicitly acknowledges: 'We have put a limited amount of effort into eliciting models to get good performance on our tasks, so while our results are a reasonable lower bound, some models may have somewhat greater capabilities than we demonstrate.' This addresses the bias of authors controlling evaluation quality." 319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": true, 323 "justification": "Figure 18 plots success rate vs inference cost, showing models plateau below their token limits. Section E.4 notes 'more than 80% of successful runs cost less than 10% of human equivalent' and discusses room for more inference compute." 324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": true, 328 "justification": "Extensive construct validity discussion: Section B.2 lists 5 systematic differences between tasks and real work. Section F.2 analyzes 16 'messiness factors' to assess how representative the benchmark is. Section 4 dedicates multiple experiments to external validity." 329 }, 330 "scaffold_confound_addressed": { 331 "applies": true, 332 "answer": true, 333 "justification": "Table 6 documents which scaffold was used for each model. Appendix C.3.1 describes the two scaffold types (modular-public and triframe/duet). The paper acknowledges scaffold differences could affect results, and o1/o1-preview use different scaffolds due to their tool-use limitations." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "Tasks designed to minimize temporal leakage: HCAST tasks kept private (footnote 12), SWAA developed blind to model performance (Appendix B.1.3), internal PRs described as 'uncontaminated' (Section C.2). SWE-bench contamination acknowledged (Section 4.1)." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "No explicit discussion of whether evaluation setup provides hints not available in real usage. The paper does not address whether task instructions or environment features leak information about expected solutions." 346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": true, 350 "justification": "Task families used to address non-independence: 'we weight each task by the inverse square root of the number of tasks in the family it belongs to' (Section 2.3). Correlation between task families is reported (Figure 22, average correlation ~0.73)." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": true, 355 "justification": "Prevention methods used: tasks kept private and not published (footnote 12), SWAA tasks blinded during development, internal PRs confirmed uncontaminated. While no post-hoc detection method (canary strings, membership inference) is used, proactive prevention is implemented." 356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "The 50% task-completion time horizon for frontier AI models has been doubling approximately every 7 months (207 days) since 2019.", 362 "evidence": "Logistic regression on 12 frontier models from 2019-2025, with 95% bootstrapped CI of 166-240 days doubling time (Section 3.2, Figure 1).", 363 "supported": "strong" 364 }, 365 { 366 "claim": "Current frontier models (o3) have a 50% time horizon of approximately 110 minutes.", 367 "evidence": "Logistic regression fit on 170 tasks with human baselines (Section 3.2, Figure 4).", 368 "supported": "strong" 369 }, 370 { 371 "claim": "The 80% time horizon follows a similar doubling rate (204 days) but is 4-6x shorter than the 50% horizon.", 372 "evidence": "Figure 17 shows 80% horizon trend with similar slope (Section 3.2.1).", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "AI improvement is primarily driven by greater reliability, tool use, logical reasoning, and adaptation to mistakes.", 377 "evidence": "Qualitative analysis of 63 failed agent transcripts (31 GPT-4, 32 o1) categorized into failure types (Section 3.3, Table 3.3).", 378 "supported": "weak" 379 }, 380 { 381 "claim": "If extrapolated, AI will reach a 1-month (167 hours) time horizon between mid-2028 and mid-2031.", 382 "evidence": "Sensitivity analysis in Figure 6 with 80% CI spanning roughly 2 years. Central estimate mid-2029 (Section 5).", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Models perform worse on 'messier' tasks, but the rate of improvement over time is similar for high and low messiness subsets.", 387 "evidence": "16 messiness factors rated on HCAST/RE-Bench tasks. Success rate decreases ~8.1% per messiness point. Figure 12 shows similar improvement slopes (Section F.2).", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Contract baseliners take 5-18x longer than repository maintainers to complete internal PR tasks.", 392 "evidence": "Table 5 comparing maintainer vs baseliner times on 5 internal PRs (Section C.2).", 393 "supported": "moderate" 394 } 395 ], 396 "methodology_tags": ["benchmark-eval", "qualitative"], 397 "key_findings": "The 50% task-completion time horizon for frontier AI models has been doubling approximately every 7 months from 2019-2025, with current models (o3) reaching ~110 minutes. The trend appears consistent across 50% and 80% success rates and multiple robustness checks, though external validity remains uncertain. Models still struggle significantly on 'messier' tasks lacking clear feedback loops, though improvement rates appear similar across messiness levels. Naive extrapolation predicts AI capable of month-long software tasks by mid-2028 to mid-2031.", 398 "red_flags": [ 399 { 400 "flag": "Causal claims from qualitative analysis", 401 "detail": "The claim that improvements are 'primarily driven by' specific factors (Section 3.3) is based on qualitative analysis of 63 transcripts, not controlled experiments. This causal attribution is not well-supported by the study design." 402 }, 403 { 404 "flag": "Differential elicitation effort", 405 "detail": "Section E.4 acknowledges '2-3 engineer weeks of iterative development' for o1 and Claude 3.5 Sonnet while 'All other models use the same scaffolding with at most minor changes.' This differential effort could inflate apparent improvement rates for better-elicited models." 406 }, 407 { 408 "flag": "Baseline selection bias", 409 "detail": "Section C.1.4 acknowledges that filtering for successful baselines biases toward shorter task length ratings, potentially underestimating model performance. The payment scheme incentivized contractors to give up early on difficult tasks." 410 }, 411 { 412 "flag": "No competing interests disclosure", 413 "detail": "METR as an AI safety evaluation organization has institutional interest in demonstrating rapid AI capability growth. No funding sources or competing interests are disclosed." 414 }, 415 { 416 "flag": "Extrapolation beyond data range", 417 "detail": "The extrapolation to 1-month horizon (Section 5) extends well beyond the observed data range (2 seconds to 110 minutes). While the paper acknowledges this limitation, the forecasts receive prominent treatment in the abstract and conclusions." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "RE-Bench: Evaluating frontier AI R&D capabilities of language model agents against human experts", 423 "authors": ["Hjalmar Wijk", "Tao Lin", "Joel Becker"], 424 "year": 2024, 425 "arxiv_id": "2411.15114", 426 "relevance": "Core benchmark used in this study for evaluating AI agent capabilities on ML research engineering tasks." 427 }, 428 { 429 "title": "HCAST: Human-Calibrated Autonomy Software Tasks", 430 "authors": ["David Rein", "Joel Becker", "Amy Deng"], 431 "year": 2025, 432 "relevance": "Core benchmark providing 97 diverse software tasks with human baselines used in this study." 433 }, 434 { 435 "title": "Evaluating frontier models for dangerous capabilities", 436 "authors": ["Mary Phuong", "Matthew Aitchison"], 437 "year": 2024, 438 "arxiv_id": "2403.13793", 439 "relevance": "Related work on contextualizing AI benchmark performance for dangerous capability assessment." 440 }, 441 { 442 "title": "AgentBench: Evaluating LLMs as agents", 443 "authors": ["Xiao Liu", "Hao Yu"], 444 "year": 2023, 445 "arxiv_id": "2308.03688", 446 "relevance": "Agentic AI capability benchmark evaluating agents across diverse environments." 447 }, 448 { 449 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 450 "authors": ["Carlos E. Jimenez", "John Yang"], 451 "year": 2024, 452 "relevance": "Industry-standard software engineering benchmark used for external validity analysis in this paper." 453 }, 454 { 455 "title": "Evaluating large language models trained on code", 456 "authors": ["Mark Chen", "Jerry Tworek"], 457 "year": 2021, 458 "arxiv_id": "2107.03374", 459 "relevance": "HumanEval benchmark for code generation, foundational benchmark in AI coding evaluation." 460 }, 461 { 462 "title": "SWE-Lancer: Can frontier LLMs earn $1 million from real-world freelance software engineering?", 463 "authors": ["Samuel Miserendino", "Michele Wang"], 464 "year": 2025, 465 "arxiv_id": "2502.12115", 466 "relevance": "Concurrent work on evaluating AI agents on real-world freelance software engineering tasks." 467 }, 468 { 469 "title": "TheAgentCompany: benchmarking LLM agents on consequential real world tasks", 470 "authors": ["Frank F. Xu", "Yufan Song"], 471 "year": 2024, 472 "arxiv_id": "2412.14161", 473 "relevance": "Benchmark requiring multi-agent interaction in realistic settings, relevant to agentic AI evaluation." 474 }, 475 { 476 "title": "MLE-bench: Evaluating machine learning agents on machine learning engineering", 477 "authors": ["Jun Shern Chan", "Neil Chowdhury"], 478 "year": 2024, 479 "arxiv_id": "2410.07095", 480 "relevance": "ML engineering benchmark for evaluating AI agent capabilities." 481 }, 482 { 483 "title": "Forecasting frontier language model agent capabilities", 484 "authors": ["Govind Pimpale", "Axel Højmark"], 485 "year": 2025, 486 "arxiv_id": "2502.15850", 487 "relevance": "Related work on forecasting AI agent capabilities using compute and other metrics." 488 }, 489 { 490 "title": "Algorithmic progress in language models", 491 "authors": ["Anson Ho", "Tamay Besiroglu"], 492 "year": 2024, 493 "arxiv_id": "2403.05812", 494 "relevance": "Documents algorithmic improvements reducing compute requirements, relevant to AI capability trends." 495 }, 496 { 497 "title": "AIDE: AI-driven exploration in the space of code", 498 "authors": ["Zhengyao Jiang", "Dominik Schmidt"], 499 "year": 2025, 500 "arxiv_id": "2502.13138", 501 "relevance": "Shows that proper elicitation can dramatically improve AI agent performance on software tasks." 502 } 503 ] 504 }