scan.json (29084B)
1 { 2 "paper": { 3 "title": "Remote Labor Index: Measuring AI Automation of Remote Work", 4 "authors": [ 5 "Mantas Mazeika", 6 "Alice Gatti", 7 "Cristina Menghini", 8 "Udari Madhushani Sehwag", 9 "Shivam Singhal", 10 "Yury Orlovskiy", 11 "Steven Basart", 12 "Manasi Sharma", 13 "Denis Peskoff", 14 "Elaine Lau", 15 "Jaehyuk Lim", 16 "Lachlan Carroll", 17 "Alice Blair", 18 "Vinaya Sivakumar", 19 "Sumana Basu", 20 "Brad Kenstler", 21 "Yuntao Ma", 22 "Julian Michael", 23 "Xiaoke Li", 24 "Oliver Ingebretsen", 25 "Aditya Mehta", 26 "Jean Mottola", 27 "John Teichmann", 28 "Kevin Yu", 29 "Zaina Shaik", 30 "Adam Khoja", 31 "Richard Ren", 32 "Jason Hausenloy", 33 "Long Phan", 34 "Ye Htet", 35 "Ankit Aich", 36 "Tahseen Rabbani", 37 "Vivswan Shah", 38 "Andriy Novykov", 39 "Felix Binder", 40 "Kirill Chugunov", 41 "Luis Ramirez", 42 "Matias Geralnik", 43 "Hernán Mesura", 44 "Dean Lee", 45 "Ed-Yeremai Hernandez Cardona", 46 "Annette Diamond", 47 "Summer Yue", 48 "Alexandr Wang", 49 "Bing Liu", 50 "Ernesto Hernandez", 51 "Dan Hendrycks" 52 ], 53 "year": 2025, 54 "venue": "arXiv", 55 "arxiv_id": "2510.26787" 56 }, 57 "scan_version": 2, 58 "active_modules": ["experimental_rigor", "data_leakage"], 59 "checklist": { 60 "artifacts": { 61 "code_released": { 62 "applies": true, 63 "answer": true, 64 "justification": "The evaluation platform code is open-sourced (Section 3.4: 'The code for the evaluation platform is open-sourced'). However, the benchmark itself is largely private (230/240 projects in private test set)." 65 }, 66 "data_released": { 67 "applies": true, 68 "answer": false, 69 "justification": "Only 10 of 240 projects are publicly released. The private test set of 230 projects is withheld to prevent contamination (Section 3.2). The benchmark data is largely not available for independent verification." 
70 }, 71 "environment_specified": { 72 "applies": true, 73 "answer": false, 74 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper describes agent scaffolds at a high level but does not specify reproducible environment configurations." 75 }, 76 "reproduction_instructions": { 77 "applies": true, 78 "answer": false, 79 "justification": "No step-by-step reproduction instructions are provided. The private test set prevents full reproduction, and the paper does not provide scripts or commands to replicate the evaluation pipeline." 80 } 81 }, 82 "statistical_methodology": { 83 "confidence_intervals_or_error_bars": { 84 "applies": true, 85 "answer": true, 86 "justification": "Figure 8 shows 95% confidence intervals for Elo scores computed via 100 bootstrap samples (Section B.2: 'We use 100 bootstrap samples to compute 95% confidence intervals')." 87 }, 88 "significance_tests": { 89 "applies": true, 90 "answer": false, 91 "justification": "No statistical significance tests are used to compare model automation rates. Differences between models (e.g., 2.5% vs 2.1%) are presented without any test of whether they are statistically distinguishable." 92 }, 93 "effect_sizes_reported": { 94 "applies": true, 95 "answer": true, 96 "justification": "Effect sizes are reported in context: automation rates as percentages, dollars earned, and Elo differences with the interpretation that '400 corresponds to 10:1 odds of winning' (Section 3.3)." 97 }, 98 "sample_size_justified": { 99 "applies": true, 100 "answer": false, 101 "justification": "The benchmark size of 240 projects is not justified via power analysis or other formal reasoning. No discussion of whether 240 projects is sufficient to distinguish models at the observed ~1-2.5% automation rates." 102 }, 103 "variance_reported": { 104 "applies": true, 105 "answer": false, 106 "justification": "No variance or standard deviation is reported for automation rates. 
Elo scores have bootstrap CIs, but automation rates are reported as single point estimates without any spread measure." 107 } 108 }, 109 "evaluation_design": { 110 "baselines_included": { 111 "applies": true, 112 "answer": true, 113 "justification": "The human gold-standard deliverable serves as the baseline. Multiple AI agents are compared against each other and against this human baseline (Section 3.3, Table 1)." 114 }, 115 "baselines_contemporary": { 116 "applies": true, 117 "answer": true, 118 "justification": "The evaluated models include frontier agents as of 2025: GPT-5, Claude Sonnet 4.5, Grok 4, Gemini 2.5 Pro, Manus, and ChatGPT agent. These are current state-of-the-art systems." 119 }, 120 "ablation_study": { 121 "applies": true, 122 "answer": false, 123 "justification": "No ablation study is performed. The paper does not systematically vary benchmark components (e.g., task complexity, category) to understand which factors drive the low automation rates." 124 }, 125 "multiple_metrics": { 126 "applies": true, 127 "answer": true, 128 "justification": "Four metrics are used: automation rate, Elo score, dollars earned, and autoflation (Section 3.3)." 129 }, 130 "human_evaluation": { 131 "applies": true, 132 "answer": true, 133 "justification": "All evaluation is manual. Trained human evaluators assess AI deliverables against human gold standards using a 3-point scale, with majority voting across 3 independent evaluations (Section 3.4)." 134 }, 135 "held_out_test_set": { 136 "applies": true, 137 "answer": true, 138 "justification": "230 of 240 projects are maintained as a private test set, with only 10 released publicly (Section 3.2)." 139 }, 140 "per_category_breakdown": { 141 "applies": true, 142 "answer": false, 143 "justification": "Results are reported only as aggregate automation rates and Elo scores. No per-category breakdown (e.g., by Upwork category) is provided in the main tables or appendix, despite 23 categories being represented." 
144 }, 145 "failure_cases_discussed": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section 4.3 provides qualitative analysis of failure modes with specific examples: truncated videos, child-like drawings, inconsistent 3D renderings, robotic voiceovers. Table 2 gives failure category frequencies." 149 }, 150 "negative_results_reported": { 151 "applies": true, 152 "answer": true, 153 "justification": "The entire paper is essentially reporting negative results — all agents perform near the floor (max 2.5% automation rate). GPT-5 CUA underperformed CLI (Appendix A.3)." 154 } 155 }, 156 "claims_and_evidence": { 157 "abstract_claims_supported": { 158 "applies": true, 159 "answer": true, 160 "justification": "The abstract claims 'highest-performing agent achieving an automation rate of 2.5%' which matches Table 1 (Manus at 2.5%). The claim of 'near the floor' is supported by all models being below 3%." 161 }, 162 "causal_claims_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "The paper makes implicit causal claims about AI capability limitations ('current systems remain far from capable'), but the low scores could be partly due to scaffold/tooling limitations, evaluation criteria strictness, or format compatibility issues rather than fundamental capability gaps. This is not adequately addressed." 166 }, 167 "generalization_bounded": { 168 "applies": true, 169 "answer": true, 170 "justification": "The Limitations section (Section 5) explicitly states RLI 'does not represent several types of remote work' and lists excluded categories (tutoring, team projects, client interaction). The title specifies 'Remote Work' rather than all work." 171 }, 172 "alternative_explanations_discussed": { 173 "applies": true, 174 "answer": false, 175 "justification": "The paper does not substantively discuss alternative explanations for low automation rates. 
Scaffold limitations are briefly mentioned (Appendix A.3) but not explored as a systematic confound. Format compatibility, evaluation strictness, and prompt engineering quality are not discussed as factors." 176 }, 177 "proxy_outcome_distinction": { 178 "applies": true, 179 "answer": true, 180 "justification": "The paper is explicit that automation rate measures whether deliverables would be 'accepted by a reasonable client as the commissioned work' (Section 3.3) — a specific proxy for economic automation. Section 5 discusses the gap between task-specific automation and general labor automation." 181 } 182 }, 183 "setup_transparency": { 184 "model_versions_specified": { 185 "applies": true, 186 "answer": false, 187 "justification": "Models are identified by marketing names only: 'Claude Sonnet 4.5', 'GPT-5', 'Gemini 2.5 Pro', 'Grok 4'. No API versions, snapshot dates, or exact model identifiers are provided." 188 }, 189 "prompts_provided": { 190 "applies": true, 191 "answer": true, 192 "justification": "Full prompts are provided in Appendix B.6 for the base prompt, OpenHands extensions, computer-use agent prompt, and Claude-specific extensions." 193 }, 194 "hyperparameters_reported": { 195 "applies": true, 196 "answer": false, 197 "justification": "No hyperparameters (temperature, top-p, max tokens, session timeout details beyond 'default: 1 hour') are reported for the model API calls." 198 }, 199 "scaffolding_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Agent scaffolds are described: OpenHands CLI environment, Scale AI computer-use scaffold with MCP tools (mouse, keyboard, bash, file editor), and integrated agents (Manus, ChatGPT agent). Available tools including gpt-image-1, tts-1, and veo-3.0 are listed (Appendix B.6)." 
203 }, 204 "data_preprocessing_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "Section 3.2 and Appendix C.2 document the extensive filtering pipeline: 550 initial projects → multiple rounds of review, cleaning, and standardization → 240 final projects. Filtering criteria are stated in detail." 208 } 209 }, 210 "limitations_and_scope": { 211 "limitations_section_present": { 212 "applies": true, 213 "answer": true, 214 "justification": "Section 5 'Discussion' contains a 'Limitations' subsection discussing excluded work types and cost reporting limitations." 215 }, 216 "threats_to_validity_specific": { 217 "applies": true, 218 "answer": true, 219 "justification": "Specific threats are discussed: exclusion of interactive work (tutoring), team work (project management), work requiring time to evaluate (SEO), and the fact that costs may be underestimated due to inflation (Section 5)." 220 }, 221 "scope_boundaries_stated": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 5 explicitly states: 'an AI obtaining 100% automation rate on RLI may still underperform humans on types of work that we do not evaluate.' The excluded categories are enumerated in Appendix C.2." 225 } 226 }, 227 "data_integrity": { 228 "raw_data_available": { 229 "applies": true, 230 "answer": false, 231 "justification": "The private test set of 230 projects is not available. Only 10 projects are released publicly. Individual evaluator annotations are not released. Independent verification of the results is not possible." 232 }, 233 "data_collection_described": { 234 "applies": true, 235 "answer": true, 236 "justification": "Section 3.2 describes the two-stage collection process: freelance platform sourcing (207 projects from Upwork) and long-tail sourcing (40 additional projects). The pipeline is visualized in Figure 5." 
237 }, 238 "recruitment_methods_described": { 239 "applies": true, 240 "answer": true, 241 "justification": "Section 3.2 describes recruiting 358 freelancers with verified Upwork accounts: 'on average, they had 2,341 hours worked, 89 prior jobs, and $23,364 in total earnings.' Freelancers were paid $15-$200 per project (average $41)." 242 }, 243 "data_pipeline_documented": { 244 "applies": true, 245 "answer": true, 246 "justification": "Figure 5 and Section 3.2 document the pipeline: 550 collected → cleaning → spot check → final filter → 240 final tasks. Appendix C.2 lists filtering criteria and specific stages." 247 } 248 }, 249 "conflicts_of_interest": { 250 "funding_disclosed": { 251 "applies": true, 252 "answer": false, 253 "justification": "No funding disclosure or acknowledgments section listing grants or sponsors. The work is affiliated with Scale AI and CAIS but no explicit funding statement is provided." 254 }, 255 "affiliations_disclosed": { 256 "applies": true, 257 "answer": true, 258 "justification": "Author affiliations with Scale AI and Center for AI Safety are clearly listed. Some authors are noted as having done work while at Scale AI or CAIS." 259 }, 260 "funder_independent_of_outcome": { 261 "applies": true, 262 "answer": false, 263 "justification": "Scale AI is a data labeling and AI evaluation company with a direct commercial interest in AI benchmark results. Multiple authors are Scale AI employees. The funder has a stake in demonstrating the value of AI evaluation services and human labor for AI tasks." 264 }, 265 "financial_interests_declared": { 266 "applies": true, 267 "answer": false, 268 "justification": "No competing interests or financial disclosure statement is provided. Scale AI co-founder Alexandr Wang is a senior author, and Scale AI has obvious financial interests in the AI evaluation space." 
269 } 270 }, 271 "contamination": { 272 "training_cutoff_stated": { 273 "applies": true, 274 "answer": false, 275 "justification": "No training data cutoff dates are stated for any of the evaluated models." 276 }, 277 "train_test_overlap_discussed": { 278 "applies": true, 279 "answer": true, 280 "justification": "Section 3.2 states: 'None of the project descriptions in RLI are searchable' and describes contamination prevention measures including a blocklist of domains for long-tail data." 281 }, 282 "benchmark_contamination_addressed": { 283 "applies": true, 284 "answer": true, 285 "justification": "Section 3.2 describes multiple contamination prevention strategies: private test set of 230 projects, non-searchable project descriptions, domain blocklist for online deliverables. The benchmark is designed to be contamination-resistant." 286 } 287 }, 288 "human_studies": { 289 "pre_registered": { 290 "applies": false, 291 "answer": false, 292 "justification": "This is a benchmark evaluation paper. While freelancers provided work samples and evaluators annotated deliverables, neither group is studied as human subjects — they are data providers and annotators respectively." 293 }, 294 "irb_or_ethics_approval": { 295 "applies": false, 296 "answer": false, 297 "justification": "Not a human subjects study. Freelancers sold work samples; evaluators performed annotation work." 298 }, 299 "demographics_reported": { 300 "applies": false, 301 "answer": false, 302 "justification": "Not a human subjects study. Freelancer statistics (hours worked, prior jobs, earnings) are reported as data source quality indicators, not participant demographics." 303 }, 304 "inclusion_exclusion_criteria": { 305 "applies": false, 306 "answer": false, 307 "justification": "Not a human subjects study." 308 }, 309 "randomization_described": { 310 "applies": false, 311 "answer": false, 312 "justification": "Not a human subjects study." 
313 }, 314 "blinding_described": { 315 "applies": false, 316 "answer": false, 317 "justification": "Not a human subjects study." 318 }, 319 "attrition_reported": { 320 "applies": false, 321 "answer": false, 322 "justification": "Not a human subjects study." 323 } 324 }, 325 "cost_and_practicality": { 326 "inference_cost_reported": { 327 "applies": true, 328 "answer": true, 329 "justification": "Figure 12 shows the distribution of model running costs: 'average cost of generating AI deliverables was $2.34' with max $29.51." 330 }, 331 "compute_budget_stated": { 332 "applies": true, 333 "answer": true, 334 "justification": "Figure 12 shows API cost distribution. Session timeout is stated as 1 hour default (Appendix B.6). The total human labor cost of the benchmark is stated: 'over 6,000 hours of real work valued at over $140,000' (Section 3.1)." 335 } 336 }, 337 "experimental_rigor": { 338 "seed_sensitivity_reported": { 339 "applies": true, 340 "answer": false, 341 "justification": "No discussion of random seed sensitivity. Each model appears to have been run once per project with no analysis of result variability across runs." 342 }, 343 "number_of_runs_stated": { 344 "applies": true, 345 "answer": false, 346 "justification": "The number of generation runs per model per project is not explicitly stated. It appears each model generated one deliverable per project, but this is not confirmed." 347 }, 348 "hyperparameter_search_budget": { 349 "applies": true, 350 "answer": false, 351 "justification": "No hyperparameter search budget is reported. The paper mentions tuning prompts ('we tune prompts and provide standardized tooling scaffolds', Section 4.1) but does not report how many prompt variants were tried." 352 }, 353 "best_config_selection_justified": { 354 "applies": true, 355 "answer": false, 356 "justification": "For GPT-5, two scaffolds are compared and the better one (CLI) is reported in main tables. 
However, the selection process for prompt tuning and scaffold configuration is not documented." 357 }, 358 "multiple_comparison_correction": { 359 "applies": false, 360 "answer": false, 361 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 362 }, 363 "self_comparison_bias_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "Scale AI developed the computer-use scaffold and the evaluation platform, and is evaluating models on its own benchmark. No discussion of self-comparison bias or author-evaluation bias." 367 }, 368 "compute_budget_vs_performance": { 369 "applies": true, 370 "answer": false, 371 "justification": "Models are given different scaffolds and tools but performance is not analyzed as a function of compute budget. Some models may have spent more tokens/compute than others, which is not controlled for." 372 }, 373 "benchmark_construct_validity": { 374 "applies": true, 375 "answer": true, 376 "justification": "Section 3.1 and Figure 6 extensively discuss what RLI measures vs. what prior benchmarks measure. The paper compares RLI's task distribution to actual Upwork distribution, arguing it better captures real remote labor than prior benchmarks." 377 }, 378 "scaffold_confound_addressed": { 379 "applies": true, 380 "answer": false, 381 "justification": "Different models use different scaffolds (OpenHands CLI, Scale AI CUA, integrated agents) but this confound is not systematically addressed. Only GPT-5 is tested on two scaffolds. The paper acknowledges 'more vertical integration of model scaffolds will yield stronger performance' (Appendix A.3) but does not control for scaffold differences when comparing models." 382 } 383 }, 384 "data_leakage": { 385 "temporal_leakage_addressed": { 386 "applies": true, 387 "answer": true, 388 "justification": "The benchmark uses private projects sourced from freelancers' past work, with non-searchable descriptions and a domain blocklist. 
This inherently addresses temporal leakage since the projects are not in public training data." 389 }, 390 "feature_leakage_addressed": { 391 "applies": true, 392 "answer": false, 393 "justification": "No discussion of whether the evaluation compatibility prompt or format specifications could leak information about what constitutes a successful deliverable." 394 }, 395 "non_independence_addressed": { 396 "applies": true, 397 "answer": false, 398 "justification": "No discussion of whether projects from the same freelancer or category share structural similarities that could affect results." 399 }, 400 "leakage_detection_method": { 401 "applies": true, 402 "answer": true, 403 "justification": "Concrete prevention methods are used: private test set, non-searchable project descriptions, and domain blocklist for online deliverables (Section 3.2)." 404 } 405 } 406 }, 407 "claims": [ 408 { 409 "claim": "The best-performing AI agent (Manus) achieves an automation rate of only 2.5% on RLI", 410 "evidence": "Table 1 shows Manus at 2.5%, Grok 4 and Sonnet 4.5 at 2.1%, GPT-5 at 1.7%, ChatGPT agent at 1.3%, Gemini 2.5 Pro at 0.8%", 411 "supported": "strong" 412 }, 413 { 414 "claim": "RLI is substantially more complex than prior benchmarks, with median completion time of 11.5 hours exceeding previous benchmarks by more than 2x", 415 "evidence": "Figure 4 shows completion time distribution; Figure 6 compares against HCAST and GDPval, showing RLI matches Upwork distribution", 416 "supported": "strong" 417 }, 418 { 419 "claim": "Inter-annotator agreement is 94.4% for automation rate evaluation", 420 "evidence": "Section 3.4 states this figure. 
Section B.5 reports auditing of false positives and estimation of false negative rate ≤5.8% with 95% confidence", 421 "supported": "strong" 422 }, 423 { 424 "claim": "AI agents are steadily improving as measured by Elo scores, with newer models achieving higher scores", 425 "evidence": "Figure 8 shows Elo progression from Gemini 2.5 Pro (~412) to Manus (~510), with 95% CIs from bootstrap", 426 "supported": "moderate" 427 }, 428 { 429 "claim": "RLI captures the true diversity and complexity of remote labor markets better than prior benchmarks", 430 "evidence": "Figure 6 shows RLI's project type distribution is more diverse than HCAST and GDPval; Figure 14 shows more unique file types; completion time matches Upwork distribution", 431 "supported": "moderate" 432 } 433 ], 434 "methodology_tags": ["benchmark-eval"], 435 "key_findings": "The Remote Labor Index (RLI) benchmark of 240 real-world freelance projects shows that frontier AI agents achieve at most 2.5% automation rate, with the best model (Manus) earning only $1,720 of $143,991 in potential project value. Common failure modes include corrupted files (17.6%), incomplete deliverables (35.7%), poor quality (45.6%), and inconsistencies (14.8%). AI successes cluster in audio editing, image generation, data visualization, and writing — domains where current models already have strong skills — while complex multimodal work like architecture, 3D modeling, and video production remains far beyond current capabilities.", 436 "red_flags": [ 437 { 438 "flag": "Company evaluating benchmark relevance to its own business", 439 "detail": "Scale AI is a data labeling and AI evaluation company. The benchmark's conclusion — that human evaluation is needed because AI cannot automate most work — directly supports Scale AI's business model. Co-founder Alexandr Wang is a senior author. Author affiliations are listed, but no competing-interests statement discloses or discusses this conflict of interest." 
440 }, 441 { 442 "flag": "Scaffold confound across model comparisons", 443 "detail": "Different models use different scaffolds (OpenHands, Scale AI CUA, integrated agents) but are compared as if differences reflect model capability. Only GPT-5 is tested on two scaffolds, showing scaffold choice matters (CLI outperforms CUA). The other models' rankings could partly reflect scaffold quality." 444 }, 445 { 446 "flag": "No per-category breakdown despite 23 categories", 447 "detail": "The paper reports only aggregate automation rates. With 240 projects across 23 categories, per-category results would reveal whether low scores are uniform or driven by specific hard categories. The qualitative analysis hints at category-dependent performance but this is not quantified." 448 }, 449 { 450 "flag": "Single-run results", 451 "detail": "Each model appears to have generated one deliverable per project. LLM outputs are stochastic; a single run could underperform or overperform. No variance analysis is provided for automation rates." 452 }, 453 { 454 "flag": "Private benchmark limits independent verification", 455 "detail": "230 of 240 projects are private. While this protects against contamination, it also means the results cannot be independently verified. The benchmark design, task selection, and evaluation quality must be taken on trust." 
456 } 457 ], 458 "cited_papers": [ 459 { 460 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 461 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"], 462 "year": 2023, 463 "arxiv_id": "2310.06770", 464 "relevance": "Major software engineering benchmark that RLI positions against as more limited in scope" 465 }, 466 { 467 "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?", 468 "authors": ["Samuel Miserendino", "Michele Wang", "Tejal Patwardhan"], 469 "year": 2025, 470 "arxiv_id": "2502.12115", 471 "relevance": "Economically-grounded software engineering benchmark that directly inspired RLI's approach to measuring economic value" 472 }, 473 { 474 "title": "GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks", 475 "authors": ["Tejal Patwardhan", "Rachel Dias"], 476 "year": 2025, 477 "arxiv_id": "2510.04374", 478 "relevance": "Most similar prior work: shows AI near human parity on specific cross-profession tasks but limited to task-level evaluation, not end-to-end projects" 479 }, 480 { 481 "title": "HCAST: Human-Calibrated Autonomy Software Tasks", 482 "authors": ["David Rein", "Joel Becker", "Amy Deng"], 483 "year": 2025, 484 "arxiv_id": "2503.17354", 485 "relevance": "Software autonomy benchmark compared against in terms of completion time and task diversity" 486 }, 487 { 488 "title": "AgentBench: Evaluating LLMs as Agents", 489 "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"], 490 "year": 2023, 491 "arxiv_id": "2308.03688", 492 "relevance": "Multi-environment agent benchmark evaluating autonomous agent capabilities" 493 }, 494 { 495 "title": "MLE-Bench: Evaluating Machine Learning Agents on Machine Learning Engineering", 496 "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"], 497 "year": 2024, 498 "arxiv_id": "2410.07095", 499 "relevance": "Domain-specific ML engineering benchmark for AI agents" 500 }, 501 { 502 "title": 
"OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments", 503 "authors": ["Tianbao Xie", "Danyang Zhang"], 504 "year": 2024, 505 "relevance": "Computer-use benchmark for multimodal agents in realistic environments" 506 }, 507 { 508 "title": "RE-Bench: Evaluating Frontier AI R&D Capabilities of Language Model Agents Against Human Experts", 509 "authors": ["Hjalmar Wijk", "Tao Lin", "Joel Becker"], 510 "year": 2024, 511 "arxiv_id": "2411.15114", 512 "relevance": "AI R&D capability benchmark comparing agent performance to human expert baselines" 513 }, 514 { 515 "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", 516 "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn"], 517 "year": 2025, 518 "arxiv_id": "2504.01848", 519 "relevance": "Measures AI agents' ability to replicate research papers — related to end-to-end task automation" 520 }, 521 { 522 "title": "The AI Productivity Index (APEX)", 523 "authors": ["Bertie Vidgen", "Abby Fennelly"], 524 "year": 2025, 525 "arxiv_id": "2509.25721", 526 "relevance": "AI productivity benchmark measuring economically valuable task completion" 527 }, 528 { 529 "title": "A Definition of AGI", 530 "authors": ["Dan Hendrycks", "Dawn Song", "Christian Szegedy"], 531 "year": 2025, 532 "relevance": "Defines AGI in terms of cognitive skills, directly informs RLI's cognitive skills analysis of failure modes" 533 }, 534 { 535 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 536 "authors": ["Wei-Lin Chiang", "Lianmin Zheng"], 537 "year": 2024, 538 "relevance": "Elo methodology for LLM comparison that RLI adapts for its pairwise evaluation system" 539 } 540 ] 541 }