scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (20983B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks",
      6     "authors": [
      7       "Tejal Patwardhan",
      8       "Rachel Dias",
      9       "Elizabeth Proehl",
     10       "Grace Kim",
     11       "Michele Wang",
     12       "Olivia Watkins",
     13       "Simón Posada Fishman",
     14       "Marwan Aljubeh",
     15       "Phoebe Thacker",
     16       "Laurance Fauconnet",
     17       "Natalie S. Kim",
     18       "Patrick Chao",
     19       "Samuel Miserendino",
     20       "Gildas Chabot",
     21       "David Li",
     22       "Michael Sharman",
     23       "Alexandra Barr",
     24       "Amelia Glaese",
     25       "Jerry Tworek"
     26     ],
     27     "year": 2025,
     28     "venue": "arXiv",
     29     "arxiv_id": "2510.04374",
     30     "doi": "10.48550/arXiv.2510.04374"
     31   },
     32   "checklist": {
     33     "claims_and_evidence": {
     34       "abstract_claims_supported": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Abstract claims about linear improvement (fig. 6), approaching expert parity (fig. 5), speed/cost savings (Table 2, fig. 7), reasoning effort improvements (fig. 9a), and open-sourcing (Section 4) are all supported by data presented in the paper.",
     38         "source": "opus"
     39       },
     40       "causal_claims_justified": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Causal claims about reasoning effort improving performance are supported by controlled single-variable experiments (fig. 9a, varying reasoning level while holding model constant). The prompt-tuning claim is supported by a before/after intervention design (fig. 9b). These are adequate for the causal claims made.",
     44         "source": "opus"
     45       },
     46       "generalization_bounded": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Section 5 (Limitations) explicitly bounds the results: 'only 44 occupations and 30 total tasks per occupation,' 'oriented around knowledge work that can be performed on a computer,' 'Manual labor and physical tasks are not included,' and tasks are 'precisely-specified and one-shot, not interactive.'",
     50         "source": "opus"
     51       },
     52       "alternative_explanations_discussed": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper does not substantively discuss alternative explanations for the core findings. For example, the linear improvement claim (fig. 6) does not consider whether the trend could be driven by improved prompting/tooling rather than raw model capability. The win rate differences across models are not analyzed for confounds beyond stylistic identification.",
     56         "source": "opus"
     57       },
     58       "proxy_outcome_distinction": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The introduction explicitly distinguishes between measuring AI capabilities and predicting economic impact: 'While informative when available, these methods are lagging indicators of AI impacts. We consider an alternate method for understanding the potential economic impacts of AI: directly measuring AI model capabilities.' The limitations section further notes the gap between one-shot task completion and real-world work.",
     62         "source": "opus"
     63       }
     64     },
     65     "limitations_and_scope": {
     66       "limitations_section_present": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 5 is a dedicated 'Limitations' section covering dataset size, focus on knowledge work, task specification, grader performance, and cost.",
     70         "source": "opus"
     71       },
     72       "threats_to_validity_specific": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The limitations are specific to this study: 'only 44 occupations and 30 total tasks per occupation,' 'Manual labor and physical tasks are not included,' 'tasks that involve extensive tacit knowledge, access to personally identifiable information, use of proprietary software tools, or communication between individuals are out of scope.' Expert time self-reporting bias is acknowledged in footnote 6.",
     76         "source": "opus"
     77       },
     78       "scope_boundaries_stated": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Section 5 explicitly states what's excluded: manual labor, physical tasks, tasks requiring tacit knowledge, PII access, proprietary tools, or interpersonal communication. Tasks are described as 'precisely-specified and one-shot, not interactive,' and the benchmark covers only 'a limited, initial cut of knowledge work tasks.'",
     82         "source": "opus"
     83       }
     84     },
     85     "conflicts_of_interest": {
     86       "funding_disclosed": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funding disclosure is present. All authors are affiliated with OpenAI, which implicitly funds the research, but there is no explicit funding statement or acknowledgment of funding sources.",
     90         "source": "opus"
     91       },
     92       "affiliations_disclosed": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "All authors are listed under the OpenAI affiliation prominently at the top of the paper.",
     96         "source": "opus"
     97       },
     98       "funder_independent_of_outcome": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "OpenAI has a direct financial interest in demonstrating that its GPT models perform well on economically valuable tasks. The benchmark was designed and evaluated by OpenAI employees testing OpenAI products (GPT-4o, o4-mini, o3, GPT-5) alongside competitors.",
    102         "source": "opus"
    103       },
    104       "financial_interests_declared": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No competing interests or financial interests statement is provided. All authors are OpenAI employees whose livelihoods depend on the commercial success of the models being evaluated.",
    108         "source": "opus"
    109       }
    110     },
    111     "scope_and_framing": {
    112       "key_terms_defined": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Key terms are operationalized: 'economically valuable tasks' via BLS GDP sectors and wage data, 'digital knowledge work' via O*NET task classification with a 60% threshold, and 'win rate' via an explicit pairwise comparison formula in Section A.6.1.",
    116         "source": "haiku"
    117       },
    118       "intended_contribution_clear": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The conclusion enumerates 5 explicit contributions: dataset, capability benchmarking, experiments, open-sourcing, and automated grader.",
    122         "source": "haiku"
    123       },
    124       "engagement_with_prior_work": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The introduction situates GDPval against existing AI benchmarks (MMLU, GPQA, AgentBench, SWE-Lancer) and labor market literature (Brynjolfsson, Tamkin, Eloundou), explaining what distinguishes GDPval from each approach.",
    128         "source": "haiku"
    129       }
    130     }
    131   },
    132   "type_checklist": {
    133     "benchmark-creation": {
    134       "construct_design": {
    135         "construct_validity_argued": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "The paper argues construct validity via top-down O*NET Work Activities coverage, GDP-weighted sector/occupation selection, and expert-validated task creation, further validated against the Acemoglu & Autor (2011) task-content framework (Section A.7.1).",
    139           "source": "haiku"
    140         },
    141         "difficulty_distribution_characterized": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Tables 3 and 4 report difficulty distributions (expert-rated 1–5, mean ~3.2); Fig 13 shows win rates decline predictably with task duration, demonstrating difficulty discrimination.",
    145           "source": "haiku"
    146         },
    147         "ceiling_floor_effects_checked": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "The pairwise win rate metric is explicitly designed to avoid ceiling effects ('No upper limit' in Section 1); observed win rates range from 12.5% (GPT-4o) to ~48% (Claude Opus 4.1), demonstrating meaningful discrimination across models.",
    151           "source": "haiku"
    152         },
    153         "human_baseline_included": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Human expert deliverables are the literal baseline for all win rate comparisons; human inter-rater agreement (71%) is also reported as a calibration ceiling.",
    157           "source": "haiku"
    158         },
    159         "scoring_rubric_justified": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Pairwise expert comparison is justified because tasks involve subjective quality factors (structure, style, aesthetics) that preclude absolute scoring; formal agreement metrics are provided for the automated grader alternative (66% vs. 71% human-human baseline).",
    163           "source": "haiku"
    164         }
    165       },
    166       "robustness": {
    167         "contamination_resistance_designed": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No contamination resistance measures are described; the paper does not test whether tasks or similar prompts appeared in model training data, and no canary strings, temporal split methodology, or dynamic generation is mentioned.",
    171           "source": "haiku"
    172         },
    173         "temporal_robustness_discussed": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "The 'No upper limit' metric design explicitly addresses temporal robustness by allowing the human baseline to be replaced with stronger models over time; future iterations are planned to expand breadth and realism.",
    177           "source": "haiku"
    178         },
    179         "failure_modes_discussed": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Section 5 and Section A.6.3 document specific benchmark failure modes: grader limitations with internet access, font rendering, speech-to-text, and Python-only execution; meta-failure of automated grader self-preference is acknowledged in Section A.6.2.",
    183           "source": "haiku"
    184         },
    185         "baseline_implementations_provided": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "220 tasks with prompts and reference files are open-sourced; a public automated grading service is provided at evals.openai.com; the full scaffolding prompt used for experiments is published in Section A.3.",
    189           "source": "haiku"
    190         }
    191       },
    192       "documentation": {
    193         "dataset_documentation_complete": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Comprehensive documentation covers task creation methodology (Sections 2.1–2.4), multi-stage quality control pipeline (Section A.5), task statistics tables (Tables 3–7), O*NET coverage analysis (A.4.2), and expert qualification requirements.",
    197           "source": "haiku"
    198         },
    199         "licensing_and_access_clear": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "Tasks are described as 'open-sourced' at evals.openai.com but no specific license terms are stated; 'open source' here appears to mean publicly available without formal licensing, and reuse conditions are undefined.",
    203           "source": "haiku"
    204         },
    205         "intended_use_specified": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "The paper states a general goal ('facilitate future research in understanding real-world model capabilities') but provides no formal specification of what should or should NOT be concluded from benchmark results, nor guidance on inappropriate uses.",
    209           "source": "haiku"
    210         }
    211       }
    212     }
    213   },
    214   "claims": [
    215     {
    216       "claim": "Frontier model performance on GDPval is improving roughly linearly over time.",
    217       "evidence": "Figure 6 shows OpenAI model win rates from GPT-4o (12.5%) through o4-mini, o3, to GPT-5 (39%) with an approximately linear trend across release dates.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Current best frontier models are approaching industry experts in deliverable quality.",
    222       "evidence": "Claude Opus 4.1 achieves 47.6% win-or-tie rate against human experts on the 220-task gold subset (Fig 5); 'approaching' is accurate but models have not yet reached parity.",
    223       "supported": "moderate"
    224     },
    225     {
    226       "claim": "Incorporating frontier AI with human oversight can save time and cost relative to unaided experts.",
    227       "evidence": "Table 2 shows 'try n times' speed improvements of 1.28x–1.39x and cost improvements of 1.47x–1.63x for o3 and GPT-5; GPT-4o at 12.5% win rate actually slows work (0.46x speed ratio).",
    228       "supported": "moderate"
    229     },
    230     {
    231       "claim": "Increased reasoning effort improves model performance on GDPval tasks.",
    232       "evidence": "Figure 9a shows monotonically increasing win rates for both o3 and GPT-5 from low to medium to high reasoning effort settings.",
    233       "supported": "strong"
    234     },
    235     {
    236       "claim": "Prompt tuning and scaffolding improve GPT-5 win rate by 5 percentage points.",
    237       "evidence": "Figure 9b shows before/after prompting experiment; paper states prompting 'improved human preference win rates by 5 percentage points' and eliminates specific failure modes (black-square artifacts, formatting errors).",
    238       "supported": "strong"
    239     },
    240     {
    241       "claim": "GDPval tasks are representative of economically valuable digital knowledge work across U.S. occupations.",
    242       "evidence": "O*NET coverage analysis shows 71.4% of skills and 63.4% of work activities covered in the gold set (Table 6); validated against Acemoglu & Autor (2011) task-content framework in Section A.7.1.",
    243       "supported": "moderate"
    244     }
    245   ],
    246   "methodology_tags": [
    247     "benchmark-eval"
    248   ],
    249   "key_findings": "GDPval introduces a benchmark of 1,320 tasks (220 open-sourced) spanning 44 occupations across the 9 largest U.S. GDP sectors, created by industry professionals averaging 14 years of experience. The best frontier model (Claude Opus 4.1) achieves a 47.6% win-or-tie rate against human expert deliverables on the gold subset, with GPT-5 trailing at roughly 39% and OpenAI models improving roughly linearly over successive releases. Models provide speed and cost savings when integrated into expert workflows only at sufficiently high win rates — GPT-4o at 12.5% win rate slows work, while GPT-5 at 39% yields meaningful savings under a 'try and review' strategy. Reasoning effort, task context, and prompt engineering all predictably improve model performance, with large gains from basic scaffolding suggesting substantial headroom remains.",
    250   "red_flags": [
    251     {
    252       "flag": "OpenAI evaluating own models",
    253       "detail": "All 19 authors are OpenAI employees evaluating OpenAI's own models (GPT-5, o3, o4-mini, GPT-4o) as primary subjects. The automated grader is GPT-5-high and Section A.6.2 explicitly notes it shows lower human-agreement for capable OpenAI models, consistent with self-preference bias."
    254     },
    255     {
    256       "flag": "Model identifiability in human grading",
    257       "detail": "Footnote 2 acknowledges graders could likely identify model outputs via stylistic cues (em dashes for OpenAI models, first-person phrasing for Claude, Grok self-references), introducing potential evaluator bias despite attempted blinding."
    258     },
    259     {
    260       "flag": "Headline results from small gold subset",
    261       "detail": "Primary human evaluation covers only 220 of the 1,320 tasks; no human-graded win rates are reported for the full set, making representativeness of the headline figures uncertain."
    262     },
    263     {
    264       "flag": "No contamination analysis",
    265       "detail": "Tasks were created in 2025 from real expert work, but the paper does not test whether any tasks or similar prompts appeared in model training data, particularly relevant for GPT-5 which was trained closer to the task creation date."
    266     },
    267     {
    268       "flag": "Speed/cost analysis uses modeled projections",
    269       "detail": "Time and cost savings in Table 2 are computed from expected-value formulas using empirical win rates and average times, not from direct measurement of actual human-AI workflows; the model overpenalizes by assuming constant win rate across resamples."
    270     }
    271   ],
    272   "cited_papers": [
    273     {
    274       "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models",
    275       "relevance": "Foundational paper classifying occupational tasks as automatable via O*NET; GDPval's occupation selection methodology directly builds on Eloundou et al.'s digital/non-digital task classification approach"
    276     },
    277     {
    278       "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?",
    279       "relevance": "Closely related economically-grounded AI capability benchmark; GDPval explicitly positions itself as broader in occupational coverage than SWE-Lancer's software engineering focus"
    280     },
    281     {
    282       "title": "AgentBench: Evaluating LLMs as Agents",
    283       "relevance": "Related agent evaluation benchmark contrasted with GDPval's economically grounded real-world task framing"
    284     },
    285     {
    286       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    287       "relevance": "Prominent academic-style benchmark that GDPval contrasts with in motivating the need for real-world economic evaluation over reasoning-difficulty tests"
    288     },
    289     {
    290       "title": "CLIO: Privacy-Preserving Insights into Real-World AI Use",
    291       "relevance": "Production AI usage analysis used to identify sectors where model adoption is still emerging, informing GDPval's coverage priorities beyond already-saturated use cases"
    292     },
    293     {
    294       "title": "LLM Evaluators Recognize and Favor Their Own Generations",
    295       "relevance": "Cited to motivate concern about automated grader self-preference bias — a critical validity threat for a benchmark where the creator's own model (GPT-5-high) serves as the automated grader"
    296     },
    297     {
    298       "title": "Generative AI at Work",
    299       "relevance": "RCT measuring AI productivity effects in workplace settings; key prior work in the AI/labor economics literature that GDPval aims to complement with direct capability measurement"
    300     },
    301     {
    302       "title": "Skills, Tasks and Technologies: Implications for Employment and Earnings",
    303       "relevance": "Acemoglu & Autor (2011) task-content framework used to validate GDPval's digital-task classification methodology in Section A.7.1"
    304     }
    305   ],
    306   "engagement_factors": {
    307     "practical_relevance": {
    308       "score": 3,
    309       "justification": "Directly measures AI capability on real professional deliverables across 9 GDP sectors; practitioners can assess AI automation potential in specific occupations and model cost/time savings under concrete scenarios."
    310     },
    311     "surprise_contrarian": {
    312       "score": 2,
    313       "justification": "Finding that frontier models approach but don't exceed human expert quality on long-horizon professional tasks is moderately surprising; speed/cost benefit only emerging at higher win rates challenges naive automation optimism."
    314     },
    315     "fear_safety": {
    316       "score": 2,
    317       "justification": "Benchmark quantifies AI's potential to match professionals on $3T in annual wages; sector- and occupation-level win rate breakdowns make displacement risk concrete and specific."
    318     },
    319     "drama_conflict": {
    320       "score": 2,
    321       "justification": "OpenAI evaluating its own models alongside competitors; Claude Opus 4.1 outperforms GPT-5 on aesthetics while GPT-5 leads on instruction-following — competitive cross-lab framing with named model rankings."
    322     },
    323     "demo_ability": {
    324       "score": 3,
    325       "justification": "Public automated grader at evals.openai.com; 220 tasks with prompts and reference files open-sourced; researchers can immediately run their own models against the benchmark."
    326     },
    327     "brand_recognition": {
    328       "score": 3,
    329       "justification": "Published by OpenAI with acknowledgment from Sam Altman; evaluates GPT-5, Claude Opus 4.1, Gemini 2.5 Pro, and Grok 4 — maximum lab/product name recognition."
    330     }
    331   },
    332   "hn_data": {
    333     "threads": [
    334       {
    335         "hn_id": "33314496",
    336         "title": "A study of malicious CVE proof of concept exploits in GitHub",
    337         "points": 3,
    338         "comments": 0,
    339         "url": "https://news.ycombinator.com/item?id=33314496",
    340         "created_at": "2022-10-24T09:30:54Z"
    341       },
    342       {
    343         "hn_id": "45836230",
    344         "title": "The Distribution of Earth-Impacting Interstellar Objects",
    345         "points": 1,
    346         "comments": 0,
    347         "url": "https://news.ycombinator.com/item?id=45836230",
    348         "created_at": "2025-11-06T15:21:24Z"
    349       }
    350     ],
    351     "top_points": 3,
    352     "total_points": 4,
    353     "total_comments": 0
    354   }
    355 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs