ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32532B)


      1 {
      2   "paper": {
      3     "title": "GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks",
      4     "authors": [
      5       "Tejal Patwardhan",
      6       "Rachel Dias",
      7       "Elizabeth Proehl",
      8       "Grace Kim",
      9       "Michele Wang",
     10       "Olivia Watkins",
     11       "Simón Posada Fishman",
     12       "Marwan Aljubeh",
     13       "Phoebe Thacker",
     14       "Laurance Fauconnet",
     15       "Natalie S. Kim",
     16       "Patrick Chao",
     17       "Samuel Miserendino",
     18       "Gildas Chabot",
     19       "David Li",
     20       "Michael Sharman",
     21       "Alexandra Barr",
     22       "Amelia Glaese",
     23       "Jerry Tworek"
     24     ],
     25     "year": 2025,
     26     "venue": "arXiv",
     27     "arxiv_id": "2510.04374",
     28     "doi": "10.48550/arXiv.2510.04374"
     29   },
     30   "scan_version": 2,
     31   "active_modules": ["experimental_rigor", "data_leakage"],
     32   "checklist": {
     33     "artifacts": {
     34       "code_released": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No source code repository URL is provided. The paper open-sources task prompts and reference files, and provides an automated grading service at evals.openai.com, but no code repository (e.g., GitHub) for the evaluation pipeline, grading code, or analysis scripts is released."
     38       },
     39       "data_released": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 4 states: 'We open-source the prompts and reference files in our 220-task gold subset.' The 220-task gold subset including prompts and reference files is publicly available."
     43       },
     44       "environment_specified": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Appendix A.6.4 provides an extensive list of pre-installed Python packages with specific version numbers in the Docker image used for grading and model sampling."
     48       },
     49       "reproduction_instructions": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The automated grader at evals.openai.com is available, but the paper does not include instructions for reproducing the full evaluation pipeline (sampling, grading, analysis)."
     53       }
     54     },
     55     "statistical_methodology": {
     56       "confidence_intervals_or_error_bars": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section A.6.2 states: 'Plots below show 95% confidence intervals obtained by bootstrapping.' Figure 16 shows confidence intervals for both human-automated grader agreement and human inter-rater agreement."
     60       },
     61       "significance_tests": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper compares model win rates against human baselines and across models but does not apply any statistical significance tests (p-values, t-tests, or other hypothesis tests). Claims like 'Claude Opus 4.1 was the best performing model' are based on comparing raw percentages without formal testing."
     65       },
     66       "effect_sizes_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Win rates are reported with baseline context (e.g., '47.6% of deliverables by Claude Opus 4.1 were graded as better than or as good as the human deliverable'). Speed ratios (90x-327x), cost ratios, and prompting improvements ('5 percentage points') are reported with sufficient context to assess magnitude."
     70       },
     71       "sample_size_justified": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The paper uses 220 gold subset tasks and 1,320 full set tasks with at least 30 tasks per occupation, but provides no power analysis or explicit justification for why these sample sizes are sufficient for the claims made."
     75       },
     76       "variance_reported": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Bootstrapped 95% confidence intervals are reported in Section A.6.2 and Figure 16. The paper states 9 comparisons per prompt per model (3 samples × 3 graders), and bootstrapping provides variance estimates."
     80       }
     81     },
     82     "evaluation_design": {
     83       "baselines_included": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The primary baseline is human expert deliverables. Models are compared against human expert completions in blinded pairwise comparisons (Section 2.5, fig. 5)."
     87       },
     88       "baselines_contemporary": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper evaluates GPT-4o, o4-mini, o3, GPT-5, Claude Opus 4.1, Gemini 2.5 Pro, and Grok 4 — all current frontier models as of 2025."
     92       },
     93       "ablation_study": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 3.4 tests reasoning effort (low/medium/high), prompt tuning, and scaffolding improvements (GET requests, best-of-N sampling). Section A.2.7 tests under-contextualized prompts. These function as ablations of key experimental variables."
     97       },
     98       "multiple_metrics": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper reports win rates, speed ratios, cost ratios, human-automated grader agreement, inter-rater agreement, failure severity categorization, and breakdowns by sector, occupation, deliverable type, and task duration."
    102       },
    103       "human_evaluation": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Human expert pairwise comparison is the primary evaluation metric. Section 2.5: 'blinded expert pairwise comparisons, where experts in the relevant occupation were presented with a request and reference files and asked to rank two or more unlabeled work deliverables.' Grading averaged over an hour per comparison."
    107       },
    108       "held_out_test_set": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The gold subset of 220 tasks is used for all main experiments and is also open-sourced. The prompt-tuning experiment (Section A.3) and scaffolding improvements were developed and evaluated on the same gold subset, with no clear separation of development and test data."
    112       },
    113       "per_category_breakdown": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Breakdowns are provided by sector (fig. 10), occupation (fig. 11), deliverable file type (fig. 12), time to complete (fig. 13), and failure severity (fig. 14)."
    117       },
    118       "failure_cases_discussed": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 3.3 and fig. 8 analyze why experts preferred or rejected model deliverables (instruction-following failures, formatting errors, hallucinated data, missing deliverables). Section A.2.6 categorizes GPT-5 failures as catastrophic (~3%), bad (~26%), or acceptable but subpar (~48%)."
    122       },
    123       "negative_results_reported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The under-contextualized experiment (Section A.2.7) shows degraded performance. GPT-4o achieves only 12.5% win rate. Formatting errors persist even after prompt tuning (86% → 64% for PowerPoint). Longer tasks show steadily declining win rates (fig. 13)."
    127       }
    128     },
    129     "claims_and_evidence": {
    130       "abstract_claims_supported": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Abstract claims about linear improvement (fig. 6), approaching expert parity (fig. 5), speed/cost savings (Table 2, fig. 7), reasoning effort improvements (fig. 9a), and open-sourcing (Section 4) are all supported by data presented in the paper."
    134       },
    135       "causal_claims_justified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Causal claims about reasoning effort improving performance are supported by controlled single-variable experiments (fig. 9a, varying reasoning level while holding model constant). The prompt-tuning claim is supported by a before/after intervention design (fig. 9b). These are adequate for the causal claims made."
    139       },
    140       "generalization_bounded": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 5 (Limitations) explicitly bounds the results: 'only 44 occupations and 30 total tasks per occupation,' 'oriented around knowledge work that can be performed on a computer,' 'Manual labor and physical tasks are not included,' and tasks are 'precisely-specified and one-shot, not interactive.'"
    144       },
    145       "alternative_explanations_discussed": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper does not substantively discuss alternative explanations for the core findings. For example, the linear improvement claim (fig. 6) does not consider whether the trend could be driven by improved prompting/tooling rather than raw model capability. The win rate differences across models are not analyzed for confounds beyond stylistic identification."
    149       },
    150       "proxy_outcome_distinction": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The introduction explicitly distinguishes between measuring AI capabilities and predicting economic impact: 'While informative when available, these methods are lagging indicators of AI impacts. We consider an alternate method for understanding the potential economic impacts of AI: directly measuring AI model capabilities.' The limitations section further notes the gap between one-shot task completion and real-world work."
    154       }
    155     },
    156     "setup_transparency": {
    157       "model_versions_specified": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "Models are referred to by marketing names only: 'GPT-4o, o4-mini, o3, GPT-5, Claude Opus 4.1, Gemini 2.5 Pro, and Grok 4.' No API versions, snapshot dates, or model IDs are provided. Model behavior changes across versions."
    161       },
    162       "prompts_provided": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The 220 task prompts and reference files are open-sourced (Section 4). The scaffolding/quality-check prompt used for the prompt-tuning experiment is provided verbatim in Section A.3."
    166       },
    167       "hyperparameters_reported": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No temperature, top-p, or max tokens settings are reported for any model. Reasoning effort levels (low/medium/high) are noted but these are not standard hyperparameters. Claude was 'sampled via the UI' with no parameter specification."
    171       },
    172       "scaffolding_described": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The paper describes: web search and code interpreter tools enabled for OpenAI models, Claude sampled via UI with 'Upgraded file creation and analysis' feature, background sampling, pre-installed packages (Section A.6.4), best-of-N sampling with N=4 and GPT-5 judge, and GET requests enabled in the container."
    176       },
    177       "data_preprocessing_documented": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Sections 2.1-2.4 detail the full pipeline: sector selection by GDP contribution, occupation selection by total wages and digital classification, expert recruitment with criteria, task creation against O*NET categories, and multi-stage quality control (model screening + 5+ human reviews). Appendix A.7 provides further methodological detail."
    181       }
    182     },
    183     "limitations_and_scope": {
    184       "limitations_section_present": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 5 is a dedicated 'Limitations' section covering dataset size, focus on knowledge work, task specification, grader performance, and cost."
    188       },
    189       "threats_to_validity_specific": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The limitations are specific to this study: 'only 44 occupations and 30 total tasks per occupation,' 'Manual labor and physical tasks are not included,' 'tasks that involve extensive tacit knowledge, access to personally identifiable information, use of proprietary software tools, or communication between individuals are out of scope.' Expert time self-reporting bias is acknowledged in footnote 6."
    193       },
    194       "scope_boundaries_stated": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 5 explicitly states what's excluded: manual labor, physical tasks, tasks requiring tacit knowledge, PII access, proprietary tools, or interpersonal communication. Tasks are described as 'precisely-specified and one-shot, not interactive,' and the benchmark covers only 'a limited, initial cut of knowledge work tasks.'"
    198       }
    199     },
    200     "data_integrity": {
    201       "raw_data_available": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "While the 220 gold subset task prompts and reference files are released, the raw grading data (expert judgments, justifications, scores), model outputs, and human expert deliverables are not made available for independent verification."
    205       },
    206       "data_collection_described": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Sections 2.1-2.5 describe the full data collection procedure: sector identification from GDP data, occupation selection via BLS wage data and digital classification, expert recruitment criteria, task creation against O*NET categories, and iterative quality control pipeline."
    210       },
    211       "recruitment_methods_described": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Section 2.2 describes expert recruitment: minimum 4 years experience, strong resume with professional recognition, video interview, background check, training and quiz. Fewer than 10% of applicants were selected (Section A.4.4). Each occupation had a minimum of 5 qualified professionals."
    215       },
    216       "data_pipeline_documented": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The pipeline is documented: sector selection (9 sectors >5% GDP) → occupation selection (5 per sector by wage contribution and digital classification) → expert recruitment (criteria, <10% acceptance) → task creation (with O*NET mapping) → quality control (model screening + 3-stage human review, average 5 reviews per task). Figure 3 illustrates the review pipeline."
    220       }
    221     },
    222     "conflicts_of_interest": {
    223       "funding_disclosed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding disclosure is present. All authors are affiliated with OpenAI, which implicitly funds the research, but there is no explicit funding statement or acknowledgment of funding sources."
    227       },
    228       "affiliations_disclosed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "All authors are listed under the OpenAI affiliation prominently at the top of the paper."
    232       },
    233       "funder_independent_of_outcome": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "OpenAI has a direct financial interest in demonstrating that its GPT models perform well on economically valuable tasks. The benchmark was designed and evaluated by OpenAI employees testing OpenAI products (GPT-4o, o4-mini, o3, GPT-5) alongside competitors."
    237       },
    238       "financial_interests_declared": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No competing interests or financial interests statement is provided. All authors are OpenAI employees whose livelihoods depend on the commercial success of the models being evaluated."
    242       }
    243     },
    244     "contamination": {
    245       "training_cutoff_stated": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No training data cutoff dates are provided for any of the evaluated models (GPT-4o, o4-mini, o3, GPT-5, Claude Opus 4.1, Gemini 2.5 Pro, Grok 4)."
    249       },
    250       "train_test_overlap_discussed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "The paper does not discuss whether any of the evaluated models could have been trained on content similar to the expert-created tasks, reference files, or related professional materials."
    254       },
    255       "benchmark_contamination_addressed": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "Although GDPval tasks are newly created from expert work (reducing contamination risk), the paper does not explicitly discuss contamination risk or the advantage of novel tasks for contamination avoidance."
    259       }
    260     },
    261     "human_studies": {
    262       "pre_registered": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "The paper is a benchmark evaluation of AI models. The human experts serve as task creators and evaluators, not as subjects of a human-subjects study."
    266       },
    267       "irb_or_ethics_approval": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "This is a benchmark evaluation, not a human-subjects study. Experts are compensated professional evaluators, not research subjects."
    271       },
    272       "demographics_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "The paper is a benchmark evaluation. Expert demographics (years of experience, prior employers) are reported for credibility context, but the study does not investigate human behavior."
    276       },
    277       "inclusion_exclusion_criteria": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "The paper is a benchmark evaluation. Expert selection criteria are described for methodological transparency, but this is not a human-subjects study."
    281       },
    282       "randomization_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "Not a human-subjects experimental study. No treatment/control assignment of human participants."
    286       },
    287       "blinding_described": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "Not a human-subjects study. Blinding of graders to model identity is described for evaluation quality purposes but is not a human-subjects blinding protocol."
    291       },
    292       "attrition_reported": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "Not a human-subjects study. Expert participation rates do not constitute human-subjects attrition."
    296       }
    297     },
    298     "cost_and_practicality": {
    299       "inference_cost_reported": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Table 2 reports cost improvement ratios for OpenAI models. Section A.2.1 defines model completion cost MC and states 'we recorded the average invoiced cost per task.' Average human completion cost HC = $361 is reported."
    303       },
    304       "compute_budget_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No total computational budget is stated for the evaluation (total API spend across all models, total expert hours, total project cost). Per-task costs and ratios are reported but not the aggregate compute budget."
    308       }
    309     },
    310     "experimental_rigor": {
    311       "seed_sensitivity_reported": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "While 3 samples per model per prompt are collected, no explicit seed sensitivity analysis is performed or reported. The paper does not discuss how results vary across random seeds or sampling runs."
    315       },
    316       "number_of_runs_stated": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Footnote 2 states: 'we sampled each model 3 times for each prompt, and then had 3 different human graders grade each sample (yielding 9 comparisons per prompt, per model, across 220 tasks).'"
    320       },
    321       "hyperparameter_search_budget": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "No hyperparameter search budget is reported. The prompt-tuning experiment (Section A.3) and scaffolding choices appear to have been iteratively developed but no search budget or number of configurations tried is stated."
    325       },
    326       "best_config_selection_justified": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The prompt-tuning and scaffolding configurations (Section A.3, best-of-N with N=4) are presented without justification for how they were selected from alternatives or whether they were tuned on the same data used for evaluation."
    330       },
    331       "multiple_comparison_correction": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper compares 7 models across multiple sectors, occupations, and deliverable types without applying any correction for multiple comparisons."
    335       },
    336       "self_comparison_bias_addressed": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "While the paper acknowledges automated grader self-bias ('models often favor their own responses,' Section A.6.2), it does not address the broader institutional bias of OpenAI designing a benchmark, controlling the experimental setup, selecting which models to test, and evaluating its own commercial products."
    340       },
    341       "compute_budget_vs_performance": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "Models are compared at different reasoning efforts for the same model family (fig. 9a), but cross-model comparisons do not account for compute budget differences. The naive speed ratios (Table 2) vary from 90x to 327x across models, suggesting very different compute usage that is not factored into performance comparisons."
    345       },
    346       "benchmark_construct_validity": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "Section A.7.1 validates the digital-task classification against the Acemoglu & Autor (2011) task-content framework, showing alignment with established economic measures. Expert representativeness ratings average 4.50/5.00 (Table 3). O*NET coverage analysis (Table 6) shows 71.4% skills coverage and 63.4% work activities coverage."
    350       },
    351       "scaffold_confound_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Different models received different scaffolding: OpenAI models got web search and code interpreter tools; Claude was sampled via UI with 'Upgraded file creation and analysis'; packages were preinstalled. Footnote 2 notes these differences but does not address how they confound model comparisons. Performance differences may reflect scaffolding quality rather than model capability."
    355       }
    356     },
    357     "data_leakage": {
    358       "temporal_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Tasks are newly created by experts, which inherently reduces temporal leakage risk, but the paper does not discuss temporal leakage or explicitly note the advantage of using novel tasks."
    362       },
    363       "feature_leakage_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether reference files or task contexts could inadvertently leak information about the expected deliverable quality or structure."
    367       },
    368       "non_independence_addressed": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No discussion of independence between tasks. Multiple tasks per occupation may share structural similarities, and the paper does not analyze whether this affects results."
    372       },
    373       "leakage_detection_method": {
    374         "applies": true,
    375         "answer": false,
    376         "justification": "No concrete leakage detection or prevention methods are described or applied."
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "Frontier model performance on GDPval is improving roughly linearly over time.",
    383       "evidence": "Figure 6 shows performance of OpenAI frontier models (GPT-4o through GPT-5) increasing roughly linearly on the gold subset.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "The current best frontier models are approaching industry experts in deliverable quality, with 47.6% of Claude Opus 4.1 deliverables graded as better than or equal to human expert deliverables.",
    388       "evidence": "Figure 5 shows win rates for 7 models. Claude Opus 4.1 achieves the highest combined win+tie rate of 47.6%. GPT-5 achieves ~39% win rate.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Models can save time and money relative to unaided experts when paired with human oversight.",
    393       "evidence": "Table 2 and Section A.2.1 show speed and cost improvement ratios under 'try n times, then fix it' setups. GPT-5 shows 1.39x speed improvement and 1.63x cost improvement under try-nx.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Increased reasoning effort improves model performance on GDPval.",
    398       "evidence": "Figure 9a shows win rates increasing from low to medium to high reasoning effort for both o3 and GPT-5.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Increased task context improves model performance.",
    403       "evidence": "Section A.2.7 shows GPT-5 performance degrades on under-contextualized prompts (42% of original token length), demonstrating context sensitivity.",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "Prompt-tuning and scaffolding improvements increase GPT-5 performance by 5 percentage points.",
    408       "evidence": "Figure 9b shows improvement from prompt-tuning. Prompting 'fully eliminated black-square artifacts' and 'reduced egregious formatting errors in PowerPoint files from 86% to 64%.'",
    409       "supported": "moderate"
    410     },
    411     {
    412       "claim": "Claude Opus 4.1 excels on aesthetics while GPT-5 excels on accuracy.",
    413       "evidence": "Figure 8 and Section 3.1 analyze failure modes. Section A.2.4 shows GPT-5 leads for pure text while Claude leads for .pdf, .xlsx, .pptx deliverables.",
    414       "supported": "moderate"
    415     }
    416   ],
    417   "methodology_tags": ["benchmark-eval"],
    418   "key_findings": "GDPval is a benchmark of 1,320 real-world economically valuable tasks across 44 occupations and 9 GDP sectors; the reported results evaluate frontier AI models on its 220-task gold subset. Claude Opus 4.1 achieves the highest win+tie rate (47.6%) against human expert deliverables, with GPT-5 close behind (39% wins). OpenAI model performance shows roughly linear improvement over time. Increased reasoning effort, context, and scaffolding all predictably improve performance, with prompt-tuning alone adding 5 percentage points to GPT-5 win rates.",
    419   "red_flags": [
    420     {
    421       "flag": "Company evaluating its own products",
    422       "detail": "All authors are OpenAI employees. OpenAI designed the benchmark, controlled the experimental setup, selected models, and evaluated its own commercial products (GPT-4o, o4-mini, o3, GPT-5) alongside competitors. The paper does not acknowledge this institutional conflict of interest despite it being a major potential source of bias."
    423     },
    424     {
    425       "flag": "Non-equivalent scaffolding across models",
    426       "detail": "OpenAI models received web search tools, code interpreter, and pre-installed packages. Claude was sampled via the UI. Different models had access to different capabilities, making win rate comparisons reflect scaffolding quality as much as model capability. The scaffold confound is noted but not addressed."
    427     },
    428     {
    429       "flag": "Self-favoring automated grader",
    430       "detail": "The automated grader is based on GPT-5-high and shows lower agreement with human experts for OpenAI models. The authors acknowledge this bias (Section A.6.2) but the automated grader is still released as the primary grading service."
    431     },
    432     {
    433       "flag": "Linear trend from only 4 data points",
    434       "detail": "The claim of 'roughly linear' improvement (fig. 6) is based on only 4 OpenAI models. This is too few points to establish a trend, and the claim does not extend to non-OpenAI models."
    435     },
    436     {
    437       "flag": "Incomplete cost comparison",
    438       "detail": "Cost analyses (Table 2) cover only OpenAI models. Footnote 4 states: 'We were not able to obtain cost estimates for Claude, Gemini, and Grok.' This makes the cost-effectiveness claims incomplete and potentially misleading."
    439     },
    440     {
    441       "flag": "No statistical significance testing",
    442       "detail": "Win rate differences across models (e.g., Claude 47.6% vs GPT-5 39%) are compared without statistical tests. With 220 tasks and 9 comparisons each, sampling variability could account for some differences, but no formal testing is performed."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "Generative AI at work",
    448       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    449       "year": 2025,
    450       "relevance": "Empirical study of generative AI productivity effects in the workplace, directly relevant to AI-augmented productivity claims."
    451     },
    452     {
    453       "title": "How people use ChatGPT",
    454       "authors": ["Aaron Chatterji", "Thomas Cunningham", "David J Deming"],
    455       "year": 2025,
    456       "relevance": "Large-scale study of real-world AI usage patterns, relevant to understanding AI adoption and economic impact."
    457     },
    458     {
    459       "title": "Clio: Privacy-preserving insights into real-world AI use",
    460       "authors": ["Alex Tamkin", "Miles McCain", "Kunal Handa"],
    461       "year": 2024,
    462       "arxiv_id": "2412.13678",
    463       "doi": "10.48550/arXiv.2412.13678",
    464       "relevance": "Privacy-preserving analysis of real-world AI usage data, relevant to understanding AI economic adoption patterns."
    465     },
    466     {
    467       "title": "GPTs are GPTs: An early look at the labor market impact potential of large language models",
    468       "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
    469       "year": 2023,
    470       "arxiv_id": "2303.10130",
    471       "relevance": "Foundational paper on LLM labor market exposure using O*NET task framework, directly comparable methodology to GDPval."
    472     },
    473     {
    474       "title": "SWE-Lancer: Can frontier LLMs earn $1 million from real-world freelance software engineering?",
    475       "authors": ["Samuel Miserendino", "Michele Wang", "Tejal Patwardhan", "Johannes Heidecke"],
    476       "year": 2025,
    477       "arxiv_id": "2502.12115",
    478       "relevance": "Benchmark evaluating AI on real-world software engineering freelance tasks with economic value framing."
    479     },
    480     {
    481       "title": "Measuring massive multitask language understanding",
    482       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    483       "year": 2020,
    484       "doi": "10.48550/arXiv.2009.03300",
    485       "arxiv_id": "2009.03300",
    486       "relevance": "Major LLM benchmark (MMLU) for evaluating model capabilities across academic domains."
    487     },
    488     {
    489       "title": "GPQA: A graduate-level Google-proof Q&A benchmark",
    490       "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland"],
    491       "year": 2023,
    492       "doi": "10.48550/arXiv.2311.12022",
    493       "arxiv_id": "2311.12022",
    494       "relevance": "Expert-level benchmark requiring graduate knowledge, relevant to AI capability evaluation methodology."
    495     },
    496     {
    497       "title": "AgentBench: Evaluating LLMs as agents",
    498       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    499       "year": 2023,
    500       "doi": "10.48550/arXiv.2308.03688",
    501       "relevance": "Benchmark for evaluating LLMs as agents on diverse tasks, relevant to agentic AI evaluation."
    502     },
    503     {
    504       "title": "LLM evaluators recognize and favor their own generations",
    505       "authors": ["Arjun Panickssery", "Samuel R. Bowman", "Shi Feng"],
    506       "year": 2024,
    507       "arxiv_id": "2404.13076",
    508       "relevance": "Documents self-preference bias in LLM evaluators, directly relevant to GDPval's automated grader limitations."
    509     },
    510     {
    511       "title": "Humanity's last exam",
    512       "authors": ["Long Phan"],
    513       "year": 2025,
    514       "arxiv_id": "2501.14249",
    515       "relevance": "Frontier AI capability benchmark testing expert-level reasoning, comparable benchmark evaluation methodology."
    516     },
    517     {
    518       "title": "The simple macroeconomics of AI",
    519       "authors": ["Daron Acemoglu"],
    520       "year": 2025,
    521       "relevance": "Economic analysis of AI's macroeconomic impact, provides theoretical context for GDPval's economic framing."
    522     },
    523     {
    524       "title": "Displacement or complementarity?: The labor market impact of generative AI",
    525       "authors": ["Wilbur Xinyuan Chen", "Suraj Srinivasan", "Saleh Zakerinia"],
    526       "year": 2025,
    527       "relevance": "Studies whether AI displaces or complements human labor, directly relevant to GDPval's economic impact framing."
    528     }
    529   ]
    530 }

Impressum · Datenschutz