scan-v5.json (21420B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "GAIA: a benchmark for General AI Assistants", 6 "authors": [ 7 "Grégoire Mialon", 8 "Clémentine Fourrier", 9 "Craig Swift", 10 "Thomas Wolf", 11 "Yann LeCun", 12 "Thomas Scialom" 13 ], 14 "year": 2023, 15 "venue": "arXiv.org", 16 "arxiv_id": "2311.12983", 17 "doi": "10.48550/arXiv.2311.12983" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "The 92% human vs ~15% GPT-4+plugins claim is confirmed by Table 4 (aggregate: ~91.7% human, ~14.6% GPT-4+plugins across 466 questions). The performance disparity and difficulty levels are all supported by reported results.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper claims 'augmenting LLMs via tool APIs or access to the web improves answer accuracy,' but the GPT-4+plugins condition involves manually human-selected plugins, which the paper itself acknowledges as an 'oracle estimate' that 'cannot be reproduced exactly.' 
This confound makes causal attribution to tool augmentation unreliable.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper explicitly states 'GAIA is only a first step to estimate the potential of AI assistants, but should not be seen as an absolute general proof of their success,' and Section 6 bounds the benchmark to English-language questions with compensated US-based annotators.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not systematically consider alternative explanations for the human-AI performance gap, such as whether question design inherently favors human reasoning styles, or whether the specific selection of factoid questions introduces its own biases unrelated to general AI capability.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper treats exact-match accuracy on 466 factoid questions as a measure of 'General AI Assistant' capability without formally arguing the validity of this proxy. 
The connection between answering specific real-world questions and broader assistant capability is assumed rather than demonstrated.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 6 is a dedicated 'Limitations' section covering missing trace evaluation, the cost of unambiguous question design, and lack of linguistic/cultural diversity.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "The paper identifies specific threats: (1) GPT-4+plugins scores are not reproducible due to manual plugin selection, (2) questions may decay as web sources change, (3) only English is supported excluding ~80% of the global population, and (4) 32% of initially crafted questions required repair or removal.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper explicitly states GAIA 'should not be seen as an absolute general proof' of AI assistant success, notes English-only coverage, and acknowledges that the benchmark does not evaluate reasoning traces or tool-call logs.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "There is no funding disclosure in the paper. 
Surge AI annotators are mentioned as 'compensated' in a footnote, but no funding source for the research itself is disclosed.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations are listed in the header: FAIR Meta, HuggingFace, AutoGPT, and GenAI Meta.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": false, 87 "answer": false, 88 "justification": "No funder is disclosed, so independence cannot be assessed.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement appears anywhere in the paper. Meta authors evaluating OpenAI's GPT-4 creates a potential indirect conflict that is undisclosed.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": false, 102 "justification": "'General AI Assistant' — the central concept — is never formally defined; it is treated as self-evident. 't-AGI' is defined only via a footnote link. The capability categories (web browsing, multi-modality, etc.) 
are defined in Appendix C but the core benchmark concept is not formally grounded.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper clearly states it contributes a benchmark of 466 questions, a design methodology for creating further questions, a leaderboard, and an analysis of current LLM performance on real-world assistant tasks.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2 provides detailed comparison with prior benchmarks (MMLU, GLUE, AgentBench, ToolQA, OpenAGI, Gorilla) explaining specific differences in design philosophy, evaluation environment, and contamination risk.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "benchmark-creation": { 121 "construct_design": { 122 "construct_validity_argued": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper argues that real-world multi-step factoid questions requiring tool use measure 'fundamental abilities' of general AI assistants, explicitly invoking Chollet (2019) and the Proof of Work analogy to justify that successful completion reflects genuine capability rather than pattern matching.", 126 "source": "haiku" 127 }, 128 "difficulty_distribution_characterized": { 129 "applies": true, 130 "answer": true, 131 "justification": "Three difficulty levels are defined by number of steps and tools (Figure 3), with distributions shown graphically and validated by correlation with model performance degradation (Table 4: Level 1 30.3% → Level 3 0% for best LLM).", 132 "source": "haiku" 133 }, 134 "ceiling_floor_effects_checked": { 135 "applies": true, 136 "answer": false, 137 "justification": "All evaluated LLMs score 0% on Level 3 questions, which is a floor effect that limits discriminability among models at the hardest tier. 
The paper presents this data but does not address it as a floor effect or discuss implications for inter-model ranking at Level 3.", 138 "source": "haiku" 139 }, 140 "human_baseline_included": { 141 "applies": true, 142 "answer": true, 143 "justification": "Human annotator performance is reported as 92% aggregate and broken down by level (94%/92%/87% for Levels 1/2/3 in Table 4), derived from the validation annotation process.", 144 "source": "haiku" 145 }, 146 "scoring_rubric_justified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Quasi-exact-match scoring is justified by the design requirement for unambiguous factoid answers; the justification is that factoid answers enable 'simple, fast and factual evaluation.' The scoring function is released with the leaderboard, though detailed normalization rules are not fully described in the paper.", 150 "source": "haiku" 151 } 152 }, 153 "robustness": { 154 "contamination_resistance_designed": { 155 "applies": true, 156 "answer": true, 157 "justification": "Non-gameability is a stated design principle: answers are absent from pre-training data by design, question diversity prevents brute-forcing, and reasoning traces can be checked. Multiple-choice formats are explicitly avoided because they make contamination harder to detect.", 158 "source": "haiku" 159 }, 160 "temporal_robustness_discussed": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 5 addresses static vs. dynamic benchmark decay, identifying risks of web evidence disappearing or contamination entering future pre-training data, and proposes annual question refresh as a mitigation strategy.", 164 "source": "haiku" 165 }, 166 "failure_modes_discussed": { 167 "applies": true, 168 "answer": true, 169 "justification": "Failure modes discussed include: web sources changing over time, closed-source API behavior changing (Chen et al. 
2023), inability to evaluate reasoning traces, robots.txt compliance issues, and the benchmark's English-only coverage limiting generalization.", 170 "source": "haiku" 171 }, 172 "baseline_implementations_provided": { 173 "applies": true, 174 "answer": true, 175 "justification": "The scoring function is released alongside the leaderboard at huggingface.co/gaia-benchmark, 166 questions with answers are released as a developer set, and AutoGPT's git hash is provided (ed172dec). GPT-4+plugins baseline is explicitly noted as non-reproducible.", 176 "source": "haiku" 177 } 178 }, 179 "documentation": { 180 "dataset_documentation_complete": { 181 "applies": true, 182 "answer": true, 183 "justification": "Appendix B provides a data card following Bender & Friedman (2018) covering language variety, annotator demographics (age, gender, academic background), curation rationale, and text characteristics. The question creation and validation process is described in detail in Section 3.4.", 184 "source": "haiku" 185 }, 186 "licensing_and_access_clear": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper states questions are released on HuggingFace with 300 held for the leaderboard, but no license (e.g., CC-BY, MIT) is stated in the paper for use, modification, or redistribution of the dataset.", 190 "source": "haiku" 191 }, 192 "intended_use_specified": { 193 "applies": true, 194 "answer": true, 195 "justification": "The paper specifies GAIA is for evaluating general AI assistants in zero-shot settings, explicitly states it 'should not be seen as an absolute general proof' of success, and discusses what the benchmark does NOT evaluate (reasoning traces, fine-grained tool attribution).", 196 "source": "haiku" 197 } 198 } 199 } 200 }, 201 "claims": [ 202 { 203 "claim": "Human respondents achieve 92% accuracy on GAIA while GPT-4 with oracle-selected plugins achieves only ~15%.", 204 "evidence": "Table 4 shows human annotators at 93.9%/91.8%/87.3% across 
levels; GPT-4+plugins at 30.3%/9.7%/0%, aggregating to ~14.6%.", 205 "supported": "strong" 206 }, 207 { 208 "claim": "GAIA difficulty levels (1/2/3) are validated by their correlation with model performance.", 209 "evidence": "Table 4 shows monotonic performance degradation from Level 1 to Level 3 for all systems, supporting the difficulty ordering.", 210 "supported": "strong" 211 }, 212 { 213 "claim": "AutoGPT with GPT-4 backend underperforms plain GPT-4 on GAIA Level 2 questions despite having tool access.", 214 "evidence": "Table 4: AutoGPT scores 14.4%/0.4%/0% vs. GPT-4 (no tools) at 9.1%/2.6%/0%; AutoGPT is better at Level 1 (14.4% vs. 9.1%), worse at Level 2 (0.4% vs. 2.6%), and tied at 0% on Level 3.", 215 "supported": "strong" 216 }, 217 { 218 "claim": "Tool augmentation (web browsing, code execution) significantly improves LLM performance on GAIA.", 219 "evidence": "GPT-4 + plugins scores 30.3% at Level 1 vs. GPT-4 without tools at 9.1%; however, plugin selection was manual/oracle, acknowledged as non-reproducible.", 220 "supported": "moderate" 221 }, 222 { 223 "claim": "GAIA questions are resistant to contamination because answers are absent from pre-training data.", 224 "evidence": "The paper asserts this by design principle but does not empirically verify that answers are absent from training corpora of evaluated models.", 225 "supported": "weak" 226 }, 227 { 228 "claim": "68% of crafted GAIA questions were valid without modification, with Level 3 questions having the lowest initial validity (47%).", 229 "evidence": "Table 3 reports validation statistics: 68% valid overall, 75%/68%/47% for Levels 1/2/3.", 230 "supported": "strong" 231 } 232 ], 233 "methodology_tags": [ 234 "benchmark-eval" 235 ], 236 "key_findings": "GAIA reveals a stark capability gap: human annotators achieve 92% accuracy while the best-equipped LLM (GPT-4 with oracle-selected plugins) reaches only ~15%, with all models scoring 0% on Level 3 questions. 
The benchmark's three-level difficulty structure is empirically validated by monotonic performance degradation across all evaluated systems. Surprisingly, AutoGPT with automatic tool selection underperforms plain GPT-4 at Level 2, suggesting that autonomous agent orchestration remains immature. The 466-question benchmark is designed around four principles — real-world grounding, interpretability, contamination resistance, and ease of use — and is hosted as a live leaderboard with the developer set publicly released.", 237 "red_flags": [ 238 { 239 "flag": "Oracle plugin selection", 240 "detail": "GPT-4+plugins scores were obtained by a human manually selecting the best plugins per question. The paper acknowledges this is an 'oracle estimate' that 'cannot be reproduced exactly.' This is the strongest-performing condition yet is not a realistic or reproducible experimental setup." 241 }, 242 { 243 "flag": "Level 3 floor effect unaddressed", 244 "detail": "All LLMs score 0% on Level 3 (75 questions), creating a floor that prevents any ranking among models on the hardest tier. The paper presents this data but does not analyze whether Level 3 questions are too difficult to be useful discriminators." 245 }, 246 { 247 "flag": "No competing interests statement", 248 "detail": "Meta FAIR authors evaluate OpenAI's GPT-4 critically, with a potential indirect interest in demonstrating GPT-4's weaknesses. No competing interests or independence statement is provided." 249 }, 250 { 251 "flag": "32% question failure rate", 252 "detail": "Table 3 shows only 68% of crafted questions were valid; the remainder required correction or removal. At Level 3, only 47% passed initial validation. This suggests significant annotation effort and possible quality variation in surviving questions." 
253 }, 254 { 255 "flag": "No dataset license stated", 256 "detail": "The paper does not specify a license for the released questions, creating uncertainty about permissible downstream use for research or commercial applications." 257 }, 258 { 259 "flag": "Contamination claim unverified", 260 "detail": "The claim that GAIA answers are absent from pre-training data is a design principle, not an empirically verified property. No membership inference or data provenance analysis is provided." 261 } 262 ], 263 "cited_papers": [ 264 { 265 "title": "On the Measure of Intelligence", 266 "relevance": "Core theoretical framework for evaluating AI capability; GAIA explicitly builds on Chollet's argument for benchmarks requiring flexible reasoning rather than specialized skills." 267 }, 268 { 269 "title": "Measuring Massive Multitask Language Understanding (MMLU)", 270 "relevance": "Primary benchmark GAIA positions itself against; MMLU saturation motivates GAIA's different design philosophy targeting real-world tasks rather than multiple-choice knowledge." 271 }, 272 { 273 "title": "AgentBench: Evaluating LLMs as Agents", 274 "relevance": "Most similar prior work; GAIA distinguishes itself by using the open real world rather than closed environments, avoiding evaluation of API-specific knowledge." 275 }, 276 { 277 "title": "Holistic Evaluation of Language Models (HELM)", 278 "relevance": "Comprehensive benchmark compilation that GAIA critiques for difficulty of meaningful aggregation and sensitivity to experimental setup." 279 }, 280 { 281 "title": "ToolQA: A Dataset for LLM Question Answering with External Tools", 282 "relevance": "Related tool-augmented QA benchmark; GAIA argues ToolQA risks contamination by reusing existing datasets." 283 }, 284 { 285 "title": "Augmented Language Models: A Survey", 286 "relevance": "Survey of tool-augmented LLMs authored by overlapping GAIA authors; provides the context for why evaluating augmented assistants requires a new benchmark." 
287 }, 288 { 289 "title": "Levels of AGI: Operationalizing Progress on the Path to AGI", 290 "relevance": "AGI taxonomy used to situate GAIA's purpose; paper argues a GAIA-solving system would qualify as a 'competent General AI' under Morris et al.'s framework." 291 }, 292 { 293 "title": "WebGPT: Browser-Assisted Question-Answering with Human Feedback", 294 "relevance": "Key prior system for web-browsing LLMs that GAIA is designed to evaluate and surpass." 295 } 296 ], 297 "engagement_factors": { 298 "practical_relevance": { 299 "score": 3, 300 "justification": "Direct benchmark for practitioners building AI assistants; live leaderboard and released developer set enable immediate use." 301 }, 302 "surprise_contrarian": { 303 "score": 2, 304 "justification": "Dramatically reverses the 'LLMs surpass humans' narrative — humans dominate 92% vs 15% — while also finding AutoGPT underperforms plain GPT-4 at Level 2 (0.4% vs 2.6%) despite having tools." 305 }, 306 "fear_safety": { 307 "score": 1, 308 "justification": "Briefly mentions full automation reshaping the socio-economic landscape and favoring open-source, but this is not a primary focus." 309 }, 310 "drama_conflict": { 311 "score": 1, 312 "justification": "Implicitly critiques OpenAI (GPT-4 scores poorly, official search plugin was removed and brought back) but the paper avoids explicit conflict framing." 313 }, 314 "demo_ability": { 315 "score": 3, 316 "justification": "Live leaderboard at HuggingFace with 166 public questions allows anyone to test their system immediately." 317 }, 318 "brand_recognition": { 319 "score": 3, 320 "justification": "Meta FAIR and HuggingFace are high-profile institutions; Yann LeCun's name alone drives significant attention." 
321 } 322 }, 323 "hn_data": { 324 "threads": [ 325 { 326 "hn_id": "38388990", 327 "title": "Meta: Gaia - A Benchmark for General AI Assistants", 328 "points": 36, 329 "comments": 8, 330 "url": "https://news.ycombinator.com/item?id=38388990", 331 "created_at": "2023-11-23T03:43:15Z" 332 }, 333 { 334 "hn_id": "39143540", 335 "title": "The Optimal Choice of Hypothesis Is the Weakest, Not the Shortest", 336 "points": 5, 337 "comments": 1, 338 "url": "https://news.ycombinator.com/item?id=39143540", 339 "created_at": "2024-01-26T15:14:52Z" 340 }, 341 { 342 "hn_id": "42413236", 343 "title": "Fast and Efficient Memory Reclamation for Serverless MicroVMs", 344 "points": 2, 345 "comments": 0, 346 "url": "https://news.ycombinator.com/item?id=42413236", 347 "created_at": "2024-12-13T23:18:15Z" 348 }, 349 { 350 "hn_id": "37985842", 351 "title": "Eureka: Human-Level Reward Design via Coding Large Language Models", 352 "points": 2, 353 "comments": 0, 354 "url": "https://news.ycombinator.com/item?id=37985842", 355 "created_at": "2023-10-23T14:06:23Z" 356 }, 357 { 358 "hn_id": "37968009", 359 "title": "Eureka: Human-Level Reward Design via Coding Large Language Models", 360 "points": 2, 361 "comments": 0, 362 "url": "https://news.ycombinator.com/item?id=37968009", 363 "created_at": "2023-10-21T16:09:03Z" 364 }, 365 { 366 "hn_id": "29504080", 367 "title": "DeepMind's PolyViT: A multi-modal AI model", 368 "points": 2, 369 "comments": 0, 370 "url": "https://news.ycombinator.com/item?id=29504080", 371 "created_at": "2021-12-09T22:45:22Z" 372 }, 373 { 374 "hn_id": "38401986", 375 "title": "Gaia: A Benchmark for General AI Assistants", 376 "points": 1, 377 "comments": 0, 378 "url": "https://news.ycombinator.com/item?id=38401986", 379 "created_at": "2023-11-24T09:17:33Z" 380 }, 381 { 382 "hn_id": "38011924", 383 "title": "Eureka: Human level reward design via coding large language models", 384 "points": 1, 385 "comments": 0, 386 "url": "https://news.ycombinator.com/item?id=38011924", 387 
"created_at": "2023-10-25T12:09:56Z" 388 } 389 ], 390 "top_points": 36, 391 "total_points": 51, 392 "total_comments": 9 393 } 394 }