scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21105B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "GAIA: a benchmark for General AI Assistants",
      6     "authors": [
      7       "G. Mialon",
      8       "Clémentine Fourrier",
      9       "Craig Swift",
     10       "Thomas Wolf",
     11       "Yann LeCun",
     12       "Thomas Scialom"
     13     ],
     14     "year": 2023,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2311.12983",
     17     "doi": "10.48550/arXiv.2311.12983"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 92% human vs 15% GPT-4 performance, which is supported by Table 4. The claim that questions are 'conceptually simple for humans yet challenging for most advanced AIs' is supported by the 92% human baseline.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper makes causal claims like 'augmenting LLMs via tool APIs or access to the web improves answer accuracy' based on comparing GPT-4 with and without plugins, but this is confounded by plugin selection being done manually per question (an oracle setting the authors acknowledge).",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'General AI Assistants' but Section 6 acknowledges the benchmark is English-only and web-centric. However, the abstract and introduction frame GAIA as a milestone for AGI ('the advent of Artificial General Intelligence hinges on a system's capability to exhibit similar robustness') which significantly overstates the tested scope.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations for the human-AI gap. For example, the gap could be partly due to prompt engineering, plugin selection, or the specific types of questions chosen rather than fundamental capability differences.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper frames exact-match accuracy on 466 questions as measuring 'general AI assistant' capability and links it to AGI milestones, without discussing what 'general' means beyond the specific question types tested. The proxy gap between 'answers factoid questions correctly' and 'is a general AI assistant' is not acknowledged.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 'Limitations' is a dedicated section covering missing evaluations, cost of unambiguous question design, and lack of linguistic/cultural diversity.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 6 discusses specific threats: evidence changing over time on the web, reliance on English-only questions (80% of world population excluded), inability to reproduce GPT-4+plugins results due to plugin instability, and the cost and difficulty of designing unambiguous questions.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 6 explicitly states: 'GAIA is only a first step to estimate the potential of AI assistants, but should not be seen as an absolute general proof of their success.' It also notes the benchmark does not test actions beyond clicks, and does not evaluate reasoning traces.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding sources are disclosed. No acknowledgment of grants, corporate funding for annotators, or research support beyond the author affiliations.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: FAIR/Meta, HuggingFace, AutoGPT, GenAI/Meta. These are prominent in the header.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Authors are from Meta and HuggingFace, both with commercial interests in AI assistants. Meta competes with OpenAI; the benchmark primarily evaluates OpenAI's GPT-4. The funder independence is not discussed.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement is present. Authors from Meta and HuggingFace have clear commercial interests in the AI assistant space that are not formally declared.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Level 1/2/3 difficulty are operationally defined via steps and tools; capabilities (web browsing, multi-modality, coding, file reading) are defined in Appendix C with annotator-reported tool examples; AGI is linked to an external definition.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The contribution is clearly stated: a 466-question benchmark with design methodology, a released 166-question developer set, and a hosted leaderboard for evaluating general AI assistants.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 systematically contrasts GAIA with MMLU, AgentBench, ToolQA, Gentopia, OpenAGI, and HELM, explaining specifically how GAIA differs from and improves upon each prior approach.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The paper explicitly argues that GAIA measures 'General AI Assistant' capability because it requires multi-step real-world task completion (reasoning, tool use, multi-modality) and is grounded in the 'proof of work' analogy — the answer is only obtainable upon successful execution of all steps.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Three levels (146/245/75 questions) are characterized with annotator step counts and tool counts shown in Figure 3; time-to-answer by level is reported; correlation between steps and time is analyzed in Appendix C.",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "GPT-4 achieving 0% on Level 3 is noted as a floor effect signaling room for improvement; human scores of 87-94% by level show no ceiling effect; the paper explicitly frames these as validating the difficulty gradient.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": true,
    143           "justification": "Human baseline of 92% overall (94%/92%/87% by level) is prominently featured in both Table 3 and Table 4 alongside all AI system scores, derived from validation annotators.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Quasi-exact match is justified by the factoid, unambiguous nature of answers; normalization by answer type (string, number, comma-separated list) is explained; the system prompt format is published to ensure reproducible answer extraction.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Multiple anti-contamination measures are designed in: answers are absent from plain-text pre-training data by construction, multi-step completion prevents brute-forcing, reasoning trace can be inspected for memorization, and accuracy requirements make contamination detectable.",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Section 5 explicitly addresses static vs. dynamic benchmark decay: GAIA may decay via link rot or contamination; annual question refresh is proposed; questions deliberately avoid time-sensitive sources where possible.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Failure modes discussed include: no reasoning trace evaluation, reliance on time-varying closed-source APIs (plugins change regularly), potential web link rot, and English/cultural limitations reducing scope.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Code released at HuggingFace, scoring function provided, and five baselines (GPT-4, GPT-4 Turbo, AutoGPT, web search, human annotators) are reported with reproducible API-based evaluation for non-plugin conditions.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Appendix B provides a datacard following Bender & Friedman (2018) covering curation rationale, language variety, annotator demographics (age, gender, education), and file type distribution; question creation and validation protocols are detailed.",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "The paper states questions are released on HuggingFace but provides no explicit license terms; access conditions for the held-out 300-question answer set are not specified beyond the leaderboard structure.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "The paper specifies GAIA should evaluate multi-step tool-using AI assistants in zero-shot settings; explicitly states it should NOT be used as 'an absolute general proof' of AI success; limitations note English-only scope.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "Human respondents achieve 92% on GAIA while GPT-4 with plugins achieves ~15% overall.",
    204       "evidence": "Table 3 reports 92% human annotator success rate; Table 4 shows GPT-4 + plugins at 30.3%/9.7%/0% across levels yielding ~14.6% weighted average.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "GAIA's three difficulty levels correlate with AI model performance.",
    209       "evidence": "Table 4 and Figure 4 show consistent performance degradation from Level 1 to Level 3 for all evaluated models, validating the difficulty proxy of steps and tools.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Augmenting LLMs with tool access significantly improves GAIA performance.",
    214       "evidence": "GPT-4 without plugins scores 9.1% at Level 1 vs. 30.3% for GPT-4 + plugins; however plugins were manually selected per question (oracle estimate), inflating the apparent benefit.",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "GAIA is less gameable than MCQ benchmarks like MMLU because answers cannot be brute-forced.",
    219       "evidence": "Argued theoretically via design principles (factoid answers absent from pretraining data, multi-step completion, diverse action space); no empirical test of memorization resistance is provided.",
    220       "supported": "weak"
    221     },
    222     {
    223       "claim": "Current LLMs excel at tasks that are hard for humans but fail at tasks that are conceptually simple for humans.",
    224       "evidence": "GPT-4 passes the bar exam yet scores 0% on Level 3 GAIA questions; human performance of 92% on GAIA vs. 34.5% human non-specialist MMLU performance cited from existing work.",
    225       "supported": "moderate"
    226     },
    227     {
    228       "claim": "AutoGPT with GPT-4 backend performs disappointingly compared to GPT-4 without plugins.",
    229       "evidence": "Table 4 shows AutoGPT Level 2 at 0.4% vs. GPT-4's 2.6%; however the paper's text incorrectly states AutoGPT is worse 'even at Level 1' when Table 4 shows AutoGPT (14.4%) outperforming GPT-4 (9.1%) at Level 1.",
    230       "supported": "weak"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "benchmark-eval",
    235     "observational"
    236   ],
    237   "key_findings": "GAIA introduces a 466-question benchmark where humans achieve 92% but the best-performing evaluated AI system (GPT-4 with oracle-selected plugins) reaches only ~15% overall, revealing a substantial and surprising capability gap on conceptually simple real-world tasks. The benchmark deliberately inverts the prevailing trend of making tasks harder for humans, instead targeting multi-step factoid tasks requiring tool use, web browsing, and multi-modality that are trivial for humans but architecturally challenging for current AI. Three difficulty levels validated by annotator step counts and correlated with model performance confirm the difficulty gradient. The work demonstrates that LLMs' success on specialist benchmarks like MMLU does not transfer to basic assistant tasks requiring robust multi-step execution.",
    238   "red_flags": [
    239     {
    240       "flag": "Oracle plugin selection",
    241       "detail": "The headline AI result (GPT-4 + plugins at ~15%) was obtained by manually selecting optimal plugins per question — explicitly described as an 'oracle estimate' that 'cannot be reproduced exactly.' This is not a valid evaluation of autonomous AI assistant capability and inflates reported performance."
    242     },
    243     {
    244       "flag": "Internal factual inconsistency on AutoGPT",
    245       "detail": "Paper text states AutoGPT is disappointing 'even at Level 1 compared to GPT4 without plugins' but Table 4 shows AutoGPT (14.4%) outperforming GPT-4 (9.1%) at Level 1, directly contradicting the textual claim."
    246     },
    247     {
    248       "flag": "No benchmark license",
    249       "detail": "No explicit license or usage terms are provided for the released benchmark questions, creating legal uncertainty for researchers wanting to use, extend, or build upon GAIA."
    250     },
    251     {
    252       "flag": "Overstated construct validity",
    253       "detail": "Framed as a 'General AI Assistants' benchmark but restricted to English-language factoid multi-step retrieval; does not measure conversational ability, creative generation, or non-English capability despite the broad title."
    254     },
    255     {
    256       "flag": "No funding or competing interests disclosure",
    257       "detail": "Authors from Meta/FAIR and HuggingFace — institutions with direct competitive interests in the AI assistant landscape — do not disclose funding sources or potential conflicts of interest."
    258     }
    259   ],
    260   "cited_papers": [
    261     {
    262       "title": "On the Measure of Intelligence (Chollet, 2019)",
    263       "relevance": "Foundational motivation for GAIA's design philosophy — targeting conceptually simple tasks and measuring robust generalization rather than specialist expertise"
    264     },
    265     {
    266       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    267       "relevance": "Key saturated benchmark that GAIA explicitly positions itself against; illustrates the trend of benchmarks being rapidly saturated, which motivates GAIA's alternative approach"
    268     },
    269     {
    270       "title": "AgentBench: Evaluating LLMs as Agents",
    271       "relevance": "Direct competitor for multi-step LLM agent evaluation; GAIA distinguishes itself from AgentBench's closed-environment approach"
    272     },
    273     {
    274       "title": "ToolQA: A Dataset for LLM Question Answering with External Tools",
    275       "relevance": "Related tool-use evaluation benchmark that GAIA contrasts against for relying on repurposed existing datasets risking contamination"
    276     },
    277     {
    278       "title": "Holistic Evaluation of Language Models (HELM)",
    279       "relevance": "Aggregated benchmark compilation that GAIA critiques as lacking efficiency and reliability, motivating GAIA's smaller curated approach"
    280     },
    281     {
    282       "title": "WebGPT: Browser-assisted question-answering with human feedback",
    283       "relevance": "Foundational tool-augmented LLM work underpinning GAIA's web-browsing evaluation component"
    284     },
    285     {
    286       "title": "Levels of AGI: Operationalizing Progress on the Path to AGI (Morris et al., 2023)",
    287       "relevance": "Framework used to situate GAIA's milestone claims in the broader AGI progress taxonomy"
    288     },
    289     {
    290       "title": "Efficient Benchmarking of Language Models (Perlitz et al., 2023)",
    291       "relevance": "Motivates GAIA's preference for fewer high-quality questions over large aggregated benchmarks lacking reliability"
    292     }
    293   ],
    294   "engagement_factors": {
    295     "practical_relevance": {
    296       "score": 3,
    297       "justification": "Immediately usable via a live HuggingFace leaderboard with released questions and scoring code that any researcher can submit to."
    298     },
    299     "surprise_contrarian": {
    300       "score": 3,
    301       "justification": "Inverts the prevailing 'harder for humans = better benchmark' trend; the ~77pp human-AI gap on 'conceptually simple' tasks directly challenges the GPT-4-surpasses-humans narrative."
    302     },
    303     "fear_safety": {
    304       "score": 1,
    305       "justification": "Briefly mentions tool-use safety as a future direction but is not a safety-focused paper."
    306     },
    307     "drama_conflict": {
    308       "score": 2,
    309       "justification": "Shows GPT-4 (OpenAI, Meta's direct competitor) performing at 0% on Level 3 tasks humans find manageable, with implicit competitive framing."
    310     },
    311     "demo_ability": {
    312       "score": 3,
    313       "justification": "Benchmark is publicly available at HuggingFace with a live leaderboard where anyone can evaluate models against GAIA."
    314     },
    315     "brand_recognition": {
    316       "score": 3,
    317       "justification": "Co-authored by Yann LeCun (FAIR/Meta chief AI scientist) and Thomas Wolf (HuggingFace co-founder) with institutional backing from Meta and HuggingFace."
    318     }
    319   },
    320   "hn_data": {
    321     "threads": [
    322       {
    323         "hn_id": "38388990",
    324         "title": "Meta: Gaia - A Benchmark for General AI Assistants",
    325         "points": 36,
    326         "comments": 8,
    327         "url": "https://news.ycombinator.com/item?id=38388990",
    328         "created_at": "2023-11-23T03:43:15Z"
    329       },
    330       {
    331         "hn_id": "39143540",
    332         "title": "The Optimal Choice of Hypothesis Is the Weakest, Not the Shortest",
    333         "points": 5,
    334         "comments": 1,
    335         "url": "https://news.ycombinator.com/item?id=39143540",
    336         "created_at": "2024-01-26T15:14:52Z"
    337       },
    338       {
    339         "hn_id": "42413236",
    340         "title": "Fast and Efficient Memory Reclamation for Serverless MicroVMs",
    341         "points": 2,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=42413236",
    344         "created_at": "2024-12-13T23:18:15Z"
    345       },
    346       {
    347         "hn_id": "37985842",
    348         "title": "Eureka: Human-Level Reward Design via Coding Large Language Models",
    349         "points": 2,
    350         "comments": 0,
    351         "url": "https://news.ycombinator.com/item?id=37985842",
    352         "created_at": "2023-10-23T14:06:23Z"
    353       },
    354       {
    355         "hn_id": "37968009",
    356         "title": "Eureka: Human-Level Reward Design via Coding Large Language Models",
    357         "points": 2,
    358         "comments": 0,
    359         "url": "https://news.ycombinator.com/item?id=37968009",
    360         "created_at": "2023-10-21T16:09:03Z"
    361       },
    362       {
    363         "hn_id": "29504080",
    364         "title": "DeepMind's PolyViT: A multi-modal AI model",
    365         "points": 2,
    366         "comments": 0,
    367         "url": "https://news.ycombinator.com/item?id=29504080",
    368         "created_at": "2021-12-09T22:45:22Z"
    369       },
    370       {
    371         "hn_id": "38401986",
    372         "title": "Gaia: A Benchmark for General AI Assistants",
    373         "points": 1,
    374         "comments": 0,
    375         "url": "https://news.ycombinator.com/item?id=38401986",
    376         "created_at": "2023-11-24T09:17:33Z"
    377       },
    378       {
    379         "hn_id": "38011924",
    380         "title": "Eureka: Human level reward design via coding large language models",
    381         "points": 1,
    382         "comments": 0,
    383         "url": "https://news.ycombinator.com/item?id=38011924",
    384         "created_at": "2023-10-25T12:09:56Z"
    385       }
    386     ],
    387     "top_points": 36,
    388     "total_points": 51,
    389     "total_comments": 9
    390   }
    391 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs