scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28505B)
      1 {
      2   "paper": {
      3     "title": "GAIA: A Benchmark for General AI Assistants",
      4     "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Craig Swift", "Thomas Wolf", "Yann LeCun", "Thomas Scialom"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2311.12983",
      8     "doi": "10.48550/arXiv.2311.12983"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "GAIA introduces 466 questions for evaluating general AI assistants, where human respondents achieve 92% accuracy versus 15% for GPT-4 with plugins, which scores 0% on Level 3 questions. The benchmark targets conceptually simple but execution-heavy tasks requiring web browsing, multi-modality, and tool use. The difficulty levels (defined by number of steps and tools needed) correlate with model performance, validating the level design. Augmenting LLMs with tools significantly improves performance but still leaves a large gap to human performance.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a HuggingFace link (https://huggingface.co/gaia-benchmark) for the benchmark data and scoring function. The leaderboard and dev set of 166 questions are released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "166 annotated dev questions are released, and 300 questions are available via a leaderboard (answers withheld). The paper states 'We release our questions while retaining answers to 300 of them to power a leader-board.'"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details are provided for running the scoring function or reproducing the evaluation."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The evaluation procedure is described at a high level (zero-shot prompting with a system prompt) but no runnable scripts or detailed reproduction guide is given."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Table 4 shows ± notation for GPT-4 and GPT-4 Turbo (from 3 runs), but GPT-4+plugins and AutoGPT have no uncertainty measures. The human baseline has no uncertainty quantification. Partial reporting across only some conditions is insufficient."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used despite claims that GPT-4+plugins outperforms other models and that difficulty levels correlate with performance. All comparisons are based on raw number differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports absolute accuracy scores with baselines (e.g., '92% vs. 15%' for humans vs GPT-4, '30.3% vs 9.1%' for plugins vs no plugins at Level 1), providing enough context to assess magnitude of differences."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The benchmark has 466 questions but no justification for why this number is adequate. The paper acknowledges 'we preferred to favour quality over quantity' but does not justify whether 466 questions provides sufficient statistical power for the comparisons made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only GPT-4 and GPT-4 Turbo report variance from 3 runs (Table 4). GPT-4+plugins was run once manually. AutoGPT shows no variance. The human baseline has no variance measure. Inconsistent reporting across conditions."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares GPT-4, GPT-4 Turbo, AutoGPT, GPT-4+plugins, human annotators, and web search as baselines (Section 4, Table 4)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "GPT-4 and GPT-4 Turbo were the most capable models available at time of writing (Nov 2023). The paper acknowledges API access limitations for evaluating other models."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is performed. The paper does not test how removing specific question types, capabilities, or difficulty levels affects the benchmark's discriminative power."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only accuracy (exact match) is used as an evaluation metric. No other metrics such as partial credit, reasoning trace quality, or time-adjusted scores are computed, despite time data being collected."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Human annotators independently answered all questions as part of the validation phase (Section 3.4, Table 3). The 92% human accuracy serves as a key comparison point."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "300 questions have answers withheld for the leaderboard, and 166 are released as a dev set. The paper explicitly separates these: 'We release a developer set of 166 annotated questions and release the remaining 300 questions without annotations.'"
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by difficulty level (1, 2, 3) in Table 4 and by capability type (web browsing, coding, multi-modality, filetype reading) in Figure 5."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figures 9-11 show detailed examples of GPT-4 failures, including a puzzle failure (Figure 11) and a case where lack of web access prevents answering (Figure 9). Section 4 discusses AutoGPT's disappointing performance."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "AutoGPT's poor performance relative to even base GPT-4 is reported and discussed: 'AutoGPT4, which allows GPT4 to automatically use tools, offer disappointing results.' GPT-4 gets 0% on Level 3."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 92% human vs 15% GPT-4 performance, which is supported by Table 4. The claim that questions are 'conceptually simple for humans yet challenging for most advanced AIs' is supported by the 92% human baseline."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims like 'augmenting LLMs via tool APIs or access to the web improves answer accuracy' based on comparing GPT-4 with and without plugins, but this is confounded by plugin selection being done manually per question (an oracle setting the authors acknowledge)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'General AI Assistants', and although Section 6 acknowledges the benchmark is English-only and web-centric, the abstract and introduction frame GAIA as a milestone for AGI ('the advent of Artificial General Intelligence hinges on a system's capability to exhibit similar robustness'), which significantly overstates the tested scope."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for the human-AI gap. For example, the gap could be partly due to prompt engineering, plugin selection, or the specific types of questions chosen rather than fundamental capability differences."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper frames exact-match accuracy on 466 questions as measuring 'general AI assistant' capability and links it to AGI milestones, without discussing what 'general' means beyond the specific question types tested. The proxy gap between 'answers factoid questions correctly' and 'is a general AI assistant' is not acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper refers to 'GPT-4' and 'GPT-4 Turbo' without specific version identifiers or API snapshot dates. AutoGPT's git hash is provided (ed172dec) but the underlying GPT-4 version is not specified."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The full system prompt used for evaluation is provided in Figure 2, including exact formatting instructions for answers."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max tokens, or other API parameters are reported for any of the model evaluations."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "AutoGPT is used but its scaffolding is described only as 'able to do this selection automatically' without detail on retry logic, memory management, or tool selection strategy. GPT-4+plugins setup is described loosely ('we often rely on (i) a tool for reading various types of links, (ii) a web browsing tool, and (iii) a tool for computation')."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The question creation and validation pipeline is well documented in Section 3.4 and Appendix D, with statistics on validation outcomes (Table 3): 68% valid as-is, the rest corrected or removed."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 'Limitations' is a dedicated section covering missing evaluations, cost of unambiguous question design, and lack of linguistic/cultural diversity."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6 discusses specific threats: evidence changing over time on the web, reliance on English-only questions (80% of world population excluded), inability to reproduce GPT-4+plugins results due to plugin instability, and the cost and difficulty of designing unambiguous questions."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6 explicitly states: 'GAIA is only a first step to estimate the potential of AI assistants, but should not be seen as an absolute general proof of their success.' It also notes the benchmark does not test actions beyond clicks, and does not evaluate reasoning traces."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The dev set of 166 questions with annotations is released via HuggingFace, allowing independent verification of the question design and human baseline."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.4 and Appendix D describe the question creation process in detail: authors created initial questions, gave examples and instructions to Surge AI annotators, who created more. Validation required two independent annotators per question."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Annotators were from Surge AI (footnote 5) and demographics are reported in Appendix B, but how they were recruited or selected from Surge AI's pool is not described. No discussion of potential selection bias in annotator recruitment."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from question creation → validation by 2 annotators → repair/removal is documented with statistics: 623 questions created, 68% valid as-is, rest repaired or removed, yielding 466 final questions (Table 3)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources are disclosed. No acknowledgment of grants, corporate funding for annotators, or research support beyond the author affiliations."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: FAIR/Meta, HuggingFace, AutoGPT, GenAI/Meta. These are prominent in the header."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Authors are from Meta and HuggingFace, both with commercial interests in AI assistants. Meta competes with OpenAI; the benchmark primarily evaluates OpenAI's GPT-4. The funder independence is not discussed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. Authors from Meta and HuggingFace have clear commercial interests in the AI assistant space that are not formally declared."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "GPT-4's training cutoff is not stated. The paper discusses contamination risk conceptually but does not specify when any model's training data ends."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 3.1 extensively discusses contamination mitigation: answers are 'absent by design in plain text from current pre-training data,' questions require multi-step execution rather than memorization, and reasoning traces can be checked. Section 5 further discusses contamination risk."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The benchmark is explicitly designed to resist contamination: 'the accuracy required in the answers, their absence from pre-training data, and the possibility to check the reasoning trace mitigate this risk' (Section 3.1). The benchmark is new, so questions were not in training data at time of release."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The study uses paid human annotators (Surge AI) to create and validate questions. No pre-registration is mentioned."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No IRB or ethics board approval is mentioned despite using paid human annotators for question creation and validation."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Appendix B reports annotator demographics: age distribution (17% 18-25, 39% 26-35, etc.), gender (57% male, 43% female), and academic background (61% bachelor's, 26% master's, 17% PhD)."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No inclusion or exclusion criteria for annotator selection are described. The paper notes annotators were 'based in the US' but does not explain selection criteria."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "This is not an experimental study with treatment/control conditions requiring randomization. Annotators all performed the same task."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Blinding is not applicable to this study design. Validation annotators answered questions independently, which is a form of blinding to the original answer, but this is inherent to the design rather than a blinding protocol."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No information on annotator attrition or dropout is provided. The paper does not state how many annotators started vs. finished the task."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs or token consumption are reported for any model evaluation. Time-to-answer is reported (Table 4) but monetary cost is not."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget is stated for the evaluation. The paper mentions running GPT-4 API three times for variance but does not quantify the total compute or cost."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Only GPT-4 and GPT-4 Turbo are run 3 times. No seed sensitivity analysis is performed. The ± values in Table 4 show some variance but this is not framed as seed sensitivity."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper states 'Whenever an API is available, we run the model three times and report the average results' (Section 4). GPT-4+plugins was manually tested once."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "The paper evaluates models in zero-shot with a fixed prompt; no hyperparameter search is performed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "For GPT-4+plugins, the authors manually selected plugins per question ('according to our best guess of the most important capabilities'), creating an oracle setting. This is acknowledged but not justified as a fair comparison methodology."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors created the benchmark and evaluated the models. No discussion of author-evaluation bias, such as whether the question design inadvertently favors or disfavors certain systems."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "AutoGPT uses significantly more time (7.6-11.7 min) than GPT-4 API (0.12-0.24 min) but performance comparison does not account for this compute difference."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 3.1 discusses at length what GAIA measures and why, and Section 5 discusses what a system solving GAIA would represent (t-AGI). The paper positions GAIA's design philosophy against existing benchmarks' shortcomings. The difficulty levels are validated against model performance (Section 4)."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "GPT-4+plugins uses manually selected plugins per question (oracle setting), whereas AutoGPT uses automatic selection. The scaffold difference is acknowledged but not controlled: 'our score for GPT4 with plugins is an \"oracle\" estimate' — the confound between model capability and scaffold quality is not resolved."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The benchmark is newly created (Nov 2023) with answers designed to be absent from pre-training data. The paper explicitly addresses this: 'the resulting answer is absent by design in plain text from current pre-training data.'"
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup (e.g., the system prompt, question phrasing) could leak information about the answer format or expected approach."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether questions share structural similarities, common sources, or patterns that could be exploited. Questions may share information sources (e.g., Wikipedia, arXiv) that could create dependencies."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method is applied. The paper discusses contamination resistance by design but does not use canary strings, membership inference, or overlap analysis."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Human respondents achieve 92% accuracy on GAIA versus 15% for GPT-4 equipped with plugins",
    365       "evidence": "Table 4 shows human accuracy of 93.9%/91.8%/87.3% across levels. GPT-4+plugins achieves 30.3%/9.7%/0%. The 15% figure appears to be an approximate overall average. (Section 4, Table 4)",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "GAIA's difficulty levels correlate with model performance",
    370       "evidence": "Table 4 shows a monotonic decrease in model performance from Level 1 to Level 3 for all models. All models achieve 0% on Level 3. (Section 4, Table 4, Figure 4)",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Augmenting LLMs with tools improves answer accuracy on GAIA",
    375       "evidence": "GPT-4 alone: 9.1% Level 1. GPT-4+plugins: 30.3% Level 1. However, plugin selection was manual/oracle, making this an upper bound rather than a fair comparison. (Section 4, Table 4)",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "GAIA is robust against memorization and data contamination",
    380       "evidence": "The paper argues questions require multi-step execution with answers absent from plain text on the internet, and reasoning traces can be checked. This is argued by design rather than empirically verified. (Section 3.1)",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "A system solving GAIA would represent a milestone toward AGI",
    385       "evidence": "The paper links GAIA to t-AGI and Morris et al. (2023) levels framework, but this is a framing argument rather than an empirical claim. No evidence that GAIA comprehensively covers AGI capabilities. (Section 1, Section 5)",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Oracle plugin selection",
    392       "detail": "GPT-4+plugins results used human-selected plugins per question, making this an oracle upper bound rather than a reproducible evaluation. The paper acknowledges this but still uses it as a primary comparison point."
    393     },
    394     {
    395       "flag": "Conflict of interest: Meta evaluating OpenAI",
    396       "detail": "Authors from Meta (a direct competitor) primarily evaluate OpenAI's GPT-4. While the benchmark itself is model-agnostic, the initial evaluation results only cover a competitor's product."
    397     },
    398     {
    399       "flag": "AGI framing exceeds evidence",
    400       "detail": "The paper repeatedly frames GAIA as an AGI benchmark, claiming 'the advent of Artificial General Intelligence hinges on a system's capability to exhibit similar robustness as the average human does on such questions.' This is a very strong claim for a 466-question English-only benchmark."
    401     },
    402     {
    403       "flag": "Non-reproducible evaluation",
    404       "detail": "The paper explicitly states that GPT-4+plugins scores 'cannot be reproduced exactly' due to changing plugins. The evaluation relies on closed-source APIs that may change behavior over time (acknowledged in Section 5)."
    405     },
    406     {
    407       "flag": "No statistical tests for any comparison",
    408       "detail": "All claims of performance differences between models are based solely on comparing raw numbers without any statistical testing, despite having multiple runs for some conditions."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "AgentBench: Evaluating LLMs as Agents",
    414       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    415       "year": 2023,
    416       "relevance": "Benchmark for evaluating LLM agents in closed-box environments, a direct comparison point for GAIA's open-world evaluation approach."
    417     },
    418     {
    419       "title": "On the Measure of Intelligence",
    420       "authors": ["François Chollet"],
    421       "year": 2019,
    422       "relevance": "Foundational work on measuring AI intelligence that influenced GAIA's design philosophy of testing fundamental abilities over specialized skills."
    423     },
    424     {
    425       "title": "Augmented Language Models: A Survey",
    426       "authors": ["Grégoire Mialon", "Roberto Dessì", "Maria Lomeli"],
    427       "year": 2023,
    428       "relevance": "Survey of tool-augmented LLMs directly relevant to understanding the agentic capabilities GAIA evaluates."
    429     },
    430     {
    431       "title": "Levels of AGI: Operationalizing Progress on the Path to AGI",
    432       "authors": ["Meredith Ringel Morris", "Jascha Sohl-dickstein", "Noah Fiedel"],
    433       "year": 2023,
    434       "relevance": "Framework for measuring AGI progress that GAIA positions itself within."
    435     },
    436     {
    437       "title": "Holistic Evaluation of Language Models",
    438       "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"],
    439       "year": 2022,
    440       "relevance": "HELM provides comprehensive LLM evaluation; GAIA positions itself as addressing HELM's limitations around tool use and real-world grounding."
    441     },
    442     {
    443       "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models",
    444       "authors": ["Aarohi Srivastava", "Abhinav Rastogi", "Abhishek Rao"],
    445       "year": 2023,
    446       "relevance": "BIG-Bench aggregated benchmark that GAIA contrasts with by focusing on fewer, higher-quality questions."
    447     },
    448     {
    449       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    450       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    451       "year": 2023,
    452       "relevance": "Model-based evaluation approach that GAIA avoids by using factual exact-match scoring."
    453     },
    454     {
    455       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    456       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    457       "year": 2023,
    458       "relevance": "Key work on LLM tool use, a core capability that GAIA benchmarks."
    459     },
    460     {
    461       "title": "Measuring Massive Multitask Language Understanding",
    462       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    463       "year": 2021,
    464       "relevance": "MMLU benchmark that GAIA positions against as being near-saturated and susceptible to contamination."
    465     },
    466     {
    467       "title": "Efficient Benchmarking (of Language Models)",
    468       "authors": ["Yotam Perlitz", "Elron Bandel", "Ariel Gera"],
    469       "year": 2023,
    470       "relevance": "Discusses efficiency and reliability of LLM benchmarks, a concern GAIA addresses through curated high-quality questions."
    471     },
    472     {
    473       "title": "Gorilla: Large Language Model Connected with Massive APIs",
    474       "authors": ["Shishir G. Patil", "Tianjun Zhang", "Xin Wang"],
    475       "year": 2023,
    476       "relevance": "API-calling benchmark for LLMs; GAIA contrasts by not specifying possible APIs."
    477     }
    478   ]
    479 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs