scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28038B)
      1 {
      2   "paper": {
      3     "title": "BrowserArena: Evaluating LLM Agents on Real-World Web Navigation Tasks",
      4     "authors": [
      5       "Sagnik Anupam",
      6       "Davis Brown",
      7       "Shuo Li",
      8       "Eric Wong",
      9       "Hamed Hassani",
     10       "Osbert Bastani"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2510.02418"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper states 'We have open-sourced our codebase at https://github.com/sagnikanupam/browserarena' in Section 1."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "While the paper describes constructed datasets (220 Expedia tasks, 80 BBC tasks, 100 TriviaQA questions), there is no explicit link to download these datasets. TriviaQA itself is public, but the generated task datasets and collected user annotations do not appear to be released."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions using BrowserUse with Playwright and the OpenRouter API platform, but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions are provided. While the GitHub repository is linked, the paper itself does not include a 'Reproducing Results' section or describe how to replicate the experiments."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper reports 95% confidence intervals for the bootstrapped ELO ratings (Figure 2c, Appendix B: 'with 95% confidence intervals being calculated by bootstrapping for 100 rounds')."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper makes comparative claims about models (e.g., R1 has highest ELO, o4-mini uses more strategies) but does not perform statistical significance tests. Differences are reported as raw numbers and percentages without p-values or statistical tests."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper reports raw percentages (e.g., strategy usage rates, agreement rates) but does not report standardized effect sizes like Cohen's d or odds ratios to quantify the magnitude of differences between models."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The sample sizes (109 tasks, 25 tasks for evaluator agreement, 220/80/100 tasks for failure mode studies) are not justified through power analysis or any formal reasoning about adequacy. No discussion of whether 109 battles are sufficient for reliable ELO estimation."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper reports single-run results for each model on the failure mode datasets. No variance across runs, standard deviations, or spread measures are reported for the strategy usage percentages or model performance."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper compares five different LLM models (DeepSeek R1, Claude 3.7 Sonnet, Llama-4-Maverick, o4-mini, Gemini 2.5-Pro) against each other using pairwise comparisons, and also compares VLM judges against human annotators."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "All five evaluated models (DeepSeek R1, Claude 3.7 Sonnet, Llama-4-Maverick, o4-mini, Gemini 2.5-Pro) are contemporary frontier models as of 2025."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Section 4.4 describes input ablations for VLM-as-a-judge: 'trace-only evaluation improves GPT-4o's agreement with the baseline annotations by 10 percentage points (79% vs. 68% with GIFs and traces), while GIF-only input collapses performance to 48%.' This ablation examines the contribution of different input modalities."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses multiple evaluation approaches: average win rate, ELO ratings, pairwise win fractions, inter-annotator agreement, and per-failure-mode strategy usage rates."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Human evaluation is central to this work. Users vote on model outputs (Section 4.1), additional annotators evaluate a subset for inter-annotator agreement (Section 4.3), and step-level human annotations are collected for failure mode analysis (Section 5)."
     91       },
     92       "held_out_test_set": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "This paper is a benchmark/platform evaluation, not a model training study. There is no training phase that would necessitate a held-out test set."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper provides per-model breakdowns for all analyses: pairwise win fractions (Figure 2d), per-model strategy usage rates for captcha solving (Table 3), per-model pop-up banner scenarios (Table 4), and per-model direct navigation actions (Table 5)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The entire Section 5 is dedicated to identifying and analyzing failure modes (captcha resolution, pop-up banner closure, direct navigation). Specific failure examples are discussed, such as R1 never detecting pop-up banners due to lacking multimodal capabilities."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports several negative findings: VLMs are unreliable judges (Section 4.4), multimodality hurts GPT-4o judge reliability, DeepSeek R1 cannot detect pop-up banners and 'consistently misrepresents its ability to close pop-up banners', and Gemini 2.5 fails 50% of TriviaQA tasks."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims about three failure modes (captcha, pop-ups, direct navigation) are supported by Sections 5.2-5.4. The claim about o4-mini deploying a wider variety of strategies is supported by Table 3. The claim about DeepSeek-R1 misleading users about pop-up closure is supported by Table 4 (0% banner detection but 53.75% marked as completed)."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims about R1's failure to detect banners being due to its lack of multimodal capabilities ('indicating that multi-modal reasoning ability is required for detecting the privacy policy pop-up'). This is justified by R1 being the only non-multimodal model and showing 0% banner detection. The VLM ablation study (Section 4.4) also supports causal claims about modality contributions through controlled removal of input components."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The Limitations section (Section 7) explicitly bounds generalization: results depend on the BrowserUse system, failure modes may be system-specific, and specific tasks that trigger failure modes 'may be different depending on the system configuration.' The paper does not overclaim beyond its tested setting."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper does not discuss alternative explanations for its main findings. For example, R1's high completion rate despite not detecting banners could be due to factors other than lack of multimodal capabilities (e.g., different optimization pressures, instruction-following differences). The ELO ranking differences could be influenced by task distribution biases. No systematic consideration of confounds is provided."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper lists models as 'DeepSeek R1, Anthropic Claude 3.7 Sonnet:Thinking, Meta Llama-4-Maverick, OpenAI o4-mini, and Google Gemini 2.5-Pro-Preview-03-25.' While Gemini includes a preview date, the others lack specific version strings, API snapshot dates, or model IDs. Marketing names like 'o4-mini' and 'Claude 3.7 Sonnet:Thinking' do not include snapshot dates."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Full prompts are provided in the appendices: task generation prompts for captcha (Appendix D) and pop-up banner tasks (Appendix E), the captcha analysis judge prompt (Appendix F), and the pop-up banner judge prompt (Appendix G). The system prompts and user prompts are given verbatim."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper does not report temperature, top-p, or other sampling parameters for the LLMs used in the agent evaluations or for the o4-mini judge. The only hyperparameters mentioned are clustering parameters for failure mode discovery (Appendix I) and 'temperature = 0' for GPT-4o feature evaluation, but the main experiment settings are absent."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The BrowserUse scaffolding is described in detail: Section 3 explains that the agent receives the task, previous steps, URL, open tabs, HTML elements with indices, and optionally screenshots; it outputs JSON with goal self-evaluation, memory, next goal, and actions. Table 1 in Appendix C lists all 22 permitted actions."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.1 documents data collection and filtering: 213 valid responses approved from Prolific, filtered to 109 responses from 98 users 'due to system outages, logging issues, and invalid responses.' The collection was done in 3 batches with described compensation. Task generation for failure mode studies is documented in Appendices D and E."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 7 is a dedicated 'Limitations' section that discusses two substantive limitations: dependence on the BrowserUse system and system-specificity of discovered failure modes."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The limitations are specific to this study: 'equipping models with different or more powerful capabilities may help improve agent capabilities' and 'the failure modes we discover may be system specific... it may be possible to reduce the likelihood of encountering captchas on a particular website by using rotating proxies.' These are study-specific threats, not generic disclaimers."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "While the Limitations section discusses system-specific concerns, the paper does not explicitly state what its results do NOT show. For example, it does not explicitly state that the ELO rankings are not generalizable beyond BrowserUse, or that the failure modes do not represent an exhaustive taxonomy. The scope boundaries are implicit rather than explicit."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The raw user votes, step-level annotations, agent traces, and generated task datasets are not made available for independent verification. Only aggregate statistics are reported."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.1 describes the data collection procedure in detail: Prolific recruitment with geographic criteria, approval rate filters, 3 batches, compensation rates, task submission and annotation workflow. Section 4.3 describes the evaluator agreement study collection. Appendices D-G describe the failure mode dataset generation."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 4.1 states: 'We recruit users on Prolific from United Kingdom, United States, Australia, Canada, and New Zealand with response approval rates between 90-100%.' Compensation details are in Appendix H. The recruitment channels and selection criteria are clearly described."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The data pipeline is documented: 213 valid responses → 109 kept (after filtering for system outages, logging issues, invalid responses) from 98 users. The failure mode analysis pipeline is described in Section 5.1 with clustering methods. The judge evaluation pipeline for each failure mode is described in detail."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding sources are disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All authors are listed as affiliated with the University of Pennsylvania. Since no author is affiliated with any of the companies whose models are evaluated (OpenAI, Anthropic, Meta, Google, DeepSeek), there is no vendor conflict to disclose beyond the listed affiliations."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Since funding is not disclosed, it cannot be determined whether the funder is independent of the outcome. The paper evaluates models from multiple major AI companies but does not state who funded the work."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is included in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper evaluates five LLMs on various tasks including TriviaQA questions, but does not state the training data cutoff dates for any of the models. This is relevant because TriviaQA answers could be in training data."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "The paper uses TriviaQA (published 2017) questions to test agents but does not discuss whether these questions and answers appear in the training data of the tested models. Models trained after 2017 very likely have seen TriviaQA content."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "TriviaQA was published in 2017 and is widely known; all tested models were likely trained on data including TriviaQA content. The paper does not address this contamination risk for the direct navigation experiment (Section 5.4)."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No pre-registration is mentioned for the user study. No link to OSF, AsPredicted, or any pre-registration platform is provided."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No IRB or ethics board approval is mentioned despite the study collecting data from human participants via Prolific."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "Participant demographics beyond geographic eligibility (UK, US, Australia, Canada, New Zealand) and approval rates (90-100%) are not reported. No information on age, gender, technical experience, or other demographic characteristics."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 4.1 specifies: participants from UK, US, Australia, Canada, and New Zealand with 90-100% approval rates on Prolific. Specific task submission criteria are described (interactive tasks only, not search tasks). Examples of valid and invalid tasks are given in Appendix A."
    260       },
    261       "randomization_described": {
    262         "applies": true,
    263         "answer": true,
    264         "justification": "Section 3 describes: 'two LLMs are chosen at random with uniform probability.' Users interact with randomly paired models. For the evaluator agreement study, 25 tasks are 'randomly selected' (Section 4.3)."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "The BrowserArena platform follows the Chatbot Arena design where users compare anonymous agents (Model A vs. Model B) without knowing which LLM powers each. This is implicit in the design (Figure 1 shows 'Model A' and 'Model B' labels), consistent with arena-style blinding."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section 4.1 reports attrition: 'We approved a total of 213 valid responses, ultimately keeping 109 responses from 98 users due to system outages, logging issues, and invalid responses.' The reasons for dropout are provided."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "The paper does not report API costs, token usage, or latency for running the BrowserUse agents across the evaluated models. This is relevant given that each task involves multiple API calls per agent."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "The total computational budget (API spend, GPU hours for local models, or infrastructure costs) is not stated. Only participant compensation costs are partially described in Appendix H."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "DeepSeek R1 achieves the highest ELO rating among the five tested models on user-submitted web browsing tasks, despite being the only model without multimodal capabilities.",
    293       "evidence": "Figure 2 shows R1 with ELO rating 1168 (highest), average win rate 0.74, compared to Llama-4 (1117), o4-mini (969), Claude 3.7 (907), Gemini 2.5 (840). Based on 109 user-submitted tasks (Section 4.2).",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "There is a significant gap between VLM preferences and human preferences for evaluating web agent outputs.",
    298       "evidence": "Section 4.4 and Figure 3 show human-human agreement at 63.2%, GPT-4o agreement with baseline at 68%, o4-mini at 58%. When removing ties, human agreement rises to 100% and inter-annotator to 83%, but VLM results are not reported without ties.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Adding visual GIF inputs hurts GPT-4o's judge reliability compared to trace-only evaluation.",
    303       "evidence": "Section 4.4: 'trace-only evaluation improves GPT-4o's agreement with the baseline annotations by 10 percentage points (79% vs. 68% with GIFs and traces), while GIF-only input collapses performance to 48%.' Based on 25 randomly selected tasks.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "o4-mini deploys a wider variety of captcha circumvention strategies than other models.",
    308       "evidence": "Table 3 in Appendix J shows o4-mini uses all 14 listed strategies at least once, including 3 strategies no other model uses (Text-only Rendering, Public Proxy, Internet Archive). Based on 220 Expedia tasks (Section 5.2).",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "DeepSeek R1 consistently misrepresents its ability to close pop-up banners, marking tasks as completed without detecting the banners.",
    313       "evidence": "Table 4: R1 shows 0% banner detection rate but 53.75% task completion rate, the highest of all models. This is attributed to R1's lack of multimodal capabilities (Section 5.3).",
    314       "supported": "strong"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval",
    319     "qualitative"
    320   ],
    321   "key_findings": "BrowserArena introduces a live open-web evaluation platform using Arena-style pairwise comparisons and step-level human feedback to evaluate LLM web agents. DeepSeek R1 achieves the highest ELO rating despite lacking multimodal capabilities, while VLM judges show significant disagreement with human preferences. Three consistent failure modes are identified (captcha solving, pop-up banner closure, direct navigation), with notable differences in how models handle them -- o4-mini uses the widest variety of captcha avoidance strategies, while R1 cannot detect pop-up banners but falsely marks tasks as completed at the highest rate.",
    322   "red_flags": [
    323     {
    324       "flag": "Very small sample size for leaderboard",
    325       "detail": "The ELO leaderboard is based on only 109 battles across 5 models, leaving very few battles per model pair (6-19, as shown in Figure 2b). This is far fewer than the thousands of battles typically used in Chatbot Arena, making the rankings unreliable. The confidence intervals on ELO ratings are wide and overlapping."
    326     },
    327     {
    328       "flag": "LLM-as-judge for failure mode analysis",
    329       "detail": "The failure mode quantification in Sections 5.2-5.4 relies on o4-mini as a judge to classify agent traces. The reliability of o4-mini as a judge is not validated against human annotations for these specific classification tasks, despite the paper's own finding that VLMs are unreliable judges (Section 4.4)."
    330     },
    331     {
    332       "flag": "Synthetically generated evaluation datasets",
    333       "detail": "The failure mode evaluation datasets (220 Expedia tasks, 80 BBC tasks) are largely generated by GPT-4.1, not written by real users. Only 20 of 220 captcha tasks came from human templates. The ecological validity of LLM-generated tasks as proxies for real user behavior is not established."
    334     },
    335     {
    336       "flag": "No statistical significance testing",
    337       "detail": "All comparative claims between models are based on raw percentage differences without any significance tests. With the small sample sizes, many observed differences could easily be due to random variation."
    338     },
    339     {
    340       "flag": "Contamination risk for TriviaQA experiment",
    341       "detail": "Section 5.4 uses 100 TriviaQA questions (published 2017) to study direct navigation behavior. All evaluated models were trained well after 2017 and likely have TriviaQA content in their training data. The paper does not acknowledge this, which affects the interpretation of 'direct answering' vs. 'search' behaviors."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    347       "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"],
    348       "year": 2024,
    349       "relevance": "Foundational platform for pairwise human evaluation of LLMs, which BrowserArena builds upon and extends to web agent evaluation."
    350     },
    351     {
    352       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    353       "authors": ["Shuyan Zhou", "Frank F. Xu", "Hao Zhu"],
    354       "year": 2023,
    355       "arxiv_id": "2307.13854",
    356       "relevance": "Key benchmark for web agent evaluation using self-hosted website clones, which BrowserArena aims to improve upon with open-web evaluation."
    357     },
    358     {
    359       "title": "VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks",
    360       "authors": ["Jing Yu Koh", "Robert Lo", "Lawrence Jang"],
    361       "year": 2024,
    362       "arxiv_id": "2401.13649",
    363       "relevance": "Extension of WebArena for visually-grounded web tasks, relevant to evaluation of multimodal web agents."
    364     },
    365     {
    366       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    367       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    368       "year": 2024,
    369       "relevance": "Open platform for AI agents in software engineering, demonstrating the broader trend of open-ended agent evaluation environments."
    370     },
    371     {
    372       "title": "Copilot Arena: A Platform for Code LLM Evaluation in the Wild",
    373       "authors": ["Wayne Chi", "Valerie Chen", "Anastasios N. Angelopoulos"],
    374       "year": 2025,
    375       "arxiv_id": "2502.09328",
    376       "relevance": "Arena-style evaluation platform for code LLMs, parallel methodology to BrowserArena applied to code generation."
    377     },
    378     {
    379       "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
    380       "authors": ["Tianbao Xie", "Danyang Zhang", "Jixuan Chen"],
    381       "year": 2024,
    382       "relevance": "Benchmark for evaluating multimodal agents in real computer environments, relevant to understanding agent capabilities beyond web browsing."
    383     },
    384     {
    385       "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents",
    386       "authors": ["Jason Wei", "Zhiqing Sun", "Spencer Papay"],
    387       "year": 2025,
    388       "arxiv_id": "2504.12516",
    389       "relevance": "Recent challenging QA benchmark for web browsing agents that evaluates hard information retrieval tasks."
    390     },
    391     {
    392       "title": "Survey on Evaluation of LLM-Based Agents",
    393       "authors": ["Asaf Yehudai", "Lilach Eden", "Alan Li"],
    394       "year": 2025,
    395       "arxiv_id": "2503.16416",
    396       "relevance": "Survey covering evaluation methodologies for LLM-based agents, directly relevant to understanding the landscape of agent benchmarks."
    397     },
    398     {
    399       "title": "The BrowserGym Ecosystem for Web Agent Research",
    400       "authors": ["Thibault Le Sellier De Chezelles", "Maxime Gasse", "Alexandre Drouin"],
    401       "year": 2024,
    402       "arxiv_id": "2412.05467",
    403       "relevance": "Open-web evaluation ecosystem for web agents that BrowserArena extends with human preference evaluation."
    404     },
    405     {
    406       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    407       "authors": ["Hongliang He", "Wenlin Yao", "Kaixin Ma"],
    408       "year": 2024,
    409       "arxiv_id": "2401.13919",
    410       "relevance": "End-to-end web agent using multimodal models, relevant benchmark for evaluating web navigation capabilities."
    411     },
    412     {
    413       "title": "Search Arena: Analyzing Search-Augmented LLMs",
    414       "authors": ["Mihran Miroyan", "Tsung-Han Wu", "Logan King"],
    415       "year": 2025,
    416       "relevance": "Extension of arena-style evaluation to search-augmented LLMs, complementary to BrowserArena's web agent evaluation approach."
    417     },
    418     {
    419       "title": "Windows Agent Arena: Evaluating Multi-Modal OS Agents at Scale",
    420       "authors": ["Rogerio Bonatti", "Dan Zhao", "Francesco Bonacci"],
    421       "year": 2024,
    422       "arxiv_id": "2409.08264",
    423       "relevance": "Benchmark for multimodal OS-level agents, extending agent evaluation beyond web browsing to general computer use."
    424     }
    425   ]
    426 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs