scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33210B)
      1 {
      2   "paper": {
      3     "title": "Mind2Web: Towards a Generalist Agent for the Web",
      4     "authors": [
      5       "Xiang Deng",
      6       "Yu Gu",
      7       "Boyuan Zheng",
      8       "Shijie Chen",
      9       "Samuel Stevens",
     10       "Boshi Wang",
     11       "Huan Sun",
     12       "Yu Su"
     13     ],
     14     "year": 2023,
     15     "venue": "Neural Information Processing Systems",
     16     "arxiv_id": "2306.06070",
     17     "doi": "10.48550/arXiv.2306.06070"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Mind2Web introduces the first large-scale dataset (2,350 tasks, 137 websites, 31 domains) for developing and evaluating generalist web agents on real-world websites. The proposed MINDACT framework, which uses a small LM (DeBERTa) for candidate element filtering followed by an LLM for action prediction via multi-choice QA, achieves 52.0% step success rate on seen websites but drops to ~39% on unseen websites/domains, revealing that generalization challenges stem from website design diversity rather than domain differences. GPT-4 with only 3-shot in-context learning approaches fine-tuned Flan-T5 on cross-website/domain settings (comparable element selection, but a lower step success rate of 30.1%/26.4% vs. 38.9%/39.6%, measured on only 50 tasks per setting), suggesting promising LLM-based paths toward generalist web agents.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Code released under MIT License at https://github.com/OSU-NLP-Group/Mind2Web, as documented in the artifacts table in the Supplementary (Section A)."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Training data released on HuggingFace (CC BY 4.0) at https://huggingface.co/datasets/osunlp/Mind2Web, and test data available via a separate link with password. Both listed in the artifacts table."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions hardware (4×A100 80GB, A6000 48GB) and libraries (Sentence-Transformers, Transformers, Playwright) but does not provide a requirements.txt, Dockerfile, or environment specification listing library versions."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "While code and data are released, the paper itself does not contain step-by-step reproduction instructions. Hyperparameters are listed in Table 4 and model links are provided, but no explicit 'how to reproduce' guide is given in the paper."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The main results in Table 2 report only point estimates with no confidence intervals or error bars. Table 5 in the appendix reports mean ± std over 5 seeds for an auxiliary experiment, but the primary results table has no uncertainty quantification."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests are reported. All comparisons between methods (e.g., MINDACT vs. baselines, GPT-4 vs. Flan-T5) are based solely on comparing raw numbers without any hypothesis testing."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports absolute performance numbers with baselines for context (e.g., 'over 10% absolute gap in step SR between Cross-Task and Cross-Website settings'). Table 2 provides all raw numbers enabling direct comparison of magnitudes."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for the number of tasks (2,350), websites (137), or the GPT-4 evaluation subset (50 tasks per setting). No power analysis is discussed."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Main results (Table 2) are point estimates reported without variance; the paper does not state whether they come from a single run or an average over seeds. Table 5 in the appendix reports std dev over 5 seeds for Flan-T5 models as an auxiliary experiment, but the primary reported results lack any spread measure."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 2 compares MINDACT against a classification baseline (DeBERTa for element selection) and a generation baseline (Flan-T5 autoregressive generation), plus GPT-3.5 and GPT-4 via in-context learning."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include DeBERTa-v3, Flan-T5 (2022), GPT-3.5-turbo, and GPT-4 — all state-of-the-art or near-state-of-the-art at the time of submission. The classification approach mirrors prior work [14, 35]."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The two-stage pipeline is effectively ablated: classification-only baseline tests candidate generation alone, generation baseline tests direct autoregressive prediction, and MINDACT combines both stages. Table 2 shows the contribution of each component. Multi-choice vs direct generation is also compared (Figure 5)."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Four metrics are used: Element Accuracy, Operation F1, Step Success Rate, and (whole-task) Success Rate. These capture different aspects of performance (element selection, operation correctness, per-step, and end-to-end)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "All evaluation is automated by comparing predicted actions against ground-truth annotation. No human evaluation of the system's outputs is conducted. The paper acknowledges offline evaluation limitations in Section 6."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Three held-out test sets with clear separation: TestCross-Task (random 20% split, 252 tasks), TestCross-Website (held-out websites, 177 tasks), TestCross-Domain (held-out domains, 912 tasks). Section 4.1 describes the splits."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 6 provides per-website step success rate breakdowns across all three evaluation settings. Table 2 also reports results separately for each of the three generalization levels."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 4.3 discusses failure modes: GPT-3.5's propensity to select the 'None' option, the overall low task success rate even for the best models, and the challenge of grounding commonsense knowledge into specific website actions."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Several negative results: GPT-3.5 underperforms fine-tuned smaller models (Table 2), the generation baseline underperforms classification despite larger model size, zero-shot Flan-T5XL 'fails to perform the task without fine-tuning' (Table 6, Appendix D.2), and overall task success rate is very low (≤7.1%)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims are supported: 'first dataset' for generalist web agents (Table 1 comparison), 'diverse domains/websites/tasks' (137 websites, 31 domains, 2,350 tasks), 'filtering with small LM improves effectiveness' (Table 2 MINDACT vs baselines), 'decent level of performance even on unseen websites/domains' (Table 2 cross-website/domain results), 'substantial room to improve' (low task success rates)."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The main causal claim — that small-LM filtering improves LLM effectiveness — is supported by controlled comparison: same LLM (Flan-T5) with and without candidate generation, with the only difference being the filtering stage. The ablation design adequately isolates the filtering component's contribution."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Claims are appropriately bounded. The abstract states 'a decent level of performance' rather than overstating, and explicitly notes 'substantial room to improve.' Section 6 discusses specific limitations including English-only websites, US-centric selection, and MTurk annotator bias."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper does not substantively discuss alternative explanations for observed results. For instance, the similar performance on Cross-Website and Cross-Domain is attributed to 'challenges primarily stemming from website design diversity' without considering alternatives like dataset difficulty distribution or sample size effects."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper's metrics (element accuracy, step success rate, task success rate) directly measure what is claimed — the ability to select correct web elements and complete multi-step tasks. No proxy gap exists between what is measured and what is claimed."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Open-source models are specified precisely with HuggingFace links (deberta-v3-base, flan-t5-base/large/xl). However, GPT-3.5-turbo and GPT-4 are referenced only by marketing names without snapshot dates or API versions, despite model behavior changing across versions."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Table 8 in the appendix provides the full prompt used for GPT in-context learning, including the system message and three demonstration examples with expected outputs. Figures 4 and 5 show the input templates for candidate generation and action prediction."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Table 4 in the appendix reports all hyperparameters: batch size (32), epochs (5), learning rates (3e-5 for candidate generation, 5e-5 for action prediction), and temperature (0) and number of demonstrations (3) for GPT models."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "MINDACT is a two-stage model pipeline (candidate generation → action prediction), not an agentic scaffolding system. No tools, retry logic, memory, or feedback mechanisms are involved."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.2 documents preprocessing: 'We apply simple heuristics to clean the raw HTML documents, keeping only elements that are visible and carry substantial semantic meaning... This effectively reduces the average number of elements from 1,135 to 580, while still maintaining an overall recall of 94.7% for the target element.' Appendix C.1 describes element normalization for evaluation."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 'Limitations and Potential Societal Impact' provides extensive discussion covering data diversity, multimodal information, interaction dynamics, human-agent interaction, offline/online evaluation, and safety in deployment."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6 discusses specific threats: 'predominantly comprises English-language websites primarily used in the U.S.', MTurk annotators 'might be biased towards a group that is more proficient in web use', offline evaluation 'potentially leading to false negatives due to the existence of multiple paths for completing the same task.'"
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 6 explicitly states what was not tested: multimodal understanding not used, no interactive/conversational setting, no live website evaluation, no non-English websites, no users with accessibility challenges. Each limitation is framed as a specific boundary with a future direction."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The full dataset including MHTML files, DOM snapshots, HAR files, and trace files is released on HuggingFace (CC BY 4.0), enabling independent verification of the raw data."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 2.2 describes the four-stage collection process in detail: website selection (by popularity from similarweb.com, 3-5 per domain), task proposal (MTurk with ChatGPT seed tasks), task demonstration (Playwright-based tool), and task verification (by authors)."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Appendix B.1 describes annotator recruitment: Amazon MTurk platform, minimum 1,000 approved HITs with >98% approval rate, compensation at $10.10/hr estimated rate, $0.80 per approved task, consent form required, qualification assessment with questionnaire and test demonstrations."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from collection to final dataset is documented with counts at each stage: 2,411 total collected tasks → 61 discarded → 2,350 retained, with 390 task descriptions refined and 187 instances with extraneous steps removed (Section 2.2). Preprocessing reduces elements from 1,135 to 580 average (Section 4.2)."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Acknowledgements section lists funding: 'sponsored in part by NSF OAC 2112606, NSF CAREER #1942980, ARL W911NF2220144 and Ohio Supercomputer Center.'"
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All authors are affiliated with The Ohio State University, clearly stated in the paper header. No commercial product is being evaluated, so no undisclosed product-company conflicts exist."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Funding comes from NSF (National Science Foundation) and ARL (Army Research Laboratory) — government agencies with no financial stake in the benchmark results or any specific model's performance."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper. Absence of a declaration is not the same as absence of conflict."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper evaluates GPT-3.5-turbo and GPT-4 on the Mind2Web benchmark but does not state the training data cutoff dates for either model. While the benchmark is new, the underlying websites existed prior to model training."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of potential overlap between the pre-trained models' training data and the test websites/tasks. While the task annotations are new, the website content itself may have appeared in GPT training data."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The benchmark is newly created, making direct contamination of task annotations unlikely. However, the paper does not explicitly discuss this or address whether GPT models may have encountered the underlying website structures during pre-training."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "The MTurk workers are data collection annotators, not study participants. The paper studies web agent performance, not human behavior. IRB exemption confirms this is not formal human subjects research."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "While the paper mentions IRB exemption for the annotation process (Appendix B.1), the MTurk workers are dataset annotators, not subjects of a study on human behavior."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants as study subjects. The MTurk annotators are data collectors, not subjects of investigation."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants as study subjects. Annotator qualification criteria (1,000+ HITs, 98%+ approval) are for data quality, not study design."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants as study subjects. This is a dataset and benchmark paper, not an experimental study of human behavior."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants as study subjects. Blinding is not applicable to a dataset construction effort."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants as study subjects. Task attrition (61/2,411 discarded) is dataset quality control, not participant dropout."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper mentions GPT-4's 'high operational cost remains a concern' and that they could only evaluate 50 tasks per setting 'due to limited budget,' but no specific dollar amounts, token counts, or per-example costs are reported."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Hardware is mentioned (4×A100 80GB, A6000 48GB) but total training time, GPU hours, or API spend are not quantified. No total computational budget is stated."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Table 5 (Appendix D.1) reports results across 5 random seeds with mean and standard deviation, showing sensitivity is small (std < 1.0 for all settings)."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Appendix D.1: 'we show the average and standard deviation of 5 runs with different random seeds.' However, it is unclear whether the main Table 2 results represent a single run or the mean."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search budget is reported. Table 4 lists fixed hyperparameters without explaining how they were selected or how many configurations were tried."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No justification for why the specific hyperparameters (learning rate 3e-5/5e-5, batch size 32, 5 epochs) were chosen. No mention of selection on a validation set."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement all baselines (DeBERTa classification, Flan-T5 generation) and compare against their own MINDACT framework without acknowledging the potential bias of evaluating their own system against their own baseline implementations."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Models range from 86M parameters (DeBERTa) to GPT-4 (unknown size). While Flan-T5 results across three sizes (base/large/xl) show scaling effects, the compute differences between methods are not discussed or controlled for."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "Section 6 discusses offline evaluation limitations: 'the task will fail immediately if an action was not cached during data collection, potentially leading to false negatives.' The paper also acknowledges the gap between offline evaluation and live website interaction, and discusses missing multimodal signals."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "MINDACT uses the same pipeline architecture across all LLM comparisons (same candidate generation + same multi-choice format). No scaffolding confound exists because the evaluation framework is held constant."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The paper does not discuss whether GPT models' training data includes information from the real-world websites used in the benchmark. While the task annotations are new, the website content existed before model training."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup (providing HTML snippets, candidate elements) leaks more information than would be available in a realistic deployment scenario."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether tasks from the same website in training and test sets share structural patterns that could inflate generalization estimates. The Cross-Task setting explicitly includes same-website tasks in training and test."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination pipelines are mentioned."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Mind2Web is the first dataset for developing and evaluating generalist agents for the web, spanning 137 websites, 31 domains, and 2,350 tasks.",
    374       "evidence": "Table 1 comparison with existing datasets (MiniWoB++, WebShop, RUSS, PixelHelp, META-GUI, MoTIF) shows Mind2Web is unique in combining real-world websites, high diversity, and high-level task descriptions. Section 2 details the dataset composition.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "MINDACT with multi-choice QA formulation substantially outperforms direct classification and generation baselines.",
    379       "evidence": "Table 2 shows MINDACT with Flan-T5XL achieves 52.0% step SR on Cross-Task vs. 26.8% element accuracy for classification and 17.5% step SR for generation baseline.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Small LM filtering as a first stage significantly improves LLM effectiveness for web action prediction.",
    384       "evidence": "Table 2: DeBERTa candidate generation achieves 88.9%/85.3%/85.7% Recall@50, and the two-stage MINDACT outperforms single-stage baselines by large margins (Section 4.3).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "GPT-4 achieves performance comparable to fine-tuned Flan-T5 on cross-website and cross-domain settings with only 3-shot in-context learning.",
    389       "evidence": "Table 2 shows GPT-4 at 30.1%/26.4% step SR vs Flan-T5XL at 38.9%/39.6% on Cross-Website/Cross-Domain. The claim of being 'on par' for element selection is supported, but step SR shows a gap. Only evaluated on 50 tasks per setting (Appendix D.3).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Generalization challenges primarily stem from website design diversity rather than domain specifics.",
    394       "evidence": "Table 2 and Figure 6 show Cross-Website (38.9% step SR) and Cross-Domain (39.6% step SR) are notably similar despite Cross-Domain being a harder setting. The paper argues this suggests domain knowledge is less important than website-specific grounding.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "The data collection pipeline produces high-quality data, with only 61/2,411 tasks discarded and 94.7% target element recall after preprocessing.",
    399       "evidence": "Section 2.2 details the four-stage pipeline with verification by authors. Section 4.2 reports preprocessing recall. However, no inter-annotator agreement metrics are provided.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "Very small GPT-4 evaluation set",
    406       "detail": "GPT-4 was evaluated on only 50 tasks per setting due to budget constraints. This is too small to draw reliable conclusions about comparative performance, yet the paper makes claims about GPT-4 being 'on par' with fine-tuned models based on these results."
    407     },
    408     {
    409       "flag": "No statistical significance tests",
    410       "detail": "All method comparisons in Table 2 rely on comparing raw numbers without any statistical testing. Given the variance shown in Table 5 (std up to 0.8), some differences between methods may not be statistically significant."
    411     },
    412     {
    413       "flag": "No inter-annotator agreement",
    414       "detail": "Despite being a dataset paper, no inter-annotator agreement metrics are reported. Tasks were verified by authors, but no systematic measurement of annotation consistency (e.g., Cohen's kappa for action correctness) is provided."
    415     },
    416     {
    417       "flag": "Main results lack variance reporting",
    418       "detail": "Table 2 presents only point estimates for the main experiments, and the paper does not state whether they are single-run or seed-averaged. While Table 5 in the appendix shows multi-seed results for an auxiliary analysis, the primary results table lacks error bars or confidence intervals."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Do as I can, not as I say: Grounding language in robotic affordances",
    424       "authors": ["Michael Ahn", "Anthony Brohan", "Noah Brown"],
    425       "year": 2022,
    426       "arxiv_id": "2204.01691",
    427       "relevance": "Foundational work on grounding language in robotic actions (SayCan), relevant to agentic AI grounding capabilities."
    428     },
    429     {
    430       "title": "Reinforcement learning on web interfaces using workflow-guided exploration",
    431       "authors": ["Evan Zheran Liu", "Kelvin Guu", "Panupong Pasupat", "Tianlin Shi", "Percy Liang"],
    432       "year": 2018,
    433       "relevance": "MiniWoB++ benchmark for web RL agents — directly comparable prior work on web agent evaluation."
    434     },
    435     {
    436       "title": "WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents",
    437       "authors": ["Shunyu Yao", "Howard Chen", "John Yang", "Karthik Narasimhan"],
    438       "year": 2022,
    439       "arxiv_id": "2207.01206",
    440       "relevance": "Simulated web shopping environment with grounded language agents — key prior benchmark for web agents."
    441     },
    442     {
    443       "title": "ReAct: Synergizing reasoning and acting in language models",
    444       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
    445       "year": 2022,
    446       "arxiv_id": "2210.03629",
    447       "relevance": "Influential agentic framework combining reasoning and acting with LLMs, foundational for web agent architectures."
    448     },
    449     {
    450       "title": "Toolformer: Language models can teach themselves to use tools",
    451       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    452       "year": 2023,
    453       "arxiv_id": "2302.04761",
    454       "relevance": "LLMs learning to use external tools autonomously — directly relevant to agentic tool-use capabilities."
    455     },
    456     {
    457       "title": "Augmented language models: a survey",
    458       "authors": ["Grégoire Mialon", "Roberto Dessì", "Maria Lomeli"],
    459       "year": 2023,
    460       "arxiv_id": "2302.07842",
    461       "relevance": "Survey of augmented LLMs covering tool use, retrieval, and reasoning — maps the landscape Mind2Web contributes to."
    462     },
    463     {
    464       "title": "Tool learning with foundation models",
    465       "authors": ["Yujia Qin", "Shengding Hu", "Yankai Lin"],
    466       "year": 2023,
    467       "arxiv_id": "2304.08354",
    468       "relevance": "Comprehensive framework for foundation model tool learning, directly relevant to web agents as tool users."
    469     },
    470     {
    471       "title": "ToolkenGPT: Augmenting frozen language models with massive tools via tool embeddings",
    472       "authors": ["Shibo Hao", "Tianyang Liu", "Zhen Wang", "Zhiting Hu"],
    473       "year": 2023,
    474       "arxiv_id": "2305.11554",
    475       "relevance": "Method for augmenting LLMs with tools via embeddings, relevant to practical agentic tool integration."
    476     },
    477     {
    478       "title": "LLM-Planner: Few-shot grounded planning for embodied agents with large language models",
    479       "authors": ["Chan Hee Song", "Jiaman Wu", "Clayton Washington", "Brian M. Sadler", "Wei-Lun Chao", "Yu Su"],
    480       "year": 2022,
    481       "arxiv_id": "2212.04088",
    482       "relevance": "Few-shot grounded planning with LLMs for embodied agents — directly relevant to LLM-based agent planning."
    483     },
    484     {
    485       "title": "Don't generate, discriminate: A proposal for grounding language models to real-world environments",
    486       "authors": ["Yu Gu", "Xiang Deng", "Yu Su"],
    487       "year": 2022,
    488       "arxiv_id": "2212.09736",
    489       "relevance": "Proposes discrimination over generation for grounding LMs in real environments — foundational idea behind MINDACT's multi-choice formulation."
    490     },
    491     {
    492       "title": "Understanding HTML with large language models",
    493       "authors": ["Izzeddin Gur", "Ofir Nachum", "Yingjie Miao"],
    494       "year": 2023,
    495       "relevance": "LLMs for HTML understanding — directly relevant prior work on the core technical challenge of web agent grounding."
    496     },
    497     {
    498       "title": "Scaling instruction-finetuned language models",
    499       "authors": ["Hyung Won Chung", "Le Hou", "Shayne Longpre"],
    500       "year": 2022,
    501       "arxiv_id": "2210.11416",
    502       "relevance": "Introduces Flan-T5, the backbone model used in MINDACT experiments — relevant to instruction-tuned LLM capabilities."
    503     },
    504     {
    505       "title": "ALFRED: A benchmark for interpreting grounded instructions for everyday tasks",
    506       "authors": ["Mohit Shridhar", "Jesse Thomason", "Daniel Gordon"],
    507       "year": 2020,
    508       "relevance": "Benchmark for grounded instruction following in embodied settings — related evaluation paradigm for agent grounding."
    509     }
    510   ],
    511   "engagement_factors": {
    512     "practical_relevance": {
    513       "score": 2,
    514       "justification": "Dataset and model code are released and directly usable by researchers building web agents, though they are not yet a deployable end-user tool."
    515     },
    516     "surprise_contrarian": {
    517       "score": 1,
    518       "justification": "Fills a clear gap in web agent benchmarks but doesn't challenge conventional wisdom — results confirming LLMs struggle with real-world web tasks are expected."
    519     },
    520     "fear_safety": {
    521       "score": 1,
    522       "justification": "Section 6 briefly raises safety concerns about autonomous web agents (financial transactions, CAPTCHA bypassing, malicious use) but this is not the paper's focus."
    523     },
    524     "drama_conflict": {
    525       "score": 0,
    526       "justification": "No controversial claims or conflicts — a straightforward dataset and benchmark contribution."
    527     },
    528     "demo_ability": {
    529       "score": 2,
    530       "justification": "Code, data, and trained models are open-sourced on GitHub and HuggingFace, enabling researchers to download and experiment immediately."
    531     },
    532     "brand_recognition": {
    533       "score": 1,
    534       "justification": "From the OSU NLP Group, a respected academic lab but not a household-name organization like OpenAI or Google DeepMind."
    535     }
    536   }
    537 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs