scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32595B)
      1 {
      2   "paper": {
      3     "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
      4     "authors": [
      5       "Tianbao Xie",
      6       "Danyang Zhang",
      7       "Jixuan Chen",
      8       "Xiaochuan Li",
      9       "Siheng Zhao",
     10       "Ruisheng Cao",
     11       "Toh Jing Hua",
     12       "Zhoujun Cheng",
     13       "Dongchan Shin",
     14       "Fangyu Lei",
     15       "Yitao Liu",
     16       "Yiheng Xu",
     17       "Shuyan Zhou",
     18       "Silvio Savarese",
     19       "Caiming Xiong",
     20       "Victor Zhong",
     21       "Tao Yu"
     22     ],
     23     "year": 2024,
     24     "venue": "Neural Information Processing Systems",
     25     "arxiv_id": "2404.07972",
     26     "doi": "10.48550/arXiv.2404.07972"
     27   },
     28   "scan_version": 2,
     29   "active_modules": ["experimental_rigor", "data_leakage"],
     30   "checklist": {
     31     "artifacts": {
     32       "code_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper states 'Our code, environment, baseline models, and data are publicly available at https://os-world.github.io' (Abstract, Section 1) and 'We open-source OSWORLD environment and benchmark, including environment initial state setup, reliable evaluation scripts, documentation, and our implementation of baseline models' (Section 1)."
     36       },
     37       "data_released": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The benchmark of 369 tasks with initial state configs and evaluation scripts is publicly released. The paper states data is available at https://os-world.github.io. Task configurations, evaluation functions, and VM snapshots are included."
     41       },
     42       "environment_specified": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Detailed environment specifications are provided: Ubuntu 22.04, LibreOffice 7.3.7.2, Thunderbird 115.6.0, specific VM configurations, screen resolution 1920x1080. Software dependencies including pyautogui, pyatspi, openpyxl, python-docx, python-pptx, Playwright are documented throughout Appendix B."
     46       },
     47       "reproduction_instructions": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper provides detailed documentation of the environment setup (Section 2.2), task configuration files (Figure 2), evaluation scripts, and releases baseline implementations. The open-source release includes documentation for reproducing experiments."
     51       }
     52     },
     53     "statistical_methodology": {
     54       "confidence_intervals_or_error_bars": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Table 5 reports only point estimates of success rates (e.g., 12.24%, 5.26%) with no confidence intervals or error bars. No uncertainty quantification is provided for any result."
     58       },
     59       "significance_tests": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper makes numerous comparative claims (e.g., 'GPT-4V falls far behind', model X outperforms model Y) based solely on comparing raw percentages without any statistical significance tests."
     63       },
     64       "effect_sizes_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Results are reported as raw success rate percentages. No formal effect sizes (Cohen's d, odds ratios, etc.) are computed. The paper compares numbers directly (e.g., 12.24% vs 72.36%) without formal effect size analysis."
     68       },
     69       "sample_size_justified": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No justification is given for why 369 tasks constitute a sufficient sample. No power analysis is discussed. The ablation studies (Figures 5, 7, 8) are run on subsets described as '10% of examples' or '28 tasks' without justifying these sample sizes."
     73       },
     74       "variance_reported": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "All results appear to be single-run numbers. No standard deviations, variance across seeds, or interquartile ranges are reported in any table or figure."
     78       }
     79     },
     80     "evaluation_design": {
     81       "baselines_included": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper includes extensive baseline comparisons across 8+ models (GPT-4V, GPT-4, Gemini-Pro, Claude-3 Opus, Mixtral, Llama-3, CogAgent, Qwen-Max) and human performance (Table 5)."
     85       },
     86       "baselines_contemporary": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Baselines include GPT-4V, GPT-4o, Gemini-Pro-1.5, Claude-3 Opus, and Llama-3 — all state-of-the-art at time of writing (2024). Models represent both open-source and closed-source frontiers."
     90       },
     91       "ablation_study": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Multiple ablation studies are conducted: input modality variations (a11y tree, screenshot, SoM, combined in Table 5), screenshot resolution (Figure 5), trajectory history length (Figure 7), and window perturbation robustness (Figure 8)."
     95       },
     96       "multiple_metrics": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The primary and essentially only metric is success rate. While the paper breaks results down by category, the underlying metric is the same binary task completion rate. No complementary metrics (e.g., partial credit distribution, step efficiency, action accuracy) are systematically reported in main results."
    100       },
    101       "human_evaluation": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Human performance evaluation is conducted (Section 3.4): 'We conduct human evaluations on each example in our dataset, with annotators being computer science major college students.' Human accuracy (72.36%) and completion time (median 111.94s) are reported (Figure 4)."
    105       },
    106       "held_out_test_set": {
    107         "applies": true,
    108         "answer": true,
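    109         "justification": "The benchmark tasks form a dedicated evaluation set — all agent evaluations are zero-shot with no training on benchmark tasks. The tasks were assembled specifically for evaluation and models were not fine-tuned on them. This does not rule out pretraining exposure to the 84 tasks integrated from existing public benchmarks (see the contamination items)."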
    110       },
    111       "per_category_breakdown": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 5 provides breakdowns by OS, Office, Daily, Professional, and Workflow categories. Table 14 (Appendix) provides per-app breakdowns (Calc, Impress, Writer, Chrome, VLC, etc.). Table 6 breaks down by difficulty, feasibility, and number of apps."
    115       },
    116       "failure_cases_discussed": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section 5.4 provides extensive qualitative analysis of failure cases with specific examples (Figures 9, 10, 16-18). Common error types are catalogued: mouse click inaccuracies (>75% of failures), repetitive clicks, environmental noise, lack of domain knowledge."
    120       },
    121       "negative_results_reported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Several negative results are reported: SoM hurts GPT-4V performance vs screenshot+a11y tree (Section 4.2), screenshot-only history doesn't improve performance (Figure 7), few-shot prompting failed (2.79%, Section 4.1), window perturbation cuts the success rate on a 28-task subset from 50.79% to between 36.5% and 15.04% (Figure 8)."
    125       }
    126     },
    127     "claims_and_evidence": {
    128       "abstract_claims_supported": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Abstract claims are supported: 'first-of-its-kind scalable, real computer environment' (Table 4 comparison), '369 computer tasks' (Table 3), 'over 72.36% of the tasks' by humans (Section 3.4, Figure 4), 'best model achieves only 12.24%' (Table 5, GPT-4 with a11y tree)."
    132       },
    133       "causal_claims_justified": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Causal claims like 'higher resolution leads to improved performance' (Figure 5) and 'longer text history improves performance' (Figure 7) are supported by controlled single-variable manipulation experiments. Ablation studies in Table 5 isolate input modalities. The study designs are adequate for these causal claims."
    137       },
    138       "generalization_bounded": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The title claims 'Open-Ended Tasks in Real Computer Environments' and the abstract says the environment supports 'various operating systems such as Ubuntu, Windows, and macOS,' but the benchmark evaluation is almost entirely Ubuntu-based (369 tasks) with only 43 supplementary Windows tasks and no macOS tasks. The conclusion generalizes to 'autonomous digital agents' broadly."
    142       },
    143       "alternative_explanations_discussed": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper provides explanations for results (e.g., SoM fails due to noise from too many elements, Claude struggles with grounding) but doesn't systematically consider alternative explanations. No discussion of confounds like model size, training data differences, or API version effects on comparative results."
    147       },
    148       "proxy_outcome_distinction": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper measures success rate on 369 curated tasks but frames results in terms of 'ability to serve as computer assistants' (Section 4.2) and 'generalist capable computer agents' (Section 1). The gap between curated benchmark performance and real-world computer assistance is not discussed."
    152       }
    153     },
    154     "setup_transparency": {
    155       "model_versions_specified": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Appendix C.1 specifies 'gpt-3.5-turbo-16k, gpt-4-0125-preview, and gpt-4-vision-preview' and 'gemini-pro and gemini-pro-vision.' These are specific API version identifiers."
    159       },
    160       "prompts_provided": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Full prompts are provided in Appendix C.2 (Sections C.2.1 and C.2.2) for all four input settings: a11y tree, screenshot, screenshot+a11y tree, and Set-of-Mark."
    164       },
    165       "hyperparameters_reported": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section C.1 states: 'temperature parameter to 1.0, and top_p to 0.9, and the maximum number of tokens for generation is set to 1500. We set the maximum steps of interaction to 15 and the maximum time limits to 30 minutes.' Screen resolution 1920x1080 is also specified."
    169       },
    170       "scaffolding_described": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The agent scaffolding is described in detail: observation space (Section 2.3), action space via pyautogui (Section 2.4, Table 2), agent interaction loop with POMDP formalization (Section 2.1), history encoding using recent 3 observations in chat mode (Section 4.1), and special actions (WAIT, FAIL, DONE)."
    174       },
    175       "data_preprocessing_documented": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "A11y tree filtering is documented in detail (Section C.3, Table 13) with specific criteria for keeping nodes (tag types, visibility, availability, etc.). Screenshot downsampling procedures are described (Section 5.2). SoM implementation is detailed (Section C.4)."
    179       }
    180     },
    181     "limitations_and_scope": {
    182       "limitations_section_present": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 7 'Conclusion and Future Work' includes substantive discussion of limitations across four dedicated subsections: VLM capabilities, agent methodologies, safety challenges, and data/environment expansion. Quality control (Section 3.2) also acknowledges false positives/negatives."
    186       },
    187       "threats_to_validity_specific": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Specific threats are discussed: a11y tree quality varies across applications (Section 7), copyright prevents macOS evaluation (Appendix B.1), evaluation may have false positives/negatives that 'further investment of time and a more red teaming could further reduce' (Section 3.2), few-shot prompting scheme limitation (Section 4.1)."
    191       },
    192       "scope_boundaries_stated": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Scope boundaries are stated: Ubuntu focus with limited Windows supplementary tasks (Section 3.1), 8 representative applications selected with stated criteria (Appendix B.2), macOS excluded due to copyright (Appendix B.1), max 15 steps per task (Section 4.1)."
    196       }
    197     },
    198     "data_integrity": {
    199       "raw_data_available": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The full benchmark data is released: task configurations, evaluation scripts, VM snapshots, and baseline implementations are available at https://os-world.github.io. Individual task results can be verified through the execution-based evaluation framework."
    203       },
    204       "data_collection_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Section 3.2 describes task collection in detail: sources include 'forums, tutorials, guidelines' (listed in Table 9), selection criteria based on 'popularity, helpfulness, and diversity, revealed by the views and votes,' with cross-checking by two non-annotating authors per task."
    208       },
    209       "recruitment_methods_described": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "Human evaluators are described as 'computer science major college students who possess basic software usage skills but have not been exposed to the samples or software before' (Section 3.4). However, how they were recruited, how many there were, and whether they were compensated are not described."
    213       },
    214       "data_pipeline_documented": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The full pipeline is documented: source collection (Table 9), task selection criteria, annotation process (Section 3.2), cross-checking by two additional authors, four rounds of quality control checks with 400+ man-hours, human evaluation and baseline experiment iterations."
    218       }
    219     },
    220     "conflicts_of_interest": {
    221       "funding_disclosed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The Acknowledgements section thanks various researchers for feedback but does not disclose any funding sources, grants, or financial support."
    225       },
    226       "affiliations_disclosed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Author affiliations are clearly listed: The University of Hong Kong, CMU, Salesforce Research, and University of Waterloo. These are prominently displayed under author names."
    230       },
    231       "funder_independent_of_outcome": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No funding is disclosed, so independence cannot be assessed. Two authors are from Salesforce Research, but no funding relationship or potential conflicts are discussed."
    235       },
    236       "financial_interests_declared": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No competing interests statement is present. No disclosure of patents, equity, or financial interests is provided despite Salesforce Research affiliation."
    240       }
    241     },
    242     "contamination": {
    243       "training_cutoff_stated": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "While specific API versions are stated (e.g., 'gpt-4-0125-preview'), the training data cutoff dates for the evaluated models are not explicitly stated. The paper does not discuss when each model's training data was collected."
    247       },
    248       "train_test_overlap_discussed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "The paper integrates 84 tasks from existing benchmarks (NL2Bash, Mind2Web, SheetCopilot, PPTC, GAIA) but does not discuss whether the evaluated models could have seen these tasks during training."
    252       },
    253       "benchmark_contamination_addressed": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No contamination analysis is provided. While the majority of tasks are newly created (reducing contamination risk), the 84 integrated tasks from existing public benchmarks (22.8% of the dataset) could be in the training data of evaluated models, and this is never discussed."
    257       }
    258     },
    259     "human_studies": {
    260       "pre_registered": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No pre-registration is mentioned for the human performance study."
    264       },
    265       "irb_or_ethics_approval": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No IRB or ethics board approval is mentioned for the human performance evaluation study involving college students."
    269       },
    270       "demographics_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "Evaluators are described only as 'computer science major college students who possess basic software usage skills' (Section 3.4). No age, gender, geographic distribution, years of experience, or detailed characterization is provided."
    274       },
    275       "inclusion_exclusion_criteria": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "Only minimal criteria mentioned: CS major, basic software skills, no prior exposure to samples or software. No formal inclusion/exclusion criteria, screening process, or eligibility requirements are documented."
    279       },
    280       "randomization_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "The human performance study is observational — participants complete tasks without experimental conditions or group assignments. No randomization is needed or applicable."
    284       },
    285       "blinding_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "The human performance study measures completion ability on tasks, not a comparative experiment with conditions. Blinding is not applicable to this observational measurement."
    289       },
    290       "attrition_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No information about participant attrition or dropout from the human evaluation study is reported. The number of evaluators who started vs completed all tasks is not stated."
    294       }
    295     },
    296     "cost_and_practicality": {
    297       "inference_cost_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No API costs, tokens consumed, or cost per task are reported despite extensive use of commercial APIs (GPT-4V, Gemini, Claude-3) across 369 tasks in multiple settings."
    301       },
    302       "compute_budget_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Annotation effort is quantified (1800 man-hours) but computational budget for running experiments (GPU hours, API spend, total inference time) is not reported."
    306       }
    307     },
    308     "experimental_rigor": {
    309       "seed_sensitivity_reported": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No seed sensitivity analysis is reported. All results appear to be from single runs with temperature=1.0 and top_p=0.9, meaning outputs are stochastic but no multi-seed analysis is conducted."
    313       },
    314       "number_of_runs_stated": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The number of experimental runs per configuration is not stated. Results appear to be single-run numbers, but this is never explicitly confirmed."
    318       },
    319       "hyperparameter_search_budget": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Hyperparameters (temperature=1.0, top_p=0.9, max_tokens=1500) are reported but no search budget is given. The paper mentions trying few-shot prompting first (2.79%) but doesn't systematically report the search space explored."
    323       },
    324       "best_config_selection_justified": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "The paper justifies its configuration: 'Our prior experiments following VisualWebArena adopt few-shot prompting... but this resulted in poor performance (success rate of 2.79%)... We attribute the result to a lack of history encoding and change in the prompting scheme' (Section 4.1). The switch to chat mode with trajectory history is motivated."
    328       },
    329       "multiple_comparison_correction": {
    330         "applies": false,
    331         "answer": false,
    332         "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable. All comparisons are based on raw percentage comparisons."
    333       },
    334       "self_comparison_bias_addressed": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The authors implement all baseline agents themselves using the same framework but do not acknowledge self-implementation bias. There is no independent evaluation, and no discussion of the potential bias introduced by the authors implementing their own baselines."
    338       },
    339       "compute_budget_vs_performance": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "Performance is not reported as a function of compute budget. Different models have vastly different computational costs (e.g., GPT-4V vs CogAgent) but no compute-normalized comparison is provided."
    343       },
    344       "benchmark_construct_validity": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "As the benchmark's creators, the authors extensively discuss what it measures: real-world computer task completion across diverse applications. Table 4 compares construct dimensions with existing benchmarks. Section 3 motivates task design from real user scenarios. Section 3.4 validates task difficulty via human performance comparison with WebArena."
    348       },
    349       "scaffold_confound_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "The same scaffolding framework (pyautogui action space, same prompt structure, same observation processing) is used across all model comparisons within each input setting. Table 5 is organized by input setting, so model comparisons are within controlled scaffolding conditions."
    353       }
    354     },
    355     "data_leakage": {
    356       "temporal_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of temporal leakage. The paper does not address whether task solutions or related tutorials could appear in models' training data, particularly for the 84 tasks integrated from existing benchmarks."
    360       },
    361       "feature_leakage_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether the evaluation setup leaks information. The a11y tree provides structured element information that agents might exploit in ways not representative of real use — this is never discussed as a potential leakage concern."
    365       },
    366       "non_independence_addressed": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "The paper does not discuss independence of integrated benchmark tasks from training data. 84 tasks from NL2Bash, Mind2Web, SheetCopilot, PPTC, and GAIA are included without verifying they are not in training sets of evaluated models."
    370       },
    371       "leakage_detection_method": {
    372         "applies": true,
    373         "answer": false,
    374         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, temporal splits, or decontamination pipelines are mentioned."
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "The best model achieves only 12.24% success rate on OSWorld, compared to 72.36% human performance.",
    381       "evidence": "Table 5 shows GPT-4 with a11y tree input achieves 12.24% overall success rate. Human performance is 72.36% (Section 3.4, Figure 4). Consistent across detailed per-app breakdowns in Table 14.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Current LLMs and VLMs are far from capable of serving as computer assistants.",
    386       "evidence": "Table 5 shows all models range from 0.99% to 12.24%, with workflow tasks reaching only 6.57% at best. The gap to human performance (72.36%) is consistently large across all categories.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Higher screenshot resolution typically leads to improved performance for VLM agents.",
    391       "evidence": "Figure 5 shows performance increases with resolution for screenshot-only setting. However, this is tested on only 10% of examples (about 37 tasks), and the SoM setting shows a non-monotonic pattern.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Longer text-based trajectory history improves performance, but screenshot-based history does not.",
    396       "evidence": "Figure 7 shows SoM performance increases with more a11y tree history, while screenshot-only performance remains flat or decreases. Tested on 10% of examples.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Agent performance varies significantly more across task categories than human performance.",
    401       "evidence": "Table 5 shows agent performance ranges from 0% to 41.67% across categories while human performance stays within 70-75% range across all categories.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "OSWorld tasks are more challenging and time-consuming than WebArena tasks.",
    406       "evidence": "Figure 4 shows median completion time of 111.94s vs WebArena's 35.38s, and human accuracy of 72.36% vs WebArena's 88%. However, WebArena comparison uses only 100 sampled examples.",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "VLM agents are not robust to perturbations of position, size, and clutter of application windows.",
    411       "evidence": "Figure 8 shows performance drops from 50.79% to 36.5% (position), 15.04% (size), and 25.39% (clutter) on a subset of 28 well-performing tasks.",
    412       "supported": "moderate"
    413     },
    414     {
    415       "claim": "More than 75% of GPT-4V agent failures involve mouse click inaccuracies.",
    416       "evidence": "Section 5.4 states 'Among the 550 failed examples from different settings in our sample, more than 75% exist mouse click inaccuracies.' Qualitative examples are provided in Appendix D.",
    417       "supported": "moderate"
    418     }
    419   ],
    420   "methodology_tags": ["benchmark-eval"],
    421   "key_findings": "OSWorld introduces the first scalable, real computer environment for multimodal agents spanning 369 tasks across Ubuntu applications. The best model (GPT-4) achieves only 12.24% success compared to 72.36% human performance, with workflow tasks proving especially difficult (6.57% best). Key bottlenecks identified include GUI grounding accuracy (more than 75% of sampled failures involve mouse click inaccuracies), lack of application-specific operational knowledge, and inability to handle environmental noise from unexpected windows. Higher screenshot resolution and longer text-based trajectory history improve performance, while Set-of-Mark augmentation shows inconsistent benefits across models.",
    422   "red_flags": [
    423     {
    424       "flag": "No uncertainty quantification",
    425       "detail": "All results in Table 5 and throughout the paper appear to be single-run point estimates (the number of runs is never stated), with no error bars, confidence intervals, or variance across runs. Given stochastic sampling (temperature=1.0, top_p=0.9), results could vary substantially across runs."
    426     },
    427     {
    428       "flag": "Self-implementation bias unacknowledged",
    429       "detail": "All baseline agents are implemented by the paper's authors using their framework. Lucic et al. (2018) showed that authors' implementations of baselines systematically underperform. This bias is not discussed."
    430     },
    431     {
    432       "flag": "Contamination risk for integrated tasks",
    433       "detail": "84 tasks (22.8%) are integrated from existing public benchmarks (NL2Bash, Mind2Web, SheetCopilot, PPTC, GAIA). These could appear in the training data of evaluated models, yet contamination is never analyzed or discussed."
    434     },
    435     {
    436       "flag": "Ablation studies on very small subsets",
    437       "detail": "Resolution (Figure 5) and history (Figure 7) ablations use only '10% of examples' (~37 tasks). Window perturbation (Figure 8) uses only 28 tasks, chosen because agents already performed well on them. These small samples make the ablation conclusions fragile."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Mind2Web: Towards a Generalist Agent for the Web",
    443       "authors": ["Xiang Deng", "Yu Gu", "Boyuan Zheng"],
    444       "year": 2023,
    445       "arxiv_id": "2306.06070",
    446       "relevance": "Web agent benchmark with 2,350 tasks used as a comparison point and source of integrated tasks for OSWorld."
    447     },
    448     {
    449       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    450       "authors": ["Shuyan Zhou", "Frank F Xu", "Hao Zhu"],
    451       "year": 2023,
    452       "arxiv_id": "2307.13854",
    453       "relevance": "Realistic web agent benchmark with execution-based evaluation, directly compared to OSWorld for task difficulty and human performance."
    454     },
    455     {
    456       "title": "VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks",
    457       "authors": ["Jing Yu Koh", "Robert Lo"],
    458       "year": 2024,
    459       "arxiv_id": "2401.13649",
    460       "relevance": "Multimodal web agent benchmark extending WebArena with visual tasks; influenced OSWorld's SoM and prompting methodology."
    461     },
    462     {
    463       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    464       "authors": ["Carlos E Jimenez", "John Yang"],
    465       "year": 2023,
    466       "arxiv_id": "2310.06770",
    467       "relevance": "Influential code agent benchmark for real-world software engineering tasks, related to OSWorld's goal of evaluating agents on real computer tasks."
    468     },
    469     {
    470       "title": "AgentBench: Evaluating LLMs as Agents",
    471       "authors": ["Xiao Liu", "Hao Yu"],
    472       "year": 2023,
    473       "arxiv_id": "2308.03688",
    474       "relevance": "Multi-environment benchmark for LLM agents across isolated environments, compared to OSWorld's unified computer environment approach."
    475     },
    476     {
    477       "title": "CogAgent: A Visual Language Model for GUI Agents",
    478       "authors": ["Wenyi Hong", "Weihan Wang"],
    479       "year": 2023,
    480       "arxiv_id": "2312.08914",
    481       "relevance": "Specialized VLM for GUI agent tasks, evaluated as an open-source baseline in OSWorld."
    482     },
    483     {
    484       "title": "GAIA: A Benchmark for General AI Assistants",
    485       "authors": ["Grégoire Mialon", "Clémentine Fourrier"],
    486       "year": 2023,
    487       "arxiv_id": "2311.12983",
    488       "relevance": "General AI assistant benchmark with 466 tasks requiring real-world tool use; source of integrated tasks for OSWorld."
    489     },
    490     {
    491       "title": "WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?",
    492       "authors": ["Alexandre Drouin", "Maxime Gasse"],
    493       "year": 2024,
    494       "arxiv_id": "2403.07718",
    495       "relevance": "Web agent benchmark for knowledge work tasks with 23k instances, compared to OSWorld's approach in Table 4."
    496     },
    497     {
    498       "title": "InterCode: Standardizing and Benchmarking Interactive Coding with Execution Feedback",
    499       "authors": ["John Yang", "Akshara Prabhakar"],
    500       "year": 2023,
    501       "arxiv_id": "2306.14898",
    502       "relevance": "Interactive coding environment benchmark with execution feedback, related to OSWorld's execution-based evaluation approach."
    503     },
    504     {
    505       "title": "GPT-4V(ision) is a Generalist Web Agent, if Grounded",
    506       "authors": ["Boyuan Zheng", "Boyu Gou"],
    507       "year": 2024,
    508       "arxiv_id": "2401.01614",
    509       "relevance": "Study demonstrating GPT-4V as a web agent when properly grounded; influenced OSWorld's SoM and grounding analysis."
    510     },
    511     {
    512       "title": "Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V",
    513       "authors": ["Jianwei Yang", "Hao Zhang"],
    514       "year": 2023,
    515       "arxiv_id": "2310.11441",
    516       "relevance": "Visual prompting technique evaluated as an input method in OSWorld; shown to have inconsistent effectiveness for GUI agent tasks."
    517     },
    518     {
    519       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    520       "authors": ["Hongliang He", "Wenlin Yao"],
    521       "year": 2024,
    522       "arxiv_id": "2401.13919",
    523       "relevance": "End-to-end multimodal web agent approach relevant to OSWorld's investigation of vision-based agent capabilities."
    524     }
    525   ]
    526 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs