scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27978B)
      1 {
      2   "paper": {
      3     "title": "Fara-7B: An Efficient Agentic Model for Computer Use",
      4     "authors": [
      5       "Ahmed Awadallah",
      6       "Yash Lara",
      7       "Raghav Magazine",
      8       "Hussein Mozannar",
      9       "Akshay Nambi",
     10       "Yash Pandya",
     11       "Aravind Rajeswaran",
     12       "Corby Rosset",
     13       "Alexey Taymanov",
     14       "Vibhav Vineet",
     15       "Spencer Whitehead",
     16       "Andrew Zhao"
     17     ],
     18     "year": 2025,
     19     "venue": "arXiv",
     20     "arxiv_id": "2511.19663",
     21     "doi": "10.48550/arXiv.2511.19663"
     22   },
     23   "scan_version": 2,
     24   "active_modules": ["experimental_rigor", "data_leakage"],
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper provides a GitHub link (https://github.com/microsoft/fara) and model weights on HuggingFace (https://huggingface.co/microsoft/fara-7b) and Azure Foundry."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "WebTailBench (609 tasks) is stated to be released. The model is open-weight. However, the full 145K trajectory training dataset is not explicitly stated as released. WebTailBench counts as partial data release; standard benchmarks used (WebVoyager, Mind2Web) are public."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Appendix C mentions DeepSpeed Stage 3 and bf16 precision on 64 H100 GPUs, but no library versions or dependency specifications."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repo is referenced but the paper itself does not include a reproducing-results section."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Table 19 in Appendix D.3 reports mean ± standard deviation for all benchmarks across three runs (e.g., 'Fara-7B 73.5 ± 1.0' on WebVoyager)."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No statistical significance tests are used. Claims of outperformance are based on comparing mean success rates without any hypothesis testing."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Absolute accuracy differences are reported with baseline context throughout (e.g., Fara-7B 73.5% vs UI-TARS-1.5-7B 66.4% on WebVoyager, Table 9). Cost differences are also contextualized ($0.025 vs $0.30+ per task, Table 10)."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No justification for why the benchmark sizes or number of runs (3) are sufficient. The critical point evaluation uses only 23 tasks with no power analysis."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 19 reports standard deviation across three independent runs for all benchmarks. Tables 10 and 12 report mean ± std for actions per task and input/output tokens."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple baselines included: UI-TARS-1.5-7B, OpenAI computer-use-preview, SoM agents with GPT-4o/o3/GPT-5, and GLM-4.1V-9B-Thinking (Tables 9, 11)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Baselines include GPT-5, o3, and UI-TARS-1.5-7B — all 2025 models. These are state-of-the-art at time of writing."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Table 4 shows cumulative ablations of the task solving pipeline. Figure 7 (left) shows data scaling ablation (1%, 10%, 100%). Figure 7 (middle/right) shows inference step scaling."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Success rate across 4 benchmarks (WebVoyager, Online-Mind2Web, DeepShop, WebTailBench), plus cost per task, actions per task, token usage, grounding accuracy (ScreenSpot), and safety refusal rate."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 5.1.2 states Browserbase independently verified Fara-7B with human annotators, establishing 62% accuracy on WebVoyager. However, the gap between auto-eval and human eval is acknowledged but not deeply explored."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Evaluation is on separate benchmarks (WebVoyager, Online-Mind2Web, DeepShop, WebTailBench) not used in training. The model was trained on FaraGen data, evaluated on independent benchmarks."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Table 11 provides per-category breakdown of WebTailBench across 11 segments. Table 13b breaks down grounding by Mobile/Desktop/Web and Text/Icon."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2.2 discusses looping failures. Table 2 shows the 'funnel' of trajectory losses. Section 5.4 discusses 4 critical point failures. Section 7 discusses limitations including inability to drag-and-drop, hallucinations, etc."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Table 2 shows very low success rates for difficult tasks (flights 3% without Browserbase). Table 4 shows weak baseline performance (33%). The human evaluation gap (62% human vs 73.5% auto) is a negative finding."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims about outperforming comparable-size models and being competitive with larger frontier models are supported by Tables 9 and 10. The ~$1 per trajectory claim is supported by Table 6."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Ablation studies (Table 4, Figure 7) support causal claims about the contribution of data scaling and pipeline modifications. These are controlled single-variable manipulations."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The title says 'Computer Use' broadly but evaluation is web-only. The paper acknowledges limitations (no drag-and-drop, no audio/video) in Section 7, but the abstract and framing suggest broader computer use capability than what was tested."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No substantive discussion of alternative explanations for the results. For example, the base model (Qwen2.5-VL) may already have web navigation capabilities; the paper doesn't control for this beyond the grounding comparison in Table 13a."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper measures success rate on benchmarks and frames this as 'agentic capabilities' and 'computer use' ability. The gap between benchmark task completion and real-world computer use utility is not discussed, though the human eval gap (62% vs 73.5%) hints at it."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper refers to 'GPT-4o', 'o3', 'GPT-5', 'o4-mini' without API version strings or snapshot dates. The base model is specified as 'Qwen2.5-VL-7B' which is more specific but still lacks a checkpoint hash or date."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The paper describes prompts conceptually (e.g., the Orchestrator ledger fields in Table 1, verifier descriptions) but does not provide the actual prompt text used for any component. The appendix shows a single screenshot QA prompt excerpt but not the full prompts."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Appendix C reports training hyperparameters: AdamW with β1=0.9, β2=0.95, cosine warmup for 10% of steps, learning rate 5e-6, gradient clipping max 1, 2 epochs (~28k iterations), batch size 128, 64 H100 GPUs, DeepSpeed Stage 3, bf16."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The multi-agent scaffolding (Orchestrator, WebSurfer, UserSimulator, verifiers) is described in extensive detail in Sections 2.1-2.3 with figures, logic tables (Tables 1, 3), and the action space (Table 7)."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 3.2 describes trajectory data processing: extracting screenshots/reasoning/actions from WebSurfer outputs, replacing SoM element IDs with center coordinates, keeping only N=3 recent observations. Table 16 shows data mixture composition."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 7 (Discussion) contains a 'Limitations' paragraph discussing action space limitations, accuracy on complex tasks, hallucinations, and the critical point framework's incompleteness."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Specific threats discussed: the absence of training data beyond critical points may cause unexpected behavior (Section 2.2), Browserbase dependency for reliable evaluation (Section 5.1.1), time-sensitive tasks going stale, human eval vs auto-eval gap (Section 5.1.2), small critical point evaluation dataset (Section 5.4)."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 7 explicitly states what the model cannot do: drag-and-drop, video/audio consumption, game playing, ultra-low latency tasks. Guidelines for Safe Use state the model should not be used in regulated domains or commercial applications."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The full 145K trajectory training dataset is not released. Only the model weights, WebTailBench tasks, and code are released. The raw FaraGen data is not available for verification."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 2 extensively describes the data collection: three task proposal strategies (targeted URL, agentic exploration, exemplar), task solving with Magentic-One, and three-stage verification. Table 5 provides statistics. Table 2 shows per-segment yields."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants were recruited as subjects. Data is synthetically generated from websites. The third-party human evaluation by Browserbase is briefly described but this is evaluation, not data collection for training."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The full pipeline is documented: task proposal → task solving → verification → filtering. Table 2 shows the funnel with error rates, completion rates, and verification rates per segment. Table 5 provides final statistics."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding source is disclosed. The paper is from 'AI Frontiers' (Microsoft) but no explicit funding statement is provided."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The paper is labeled 'AI Frontiers' which is a Microsoft research group. Authors are from Microsoft. The affiliation is visible in the header."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Microsoft funds this research and has a direct financial interest in the outcome — Fara-7B is released on Azure Foundry (a Microsoft product). The funder is not independent."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement is provided. Microsoft employees are evaluating a Microsoft product released on a Microsoft platform."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The base model Qwen2.5-VL's training cutoff is not stated. The FaraGen data is collected from live websites but no cutoff date is given for when data collection occurred."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether WebVoyager tasks, Online-Mind2Web tasks, or other benchmark tasks overlap with FaraGen training data or the base model's pretraining data."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "WebVoyager and Mind2Web were published before the model's likely training cutoff. No contamination analysis is performed. WebTailBench is new but the other benchmarks are not addressed."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants as research subjects. The third-party Browserbase evaluation is quality assurance, not a human subjects study."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants as research subjects."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants as research subjects."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants as research subjects."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants as research subjects."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants as research subjects."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants as research subjects."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Table 10 reports $0.025 per task for Fara-7B on WebVoyager. Table 12 reports $0.069 per task on WebTailBench. Table 6 reports data generation costs (~$1 per trajectory)."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Appendix C states 64 H100 GPUs for training, 2 epochs (~28k iterations). Section 2.2 states 40 nodes with 4 browsers each for data generation achieving 600 trajectories/hour. Table 6 gives per-trajectory generation costs."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No random seed sensitivity analysis. The model training does not report results across multiple training seeds. The 3 evaluation runs capture evaluation variance but not training variance."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Section 5.1.1 states 'we run three independent evaluations for each online benchmark and report the average.' Table 19 confirms this with mean ± std across 3 runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget is reported. The paper mentions 'we tune the mixing ratios of the data' (Section 3.2) and 'based on early experiments, we set N=3' without stating how many configurations were tried."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The final data mixture ratios, N=3 observation window, and other design choices appear tuned but the selection process is not documented. No validation set-based selection is described."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "No statistical tests are performed at all, so no correction for multiple comparisons. Comparisons across 4 benchmarks and 7+ models would warrant correction."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors (Microsoft) evaluate their own model against competitors. The SoM agents use their own implementation. No discussion of self-evaluation bias. The Browserbase human eval partially mitigates this but the gap is not explored."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Figure 1 and Tables 10/12 explicitly compare accuracy vs. cost across models. The paper's central argument is about the Pareto frontier of cost vs. performance."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Section 4 (WebTailBench) extensively discusses limitations of existing benchmarks — lack of diversity, unrealistic tasks, poor alignment with human judgment. The paper creates WebTailBench specifically to address construct validity gaps."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The paper explicitly separates SoM agents (using accessibility trees) from native CUA models (using only screenshots) in all result tables. Section 7 discusses why SoM vs native CUA is a confound and compares at matched paradigms."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the base model Qwen2.5-VL was trained on data that includes benchmark solutions or similar web interaction patterns."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the FaraGen training data includes websites or task types that overlap with evaluation benchmarks."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No analysis of whether FaraGen training domains overlap with benchmark domains. The training data visits 70K unique domains; no check against benchmark domains is reported."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention method is applied. No decontamination pipeline, overlap analysis, or temporal splits are described."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Fara-7B achieves 73.5% success on WebVoyager, outperforming all other 7B-scale CUA models and being competitive with larger frontier models.",
    376       "evidence": "Table 9 shows Fara-7B at 73.5% vs UI-TARS-1.5-7B at 66.4% and OpenAI computer-use-preview at 70.9%. Table 19 shows 73.5 ± 1.0 across 3 runs.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "FaraGen generates verified trajectories at approximately $1 each.",
    381       "evidence": "Table 6 shows costs of $0.59 (o4-mini), $1.08 (o3), $1.00 (GPT-5) per trajectory across all pipeline components.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Fara-7B is significantly more cost-efficient than larger models, averaging $0.025 per task on WebVoyager.",
    386       "evidence": "Table 10 reports $0.025 for Fara-7B vs $0.302-$0.913 for proprietary baselines, based on token pricing.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Fara-7B achieves 38.4% on WebTailBench, substantially outperforming all models in its class.",
    391       "evidence": "Table 9 shows Fara-7B at 38.4% vs UI-TARS-1.5-7B at 19.5% and OpenAI computer-use-preview at 25.7%.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "High-quality synthetic data can unlock agentic capabilities in small models, establishing a new Pareto frontier.",
    396       "evidence": "Figure 1 and Figure 7 (left) show data scaling trends. However, this is demonstrated only on web tasks and the 'Pareto frontier' claim depends on specific pricing assumptions.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Fara-7B achieves the highest refusal rates among computer-use models (94.2% on AgentHarm-Chat).",
    401       "evidence": "Table 14 shows 94.2% refusal on AgentHarm-Chat vs 84.6% for OpenAI and 3.8% for UI-TARS-1.5-7B. However, the paper acknowledges Fara-7B was trained on similar refusal data.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "methodology_tags": ["benchmark-eval"],
    406   "key_findings": "Fara-7B, a 7B-parameter CUA model trained on 145K synthetic web trajectories from FaraGen, achieves 73.5% on WebVoyager, outperforming same-size models and competing with much larger frontier models at ~1/10th the cost ($0.025 vs $0.30+ per task). The paper introduces WebTailBench (609 tasks across 11 categories) to address gaps in existing CUA benchmarks. Data scaling experiments show strong positive trends from 18K to 1.8M training samples, and the model demonstrates robust safety behavior with 94.2% harmful task refusal rate.",
    407   "red_flags": [
    408     {
    409       "flag": "Company evaluating its own product",
    410       "detail": "Microsoft researchers evaluate Fara-7B, a Microsoft product released on Azure Foundry. While baselines are included, the SoM agent implementations are also the authors' own. The Browserbase human evaluation partially mitigates this, but it was commissioned rather than independent."
    411     },
    412     {
    413       "flag": "No contamination analysis",
    414       "detail": "No analysis of whether the 70K training domains overlap with benchmark domains. No discussion of base model (Qwen2.5-VL) potentially having seen benchmark-related content. This is especially concerning given FaraGen uses live websites."
    415     },
    416     {
    417       "flag": "LLM-as-judge overestimates performance",
    418       "detail": "The paper's own human evaluation shows 62% accuracy on WebVoyager vs 73.5% from GPT-4o-as-judge — an 11.5pp gap. The main results (Tables 9, 11) use auto-evaluation only. If all models are similarly overestimated, rankings may hold, but absolute numbers are inflated."
    419     },
    420     {
    421       "flag": "No statistical significance tests",
    422       "detail": "All comparative claims are based on mean differences across 3 runs without hypothesis testing. Given the standard deviations reported (e.g., Fara-7B 73.5±1.0 vs OpenAI 70.9±1.9 on WebVoyager), some differences may not be statistically significant."
    423     },
    424     {
    425       "flag": "WebTailBench evaluated by authors' own verification system",
    426       "detail": "WebTailBench uses the same Task Verification system developed for FaraGen (which achieves only 83.3% agreement with humans). The benchmark and its evaluator are both from the same team, creating a closed evaluation loop."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    432       "authors": ["Adam Fourney", "Gagan Bansal", "Hussein Mozannar"],
    433       "year": 2024,
    434       "arxiv_id": "2411.04468",
    435       "relevance": "Multi-agent framework that FaraGen builds on for task solving; key work in agentic AI systems."
    436     },
    437     {
    438       "title": "UI-TARS: Pioneering Automated GUI Interaction with Native Agents",
    439       "authors": ["Yujia Qin"],
    440       "year": 2025,
    441       "arxiv_id": "2501.12326",
    442       "relevance": "Primary baseline comparison for Fara-7B; same base model with different post-training (SFT vs RL)."
    443     },
    444     {
    445       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    446       "authors": ["Hongliang He"],
    447       "year": 2024,
    448       "arxiv_id": "2401.13919",
    449       "relevance": "Key benchmark used for evaluating CUA models; LLM-based web agent evaluation methodology."
    450     },
    451     {
    452       "title": "Mind2Web: Towards a Generalist Agent for the Web",
    453       "authors": ["Xiang Deng"],
    454       "year": 2023,
    455       "arxiv_id": "2306.06070",
    456       "relevance": "Benchmark for realistic web interactions; evaluation dataset for CUA models."
    457     },
    458     {
    459       "title": "An Illusion of Progress? Assessing the Current State of Web Agents",
    460       "authors": ["Tianci Xue"],
    461       "year": 2025,
    462       "arxiv_id": "2504.01382",
    463       "relevance": "Critical assessment of web agent evaluation methodology; documents gap between auto-eval and human eval."
    464     },
    465     {
    466       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    467       "authors": ["Maksym Andriushchenko"],
    468       "year": 2024,
    469       "arxiv_id": "2410.09024",
    470       "relevance": "Safety benchmark used to evaluate Fara-7B's refusal capabilities for harmful agent tasks."
    471     },
    472     {
    473       "title": "Why do multi-agent LLM systems fail?",
    474       "authors": ["Mert Cemri"],
    475       "year": 2025,
    476       "arxiv_id": "2503.13657",
    477       "relevance": "Analysis of multi-agent LLM system failures; motivated Fara-7B's single-model distillation approach."
    478     },
    479     {
    480       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    481       "authors": ["Shunyu Yao"],
    482       "year": 2023,
    483       "relevance": "Foundational work on tool-augmented LLM agents interleaving reasoning with tool calls."
    484     },
    485     {
    486       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    487       "authors": ["Timo Schick"],
    488       "year": 2023,
    489       "arxiv_id": "2302.04761",
    490       "relevance": "Early work on enabling LLMs to use external tools; foundation for agentic capabilities."
    491     },
    492     {
    493       "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
    494       "authors": ["Tianbao Xie"],
    495       "year": 2024,
    496       "arxiv_id": "2404.07972",
    497       "relevance": "Computer use benchmark in real environments; used as evaluation infrastructure for UI-TARS baseline."
    498     },
    499     {
    500       "title": "Magentic-UI: Towards Human-in-the-Loop Agentic Systems",
    501       "authors": ["Hussein Mozannar"],
    502       "year": 2025,
    503       "relevance": "Human-in-the-loop agentic system; source of adversarial testing tasks for CUA safety evaluation."
    504     },
    505     {
    506       "title": "DeepShop: A Benchmark for Deep Research Shopping Agents",
    507       "authors": ["Yougang Lyu"],
    508       "year": 2025,
    509       "arxiv_id": "2506.02839",
    510       "relevance": "Shopping-specific CUA benchmark used in Fara-7B evaluation."
    511     }
    512   ]
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs